In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to C:\Users\Sajin.LAPTOP-
[nltk_data]     RE0DL8PH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
from twitter_clean import clean_text
train_data['clean_text']=train_data['text'].apply(clean_text)
test_data['clean_text']=test_data['text'].apply(clean_text)
train_data.to_csv('train_clean.csv',index=False)
test_data.to_csv('test_clean.csv',index=False)

In [4]:
stop_words = set(stopwords.words('english'))
wn = nltk.WordNetLemmatizer()
def preprocessing(text):
    tokens = [wn.lemmatize(w) for w in word_tokenize(text.lower().strip()) if not w in stop_words]
    return ' '.join(tokens)
train_data['preprocessed_text']=train_data['clean_text'].apply(preprocessing)
test_data['preprocessed_text']=test_data['clean_text'].apply(preprocessing)
train_data.to_csv('train_preprocessed.csv',index=False)
test_data.to_csv('test_preprocessed.csv',index=False)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix,classification_report
import matplotlib.pyplot as plt
%matplotlib inline
x_train, x_test, y_train, y_test = train_test_split(train_data['preprocessed_text'],train_data['target'],stratify=train_data['target'],test_size=0.2,random_state=2)
count_vect = CountVectorizer()
tf_transformer = TfidfTransformer(use_idf=False) # just use tf, no idf used

# convert the text list to tfidf form matrix
x_train_counts = count_vect.fit_transform(x_train)
x_train_tf = tf_transformer.fit_transform(x_train_counts)
y_train = np.array(y_train)

clf = BernoulliNB(alpha=1.11, fit_prior=True, class_prior=None)#1.11
clf.fit(x_train_tf, y_train) # train the classifier

# convert list to matrix
x_pre_counts = count_vect.transform(x_test)
x_pre_tf = tf_transformer.transform(x_pre_counts)

predicted = clf.predict(x_pre_tf)

In [6]:
metrics.accuracy_score(list(y_test), predicted)

0.8017071569271176

In [7]:
print(classification_report(list(y_test), predicted))

              precision    recall  f1-score   support

           0       0.78      0.91      0.84       869
           1       0.85      0.65      0.74       654

    accuracy                           0.80      1523
   macro avg       0.81      0.78      0.79      1523
weighted avg       0.81      0.80      0.80      1523



In [8]:
for item in count_vect.get_feature_names_out():
    print(item)

00
000
0000
01
02
0215
02pm
03
030
034
04
05
05th
06
06jst
07
08
0840728
0853
087809233445
0880
09
0a
0l
0npzp
10
100
1000
10000
100000
1008planet_
100bn
100mb
101
1017
1023
103
105
106
107
109
10am
10news
10th
10w
10x
11
11000
11000000
111020
114
1141
115
119000
12
1200
12000
1200000
1236
124
125
129
12am
12hr
12jst
12m
12mm
12news
12th
12u
12v
13
130
13000
133
139055
14
140
14028
141
148
149
14th
15
150
1500
15000270653
158
159
15901
15p
15pm
15t
15th
16
160
1600
1620
163
166
1665
17
1700
17000
171
1716
1717
179
17months
17th
18
180
1800
184
1843
1880
1890
1895
18jst
18th
18w
19
1916
1921
1935
1940
1942
1943
1945
1950
1952
1960s
1965
1967
1970
1970s
1974
1976
1979
198
1980
1984
1986
1994
1998
1999
19th
1a
1d
1m
1pack
1st
1x1
20
200
2000
200000
20000k
2003
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
20150613
2016
2017
20177
2019
202
2030
2065
208
20k
20mins
21
214904
21k
22
22days
23
2327564d
238
239
23928835
23rd
24
241487
24v
25
250
250000
250k
26
263789f4
263chat
265v
26

In [9]:
x_test_counts = count_vect.transform(test_data['preprocessed_text'])
x_test_tf = tf_transformer.transform(x_test_counts)
predicted = clf.predict(x_test_tf)

In [10]:
tem=pd.read_csv('sample_submission.csv')
tem['target']=predicted
tem.to_csv('test_prediction.csv',index=False)

In [11]:
perfect_score=pd.read_csv('submission.csv')
metrics.accuracy_score(list(perfect_score['target']), predicted)

0.7931351517008888

In [12]:
print(classification_report(list(perfect_score['target']), predicted))

              precision    recall  f1-score   support

           0       0.76      0.92      0.84      1861
           1       0.86      0.62      0.72      1402

    accuracy                           0.79      3263
   macro avg       0.81      0.77      0.78      3263
weighted avg       0.81      0.79      0.79      3263



In [13]:
print(len(count_vect.get_feature_names_out()))

11739


In [14]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r