In [34]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 
import seaborn as sns

sns.set()

In [35]:
data = pd.read_csv('dataset.csv')

In [36]:
data = data.sample(frac=1).reset_index(drop=True)

In [37]:
columns_titles = ["Tweet", "Label"]
data = data.reindex(columns=columns_titles)

In [38]:
train_data = data.head(int(225*0.8))

In [39]:
test_data = data.tail(int(225*0.2))

In [40]:
display(train_data)
display(test_data)

Unnamed: 0,Tweet,Label
0,@_10kanee @spectatorindex BREAKING: Coronaviru...,0
1,"RT @10News: Amid coronavirus outbreak, thousan...",1
2,RT @Independent: Hong Kong hospital workers st...,1
3,RT @The_World_Is_Y: #coronavirus #chinaflu #...,0
4,RT @alfonslopeztena: The political cost of the...,1
...,...,...
175,RT @business: Here's the latest on coronavirus...,0
176,RT @AJEnglish: Why anger over the Hong Kong go...,1
177,RT @guardian: Chinas response to coronavirus e...,1
178,RT @ChannelNewsAsia: Hong Kong medical workers...,0


Unnamed: 0,Tweet,Label
181,RT @YohendrisQ: @DeZurdaTeam @DiazCanelB @mart...,1
182,"RT @washingtonpost: Facing coronavirus threat,...",0
183,RT @ABC: Hundreds of Hong Kong hospital worker...,1
184,RT @nytimesworld: Top officials in Southeast A...,1
185,RT @AJEnglish: Why anger over the Hong Kong go...,1
186,RT @FOCUS_TopNews: China hat nach internationa...,1
187,"""RT @QuickTake: """"If the government does not s...",0
188,RT @Erik32480523: @VictoriaALC @DeZurdaTeam @D...,1
189,"""RT @NPR: One doctor in Hong Kong says he unde...",1
190,RT @vilmadiazcuba: @DeZurdaTeam @DiazCanelB @m...,0


In [41]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [42]:
X = np.array(data['Tweet'])
y = np.array(data['Label'])
X,y

s in the UK to eigh',
        'RT @HawleyMO: #China thinks efforts to stop #coronavirus have gone too far? While infections skyrocket and the world is at risk?  https://t',
        'RT @alfonslopeztena: Hundreds of Hong Kong medical workers strike to demand the city close its border with China to reduce the coronavirus',
        'RT @ABC: Hundreds of Hong Kong hospital workers went on strike, demanding the government shutter all borders with mainland China as the cou',
        "RT @SkyNews: As Hong Kong faces #coronavirus, its citizens are furious that the city's borders are still open. Medics are threatening to st",
        'RT @g_nishiyama:  # https://t.co/Kr8C7TyX8F',
        'RT @alfonslopeztena: Hundreds of Hong Kong medical workers strike to demand the city close its border with China to reduce the coronavirus',
        'RT @AlmaDelaSierra1: @DeZurdaTeam @DiazCanelB @marti160patria @SecUJCuba @FeuCuba @UJCuba @gutierrez_onel @rodulfohumberto @neljulger @_dbl',
        '@DeZurdaTe

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20)

In [66]:
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(X_train)

In [67]:
classifier = MultinomialNB()
targets = y_train
classifier.fit(counts,targets)

MultinomialNB()

In [70]:
selector = SelectKBest(chi2, k=800)
selector.fit(counts, targets)

SelectKBest(k=800, score_func=<function chi2 at 0x7fcde6e6d680>)

In [83]:
test_counts = vectorizer.transform(list(X_test))
y_pred = classifier.predict_proba(test_counts)
np.round(y_pred,decimals=3)

array([[0.   , 1.   ],
       [0.91 , 0.09 ],
       [0.723, 0.277],
       [0.   , 1.   ],
       [1.   , 0.   ],
       [0.   , 1.   ],
       [0.   , 1.   ],
       [0.997, 0.003],
       [0.   , 1.   ],
       [0.   , 1.   ],
       [0.   , 1.   ],
       [0.   , 1.   ],
       [0.999, 0.001],
       [0.   , 1.   ],
       [0.   , 1.   ],
       [0.996, 0.004],
       [0.998, 0.002],
       [1.   , 0.   ],
       [0.   , 1.   ],
       [0.006, 0.994],
       [0.036, 0.964],
       [0.828, 0.172],
       [0.   , 1.   ],
       [0.   , 1.   ],
       [0.651, 0.349],
       [0.992, 0.008],
       [0.984, 0.016],
       [0.   , 1.   ],
       [0.84 , 0.16 ],
       [0.007, 0.993],
       [0.   , 1.   ],
       [0.723, 0.277],
       [0.036, 0.964],
       [1.   , 0.   ],
       [0.999, 0.001],
       [0.868, 0.132],
       [0.036, 0.964],
       [0.   , 1.   ],
       [0.   , 1.   ],
       [0.977, 0.023],
       [0.985, 0.015],
       [0.993, 0.007],
       [0.045, 0.955],
       [0.0

In [72]:
y_test

array([1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0])

In [76]:
avg = 0
for i in y_pred==y_test:
    if i==True:
        avg += 1
avg = avg/len(y_pred==y_test)
avg

0.7391304347826086

In [86]:
dump(vectorizer, 'models/vectorizer.joblib')

['models/vectorizer.joblib']

In [87]:
dump(selector, 'models/selector.joblib')

['models/selector.joblib']

In [88]:
dump(classifier, 'models/classifier.joblib')

['models/classifier.joblib']