In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import _stop_words
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn import set_config
# Display estimators/pipelines as HTML diagrams when they are the cell output.
set_config(display='diagram')

In [2]:
# tweet classification - Trudeau vs Trump

In [3]:
# Load the data.
# The CSV is read straight from its raw GitHub URL, so this cell needs
# network access. (The original had a duplicated `url = url = ...` assignment.)
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/twitter.csv'
data = pd.read_csv(url)

# Preview the first rows: the columns used below are `text` and `user`.
data.head()

Unnamed: 0,timestamp,text,user
0,2020-03-02 23:06:03,"WOW! Thank you, just landed, see everyone soon...",realDonaldTrump
1,2020-03-02 21:47:49,Departing for the Great State of North Carolin...,realDonaldTrump
2,2020-03-02 21:32:54,They are staging a coup against Bernie!,realDonaldTrump
3,2020-03-02 19:55:40,THANK YOU!https://www.breitbart.com/tech/2020/...,realDonaldTrump
4,2020-03-02 19:55:07,Michelle @FischbachMN7 is running for Congress...,realDonaldTrump


This is a corpus of tweets from Donald Trump and Justin Trudeau. 
The **goal** is to build a classification pipeline that predicts the author (Trump or Trudeau) of a tweet based on the text.

**Part 1:** Define the feature matrix X and the target vector y from the dataframe, and then split X and y into training and testing sets.

In [4]:
# Feature: the raw tweet text. Target: the account that posted the tweet.
X = data['text']
y = data['user']

In [5]:
# Hold out a test set (train_test_split's default split sizes are kept).
# random_state is fixed so that the split -- and therefore every metric and
# word ranking computed afterwards -- is reproducible after a kernel restart.
# NOTE: outputs recorded from an earlier unseeded run may differ slightly
# from what this seeded split produces.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

**Part 2:** Build a classification pipeline (count vectorizer + Naive Bayes model), and fit the pipeline to the training data.

In [6]:
# Two-step text classifier:
#   'vect' - bag-of-words counts over unigrams and bigrams, vocabulary capped
#            at 5000 features
#   'clf'  - multinomial Naive Bayes fitted on those counts
pipe = Pipeline(
    steps=[
        ('vect', CountVectorizer(ngram_range=(1, 2), max_features=5000)),
        ('clf', MultinomialNB()),
    ]
)

**Part 3:** Evaluate the performance of the classification pipeline on the test set.

In [7]:
# Learn the vocabulary and the Naive Bayes parameters from the training tweets only.
pipe.fit(X=X_train, y=y_train)

In [8]:
# Predict the author of each held-out tweet, then report the fraction
# of tweets whose author was predicted correctly.
y_test_pred = pipe.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9660377358490566

In [9]:
# Rows are the true authors, columns the predicted authors.
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[ 96,   3],
       [  6, 160]], dtype=int64)

**Part 4:** Which words does the model use to choose between Trump and Trudeau?

In [10]:
# Vocabulary (unigrams and bigrams) learned by the fitted vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older installs.
vectorizer = pipe['vect']
if hasattr(vectorizer, 'get_feature_names_out'):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()

# Class labels as stored by the fitted classifier; the next cell indexes
# feature_count_ by position, so this order matters.
pipe['clf'].classes_

array(['JustinTrudeau', 'realDonaldTrump'], dtype='<U15')

In [11]:
# Per-class word counts accumulated during fitting. Row 0 / row 1 are taken to
# be JustinTrudeau / realDonaldTrump, matching the classes_ order shown above.
class_counts = pipe['clf'].feature_count_
JustinTrudeau, realDonaldTrump = class_counts[0], class_counts[1]

In [12]:
# One row per vocabulary entry, one column of raw counts per author.
words_df = pd.DataFrame(
    {
        'JustinTrudeau': JustinTrudeau,
        'realDonaldTrump': realDonaldTrump,
    },
    index=pd.Index(words, name='words'),
)
words_df

Unnamed: 0_level_0,JustinTrudeau,realDonaldTrump
words,Unnamed: 1_level_1,Unnamed: 2_level_1
00,0.0,14.0
00 eastern,0.0,7.0
00 the,0.0,2.0
000,7.0,11.0
000 americans,0.0,2.0
...,...,...
youth,2.0,1.0
youtube,2.0,0.0
youtube com,2.0,0.0
zero,2.0,2.0


In [13]:
# NOTE: this cell rebinds `words_df` from its own previous value, so re-run the
# cell that builds `words_df` before running this one a second time.

# Add-one smoothing so that no count is zero (avoids division by zero below).
words_df = words_df.add(1)

# Normalise each author's column so it sums to 1.
column_totals = words_df.sum()
words_df = words_df.div(column_totals, axis='columns')

# Relative frequency ratios: JT is large for words Trudeau uses far more often
# than Trump, DT is the reciprocal comparison.
words_df['JT'] = words_df['JustinTrudeau'].div(words_df['realDonaldTrump'])
words_df['DT'] = words_df['realDonaldTrump'].div(words_df['JustinTrudeau'])

# The top 10 words for Justin Trudeau
words_df['JT'].sort_values(ascending=False).head(10)

words
ca          81.002892
pm          77.320942
ca en       76.400455
en          76.400455
en news     51.547295
gc ca       51.547295
gc          51.547295
pm gc       51.547295
https pm    48.785833
ll          46.024370
Name: JT, dtype: float64

In [14]:
# The top 10 words for Donald Trump (largest Trump-to-Trudeau frequency ratio)
words_df['DT'].sort_values(ascending=False).head(10)

words
fake news      52.146286
white house    39.109715
white          39.109715
conference     34.764191
democrats      32.591429
media          31.505048
fake           30.961857
the fake       26.073143
him            23.900381
it is          23.900381
Name: DT, dtype: float64