In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import pandas as pd
from nltk.corpus import stopwords

In [52]:
dem = pd.read_parquet("10_datasets/democrats.parquet", engine='fastparquet')
rep = pd.read_parquet("10_datasets/republican.parquet", engine='fastparquet')
neut = pd.read_parquet("10_datasets/neutral.parquet", engine='fastparquet')

In [53]:
test = pd.concat([dem, rep, neut])

In [55]:
test.subreddit.value_counts()

democrats          4937
Republican         4809
NeutralPolitics    4760
Name: subreddit, dtype: int64

In [36]:
dem['label_type']="democrat"
rep['label_type']="republican"
neut['label_type']="neutral"

In [37]:
data=dem.append(rep)
data=data.append(neut)
data['label_type'].value_counts()

democrat      4937
republican    4809
neutral       4760
Name: label_type, dtype: int64

In [38]:
train_data = data.sample(frac=0.8,random_state=3320)
test_data = data.drop(train_data.index)

In [51]:
train_data.shape

(11605, 8)

In [50]:
test_data.shape

(1541, 8)

In [39]:
train_data.head()

Unnamed: 0_level_0,id,total_post,subreddit,score,type,title,text,label_type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
341,ggjryy4,"Really it comes down to who has the guns, and ...",NeutralPolitics,1.0,comment,,,neutral
58,ggjjtu4,It doesn't make sense. There's absolutely no ...,Republican,5.0,comment,,,republican
2106,gd2ln00,My favorite president. Did incredible things f...,Republican,2.0,comment,,,republican
5132,ga0482e,"You don't have to tell me about ""Manufacturing...",NeutralPolitics,1.0,comment,,,neutral
235,j9b5fd,One of the main selling points for republicans...,Republican,1.0,post,One of the main selling points for republicans...,[removed],republican


In [40]:
train_x = train_data["total_post"]
train_y = train_data["label_type"]
test_x = test_data["total_post"]
test_y = test_data["label_type"]


In [41]:
tf_vectorizer = CountVectorizer(strip_accents="unicode", lowercase = True, stop_words="english") 
X_train_tf = tf_vectorizer.fit_transform(train_x)


In [42]:
X_train_tf

<11605x21533 sparse matrix of type '<class 'numpy.int64'>'
	with 252849 stored elements in Compressed Sparse Row format>

In [43]:
X_test_tf = tf_vectorizer.transform(test_x)




In [44]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, train_y)

MultinomialNB()

In [45]:
y_pred = naive_bayes_classifier.predict(X_test_tf)
score1 = metrics.accuracy_score(test_y, y_pred)


In [46]:
score1

0.6703439325113563

In [47]:
metrics.confusion_matrix(test_y, y_pred)

array([[211,  58, 113],
       [ 84, 483,  86],
       [102,  65, 339]])

In [48]:
y_pred

array(['democrat', 'democrat', 'democrat', ..., 'neutral', 'democrat',
       'democrat'], dtype='<U10')

In [49]:
train_data.reset_index()

Unnamed: 0,index,id,total_post,subreddit,score,type,title,text,label_type
0,341,ggjryy4,"Really it comes down to who has the guns, and ...",NeutralPolitics,1.0,comment,,,neutral
1,58,ggjjtu4,It doesn't make sense. There's absolutely no ...,Republican,5.0,comment,,,republican
2,2106,gd2ln00,My favorite president. Did incredible things f...,Republican,2.0,comment,,,republican
3,5132,ga0482e,"You don't have to tell me about ""Manufacturing...",NeutralPolitics,1.0,comment,,,neutral
4,235,j9b5fd,One of the main selling points for republicans...,Republican,1.0,post,One of the main selling points for republicans...,[removed],republican
...,...,...,...,...,...,...,...,...,...
11600,4513,g5j2dl0,How exactly? When people don't comply and put ...,Republican,3.0,comment,,,republican
11601,23926,fkrk37j,Since I'm in the US and don't really do much w...,NeutralPolitics,1.0,comment,,,neutral
11602,8125,fyue5eo,Don't forget she also claimed you needed to as...,Republican,7.0,comment,,,republican
11603,10632,g4qssr3,No it just means the pope gave him communion. ...,NeutralPolitics,1.0,comment,,,neutral
