### Imports and Bringing in Custom Functions

In [1]:
import pandas as pd 
import numpy as np 
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
tqdm.pandas(desc="Applying")

def train_doc2vec_model(dayfray, name_of_model):
    processed = dayfray.apply(lambda x: TaggedDocument(words=(x['doc']), tags=[x['label']]), axis=1)
    print('processed')
    sents = processed.values
    new_model = Doc2Vec(sents, vector_size=300, epochs=40, dm=0, min_count=300)
    print('trained')
    new_model.save(name_of_model)
    print(f'saved {name_of_model}')

  from pandas import Panel


##### 1. Bring in training corpus
##### 2. Create X and y
##### 3. Train, Test split
##### 4. Create dataframes that I can feed to Doc2Vec

In [2]:
df = pd.read_csv("training_documents.csv", index_col=0)
df.head()
type(df['doc'][1])

str

In [3]:
df.head()

Unnamed: 0,user,doc,label
0,atensnut,Amazon and Big Tech cozy up to Biden camp with...,red
1,aubrey_huff,After watching the first debate I’m totally co...,red
2,benshapiro,Alligator pits underneath the podiums https://...,red
3,brieandjam1,Donald #Trump earns his money as a businessman...,red
4,BuzzPatterson,"As a traitor to our nation once said, “What di...",red


In [4]:
df['doc'] = df['doc'].progress_apply(lambda x: simple_preprocess(x))
type(df['doc'][1])

Applying: 100%|██████████| 163/163 [01:49<00:00,  1.49it/s]


list

In [5]:
df['label'].value_counts()

blue    87
red     76
Name: label, dtype: int64

In [6]:
df['label'] = df['label'].map({"red": 0, "blue": 1})
X = df['doc']
y = df['label']
print(y.value_counts(normalize=True))
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=32)
df_train = pd.DataFrame()
df_test = pd.DataFrame()
df_train['doc'] = X_train
df_test['doc'] = X_test
df_train['label'] = y_train
df_test['label'] = y_test

1    0.533742
0    0.466258
Name: label, dtype: float64


### Train Doc2Vec on training set

In [7]:
train_doc2vec_model(df_train, 'test_doc2vec')


processed
trained
saved test_doc2vec


### Infer Vectors for training set and testing set 

In [8]:
model = Doc2Vec.load('test_doc2vec')

In [9]:
df_train['vector'] = [model.infer_vector(list(x)) for x in tqdm(df_train['doc'])]

100%|██████████| 130/130 [01:30<00:00,  1.44it/s]


In [10]:
df_test['vector'] = [model.infer_vector(list(x)) for x in tqdm(df_test['doc'])]

100%|██████████| 33/33 [00:22<00:00,  1.48it/s]


In [11]:
df_test.shape

(33, 3)

In [12]:
df_train.shape

(130, 3)

## Begin Testing Models

### Support Vector Classifier

In [13]:
svc_test = SVC(probability=True)
svc_test.fit(list(df_train['vector']), df_train['label'])

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [14]:
train_hat = svc_test.predict(list(df_train['vector']))
print(accuracy_score(df_train['label'], train_hat))
test_hat = svc_test.predict(list(df_test['vector']))
print(accuracy_score(df_test['label'], test_hat))

0.9846153846153847
0.9696969696969697


In [16]:
df_test.head()

Unnamed: 0,doc,label,vector
42,"[weird, an, idea, sued, judicialwatch, in, cou...",0,"[0.5899123, 0.54074347, 0.03056826, -0.0020906..."
0,"[text, come, to, think, of, it, have, been, no...",1,"[0.04928346, -0.25602424, -0.19530252, -0.2115..."
24,"[thank, you, for, confirming, what, millions, ...",0,"[0.176241, 0.106442586, -0.05736669, -0.068369..."
0,"[text, gl, balsett, renzograciebjj, classes, o...",0,"[0.2952346, 0.46000347, 0.05900677, 0.06121755..."
0,"[text, did, the, coronavirus, plan, this, even...",1,"[0.09585878, 0.29221642, 0.085054874, 0.090226..."


### Logistic Regression

In [17]:
logreg_test = LogisticRegression()
logreg_test.fit(list(df_train['vector']), df_train['label'])
train_hat = logreg_test.predict(list(df_train['vector']))
print(accuracy_score(df_train['label'], train_hat))
test_hat = logreg_test.predict(list(df_test['vector']))
print(accuracy_score(df_test['label'], test_hat))

0.9923076923076923
1.0


### K Nearest Neighbor

In [18]:
knn_test = KNeighborsClassifier(n_neighbors=5)
knn_test.fit(list(df_train['vector']), df_train['label'])
train_hat = knn_test.predict(list(df_train['vector']))
print(accuracy_score(df_train['label'], train_hat))
test_hat = knn_test.predict(list(df_test['vector']))
print(accuracy_score(df_test['label'], test_hat))

0.9692307692307692
0.7575757575757576


### Quick Attempt at Clustering

In [19]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2)
cluster_preds = kmeans.fit_predict(list(df_train['vector']))

In [20]:
print(accuracy_score(df_train['label'], cluster_preds))

0.43846153846153846


### Train Final Doc2Vec Model and Final SVC Model

In [26]:
train_doc2vec_model(df, 'final_doc2vec')

processed
trained
saved final_doc2vec


In [28]:
final_doc2vec = Doc2Vec.load('final_doc2vec')

In [29]:
df['vector'] = [final_doc2vec.infer_vector(list(x)) for x in tqdm(df['doc'])]



  0%|          | 0/163 [00:00<?, ?it/s][A[A

  1%|          | 1/163 [00:00<01:49,  1.48it/s][A[A

  1%|          | 2/163 [00:01<01:48,  1.48it/s][A[A

  2%|▏         | 3/163 [00:02<01:51,  1.43it/s][A[A

  2%|▏         | 4/163 [00:02<01:50,  1.44it/s][A[A

  3%|▎         | 5/163 [00:03<01:50,  1.44it/s][A[A

  4%|▎         | 6/163 [00:04<01:50,  1.41it/s][A[A

  4%|▍         | 7/163 [00:04<01:50,  1.41it/s][A[A

  5%|▍         | 8/163 [00:05<01:50,  1.40it/s][A[A

  6%|▌         | 9/163 [00:06<01:50,  1.39it/s][A[A

  6%|▌         | 10/163 [00:07<01:47,  1.42it/s][A[A

  7%|▋         | 11/163 [00:07<01:47,  1.41it/s][A[A

  7%|▋         | 12/163 [00:08<01:44,  1.45it/s][A[A

  8%|▊         | 13/163 [00:09<01:45,  1.42it/s][A[A

  9%|▊         | 14/163 [00:09<01:46,  1.40it/s][A[A

  9%|▉         | 15/163 [00:10<01:43,  1.43it/s][A[A

 10%|▉         | 16/163 [00:11<01:42,  1.43it/s][A[A

 10%|█         | 17/163 [00:12<01:44,  1.40it/s][A[A

 11%|█  

In [30]:
svc_final = SVC(probability=True)
svc_final.fit(list(df['vector']), df['label'])

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [32]:
import pickle


svc_filename = "svc_final_model.pkl"


with open(svc_filename, 'wb') as file:
    pickle.dump(svc_final, file)