In [27]:
#basic imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import classification_report

In [4]:
df = pd.read_csv('combined_data.csv')

In [5]:
df.head()

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


In [6]:
sampled_df = df.sample(frac=0.2)

In [7]:
sampled_df['label'].value_counts()

label
1    8662
0    8028
Name: count, dtype: int64

In [8]:
#to check blanks
blanks = []
for i,lb,txt in sampled_df.itertuples():
    if type(txt) == str:
        if txt.isspace():
            blanks.append(i)
blanks

[]

In [9]:
sampled_df.dropna(inplace=True)
sampled_df.drop_duplicates(inplace=True)

### cleaning text using spacy, the lemma and stem

In [10]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [11]:
def clean_text(text):
    doc = nlp(text.lower())
    clean_txt = ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])
    return clean_txt

In [12]:
import time
start = time.time()
sampled_df['lemma_text'] = sampled_df['text'].apply(clean_text)
end = time.time()
print(end-start)

631.920508146286


In [13]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
sampled_df['stem_text'] = sampled_df['text'].apply(lambda x : stemmer.stem(x))

In [14]:
y = sampled_df['label']
X_stem = sampled_df['stem_text']
X_lemm = sampled_df['lemma_text']

In [15]:
#tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=10000)
X_stem = tfidf.fit_transform(X_stem).toarray()
X_lemm = tfidf.fit_transform(X_lemm).toarray()

In [16]:
from sklearn.model_selection import train_test_split

In [61]:
X_train_stem,X_test_stem,y_train_stem,y_test_stem = train_test_split(X_stem,y,test_size=0.2)

In [62]:
X_train_lemm,X_test_lemm,y_train_lemm,y_test_lemm = train_test_split(X_lemm,y,test_size=0.2)

In [25]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train_stem,y_train_stem)
pred_dtree = dtree.predict(X_test_stem)
print(classification_report(y_test_stem,pred_dtree))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94      1624
           1       0.94      0.95      0.95      1714

    accuracy                           0.94      3338
   macro avg       0.94      0.94      0.94      3338
weighted avg       0.94      0.94      0.94      3338



In [26]:
dtree = DecisionTreeClassifier()
dtree.fit(X_train_lemm,y_train_lemm)
pred_dtree = dtree.predict(X_test_lemm)
print(classification_report(y_test_lemm,pred_dtree))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94      1618
           1       0.94      0.95      0.94      1720

    accuracy                           0.94      3338
   macro avg       0.94      0.94      0.94      3338
weighted avg       0.94      0.94      0.94      3338



In [35]:
class Model(nn.Module):
    def __init__(self,in_size,h1=128,h2=64,h3=16,out_size=2,p=0.4):
        super(Model,self).__init__()
        self.fc1 = nn.Linear(in_size,h1)
        self.fc2 = nn.Linear(h1,h2)
        self.fc3 = nn.Linear(h2,h3)
        self.out = nn.Linear(h3,out_size)
        self.dropout = nn.Dropout(p)
    def forward(self,x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = F.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.out(x)
        return x

In [84]:
model = Model(X_train_lemm.shape[1])

In [85]:
num_epoch = 100
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss()

In [63]:
X_train_lemm,y_train_lemm = torch.Tensor(X_train_lemm),torch.LongTensor(y_train_lemm.values)
X_train_stem,y_train_stem = torch.Tensor(X_train_stem),torch.LongTensor(y_train_stem.values)
X_test_lemm,y_test_lemm = torch.Tensor(X_test_lemm),torch.LongTensor(y_test_lemm.values)
X_test_stem,y_test_stem = torch.Tensor(X_test_stem),torch.LongTensor(y_test_stem.values)

In [86]:
for epoch in range(num_epoch):
    y_pred = model.forward(X_train_lemm)
    loss = criterion(y_pred,y_train_lemm)
    print(f"epoch : {epoch+1} loss : {loss.item()}")
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    

epoch : 1 loss : 0.7160307168960571
epoch : 2 loss : 0.714565098285675
epoch : 3 loss : 0.7129852175712585
epoch : 4 loss : 0.7112254500389099
epoch : 5 loss : 0.7085047364234924
epoch : 6 loss : 0.706082820892334
epoch : 7 loss : 0.7032996416091919
epoch : 8 loss : 0.6999458074569702
epoch : 9 loss : 0.6963393092155457
epoch : 10 loss : 0.6922343373298645
epoch : 11 loss : 0.6877645254135132
epoch : 12 loss : 0.682601809501648
epoch : 13 loss : 0.6771962642669678
epoch : 14 loss : 0.6710528135299683
epoch : 15 loss : 0.6639567613601685
epoch : 16 loss : 0.6563292145729065
epoch : 17 loss : 0.6481838226318359
epoch : 18 loss : 0.6380200386047363
epoch : 19 loss : 0.6283541917800903
epoch : 20 loss : 0.6163973212242126
epoch : 21 loss : 0.6045504212379456
epoch : 22 loss : 0.5905366539955139
epoch : 23 loss : 0.5779467821121216
epoch : 24 loss : 0.5603973269462585
epoch : 25 loss : 0.5468283891677856
epoch : 26 loss : 0.5279354453086853
epoch : 27 loss : 0.5109264850616455
epoch : 28 lo

In [87]:
model.eval()
with torch.no_grad():
    y_eval = model.forward(X_test_lemm)

In [88]:
y_eval = torch.argmax(y_eval,1).numpy()

In [89]:
print(classification_report(y_test_lemm.numpy(),y_eval))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1575
           1       0.98      0.98      0.98      1763

    accuracy                           0.98      3338
   macro avg       0.98      0.98      0.98      3338
weighted avg       0.98      0.98      0.98      3338

