In [1]:
import pandas as pd
import numpy as np

In [7]:
df = pd.read_csv('Fake_Real_Data.csv')

In [3]:
df

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real
...,...,...
9895,Wikileaks Admits To Screwing Up IMMENSELY Wit...,Fake
9896,Trump consults Republican senators on Fed chie...,Real
9897,Trump lawyers say judge lacks jurisdiction for...,Real
9898,WATCH: Right-Wing Pastor Falsely Credits Trum...,Fake


In [4]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [5]:
def preprocess(text):
    doc = nlp(text)
    
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
        
    return ' '.join(filtered_tokens)  

In [9]:
df['label_num'] = df.label.apply(lambda x : 1 if x =='Real' else 0)

In [10]:
df

Unnamed: 0,Text,label,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1
...,...,...,...
9895,Wikileaks Admits To Screwing Up IMMENSELY Wit...,Fake,0
9896,Trump consults Republican senators on Fed chie...,Real,1
9897,Trump lawyers say judge lacks jurisdiction for...,Real,1
9898,WATCH: Right-Wing Pastor Falsely Credits Trum...,Fake,0


In [15]:
from sklearn.model_selection import train_test_split as tts

X_train, X_test, y_train, y_test = tts(df.Text, df.label_num, random_state= 2000, test_size=0.2)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.naive_bayes import MultinomialNB as mnb

In [17]:
clf = Pipeline([
    ('vec', CountVectorizer(ngram_range=(1,3))),
    ('knn', knn(n_neighbors=10, metric='euclidean'))
]) 

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.47      0.63       956
           1       0.66      0.98      0.79      1024

    accuracy                           0.73      1980
   macro avg       0.81      0.72      0.71      1980
weighted avg       0.80      0.73      0.71      1980



In [18]:
clf = Pipeline([
    ('vec', CountVectorizer(ngram_range=(1,3))),
    ('knn', knn(n_neighbors=10, metric='cosine'))
]) 

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.53      0.70       956
           1       0.70      1.00      0.82      1024

    accuracy                           0.77      1980
   macro avg       0.85      0.77      0.76      1980
weighted avg       0.84      0.77      0.76      1980



In [24]:
clf = Pipeline([
    ('vec', CountVectorizer(ngram_range=(3,3))),
    ('rfc', rfc())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       956
           1       0.98      1.00      0.99      1024

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980



In [25]:
clf = Pipeline([
    ('vec', CountVectorizer(ngram_range=(3,3))),
    ('mnb', mnb())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       956
           1       0.99      0.99      0.99      1024

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980



In [26]:
df['preprocessed_text'] = df.Text.apply(preprocess) 

In [27]:
df

Unnamed: 0,Text,label,label_num,preprocessed_text
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0,Trump Surrogate BRUTALLY stab pathetic video...
1,U.S. conservative leader optimistic of common ...,Real,1,U.S. conservative leader optimistic common gro...
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1,trump propose U.S. tax overhaul stir concern d...
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0,Court Forces Ohio allow Millions illegally p...
4,Democrats say Trump agrees to work on immigrat...,Real,1,democrat trump agree work immigration bill wal...
...,...,...,...,...
9895,Wikileaks Admits To Screwing Up IMMENSELY Wit...,Fake,0,Wikileaks admit screwing IMMENSELY Twitter P...
9896,Trump consults Republican senators on Fed chie...,Real,1,trump consult republican senator Fed chief can...
9897,Trump lawyers say judge lacks jurisdiction for...,Real,1,trump lawyer judge lack jurisdiction defamatio...
9898,WATCH: Right-Wing Pastor Falsely Credits Trum...,Fake,0,WATCH right wing pastor Falsely Credits Trum...


In [29]:
X_train, X_test, y_train, y_test = tts(df.preprocessed_text, df.label_num, random_state= 2000, test_size=0.2, stratify = df.label_num)

In [31]:
clf = Pipeline([
    ('vec', CountVectorizer(ngram_range=(3,3))),
    ('rfc', rfc())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95      1000
           1       0.98      0.91      0.94       980

    accuracy                           0.95      1980
   macro avg       0.95      0.95      0.95      1980
weighted avg       0.95      0.95      0.95      1980



In [32]:
clf = Pipeline([
    ('vec', CountVectorizer(ngram_range=(1,3))),
    ('rfc', rfc())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1000
           1       1.00      0.99      1.00       980

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980



In [35]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)


In [36]:
cm

array([[1000,    0],
       [   9,  971]], dtype=int64)