In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import spacy
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
nlp = spacy.load("en_core_web_sm") 

In [74]:
df=pd.read_csv('Fake_Real_Data.csv')
df

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real
...,...,...
9895,Wikileaks Admits To Screwing Up IMMENSELY Wit...,Fake
9896,Trump consults Republican senators on Fed chie...,Real
9897,Trump lawyers say judge lacks jurisdiction for...,Real
9898,WATCH: Right-Wing Pastor Falsely Credits Trum...,Fake


In [26]:
print(df.label.value_counts())

Fake    5000
Real    4900
Name: label, dtype: int64


In [27]:
df['label_num']=df['label'].map({
    'Fake':0,
    'Real':1
})

In [28]:
x_train,x_test,y_train,y_test=train_test_split(df['Text'],df['label_num'],test_size=0.2,random_state=42,stratify=df.label_num)

In [29]:
print(len(x_train))
print(len(x_test))

7920
1980


In [52]:
model = Pipeline([
    ('vectorizer_trigrams', CountVectorizer(ngram_range = (1,1))),                    
     ('KNN', (KNeighborsClassifier(n_neighbors=10, metric = 'euclidean')))                 
])
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
print(model.score(x_test,y_test))

              precision    recall  f1-score   support

           0       0.94      0.91      0.92      1000
           1       0.91      0.94      0.92       980

    accuracy                           0.92      1980
   macro avg       0.92      0.92      0.92      1980
weighted avg       0.92      0.92      0.92      1980

0.9237373737373737


In [53]:
model1 = Pipeline([
    ('vectorizer_trigrams', CountVectorizer(ngram_range = (1,1))),                    
     ('logistic_regression', LogisticRegression())                 
])
model1.fit(x_train, y_train)
y_pred = model1.predict(x_test)
print(classification_report(y_test, y_pred))
print(model1.score(x_test,y_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1000
           1       1.00      1.00      1.00       980

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980

0.9984848484848485


In [54]:
model2 = Pipeline([
    ('vectorizer_trigrams', CountVectorizer(ngram_range = (1,1))),                    
     ('naive_bayes', MultinomialNB(alpha = 0.75))                 
])
model2.fit(x_train, y_train)
y_pred = model2.predict(x_test)
print(classification_report(y_test, y_pred))
print(model2.score(x_test,y_test))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1000
           1       0.97      0.97      0.97       980

    accuracy                           0.97      1980
   macro avg       0.97      0.97      0.97      1980
weighted avg       0.97      0.97      0.97      1980

0.9727272727272728


In [55]:
model3 = Pipeline([
    ('vectorizer_trigrams', CountVectorizer(ngram_range = (1,2))),                    
     ('naive_bayes', MultinomialNB(alpha = 0.75))                 
])
model3.fit(x_train, y_train)
y_pred = model3.predict(x_test)
print(classification_report(y_test, y_pred))
print(model3.score(x_test,y_test))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1000
           1       0.98      0.98      0.98       980

    accuracy                           0.98      1980
   macro avg       0.98      0.98      0.98      1980
weighted avg       0.98      0.98      0.98      1980

0.9838383838383838


In [56]:
model4 = Pipeline([
    ('vectorizer_trigrams', CountVectorizer(ngram_range = (1,1))),                    
     ('random_forest', RandomForestClassifier())                 
])
model4.fit(x_train, y_train)
y_pred = model4.predict(x_test)
print(classification_report(y_test, y_pred))
print(model4.score(x_test,y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1000
           1       1.00      0.99      1.00       980

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980

0.996969696969697


In [58]:
best_model=model1
best_model

In [45]:
def preprocess(text):
    doc=nlp(text)
    filtered_tokens=[]
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [46]:
df['preprocessed_text']=df['Text'].apply(preprocess)

In [48]:
df

Unnamed: 0,Text,label,label_num,preprocessed_text
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0,Trump Surrogate BRUTALLY Stabs Pathetic vide...
1,U.S. conservative leader optimistic of common ...,Real,1,U.S. conservative leader optimistic common gro...
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1,trump propose U.S. tax overhaul stir concern d...
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0,Court Forces Ohio allow million illegally pu...
4,Democrats say Trump agrees to work on immigrat...,Real,1,Democrats Trump agree work immigration bill wa...
...,...,...,...,...
9895,Wikileaks Admits To Screwing Up IMMENSELY Wit...,Fake,0,Wikileaks admit screw IMMENSELY Twitter Poll...
9896,Trump consults Republican senators on Fed chie...,Real,1,trump consult republican senator Fed chief can...
9897,Trump lawyers say judge lacks jurisdiction for...,Real,1,trump lawyer judge lack jurisdiction defamatio...
9898,WATCH: Right-Wing Pastor Falsely Credits Trum...,Fake,0,WATCH Right Wing Pastor Falsely Credits Trum...


In [59]:
x_train,x_test,y_train,y_test=train_test_split(df['preprocessed_text'],df['label_num'],test_size=0.2,random_state=42,stratify=df.label_num)

In [60]:
model5 = Pipeline([
    ('vectorizer_trigrams', CountVectorizer(ngram_range = (1,1))),                    
     ('logistic_regression', LogisticRegression())                 
])
model5.fit(x_train, y_train)
y_pred = model5.predict(x_test)
print(classification_report(y_test, y_pred))
print(model5.score(x_test,y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1000
           1       1.00      1.00      1.00       980

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980

0.998989898989899


In [61]:
# so we have concluded that logistic regression works best for our project

In [71]:
def fake_or_not(x):
    for i in x:
        if i == 1:
            print('Real')
        else:
            print('Fake')

In [72]:
email=[
    '  Press ask Sean Spicer prove Trump Left Businesses Response INFURIATING video Donald Trump team get absolutely hammer lie    call     alternate fact   rightfully    Trump hasn t President week team prove riddle falsehood complete lack transparency Monday new White House Press Secretary Sean Spicer give truly infuriate response White House press conference member press ask Spicer produce sort proof validate claim Trump resign business America get terrifying look year go bring Spicer say   believe   question document show Trump hand business son White House staffer Hope Hicks jump state document public   Spicer continue yeah resign company say take office Don Eric fully charge company s take extraordinary step ensure s happen secrecy Trump s team combine lack keep promise american people note ProPublica state Trump step away business Friday ProPublica examine Trump s major company company register different state    Florida Delaware New York state change company instantly    document pretty add system immediately day late ProPublica speak official state recently discover Trump submit filing transfer control company son watch Spicer lie tooth featured image Alex Wong Getty Images',
    'U.S. republican Senate campaign wing end fundraising tie Moore Politico WASHINGTON Reuters Republican Party Senate campaign wing sever fundraising deal Alabama republican Senate nominee Roy Moore Politico report Friday day allegation emerge initiate sexual encounter 14 year old girl 32 Politico say Federal Election Commission paperwork file Friday show National Republican Senatorial Committee long list joint fundraising committee campaign Moore 70 Alabama Republican Party Republican National Committee'
]

In [73]:
fake_or_not(model5.predict(email))

Fake
Real
