In [50]:
import pandas as pd
df = pd.read_csv('fake_or_real_news.csv')

In [51]:
df.shape

(6335, 4)

In [52]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [53]:
df.label.value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [54]:
df['label_num'] = df['label'].map({'FAKE' : 0, 'REAL' : 1})
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,label_num
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,0
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,1
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,0
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,1


Modelling without Pre-Processing

In [55]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.text, df.label_num, test_size=0.2,random_state = 2022, stratify = df.label)

In [56]:
X_train.shape
X_test.shape

(1267,)

kNN (Euclidian)

In [57]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

clf = Pipeline([
    ('vectorizer_bow',CountVectorizer(ngram_range = (1,1))),  #unigram
    # ('vectorizer_bow',CountVectorizer(ngram_range = (1,2))),  #bigrams
    # ('vectorizer_bow',CountVectorizer(ngram_range = (1,3))),  #trigrams
    ('kNN',KNeighborsClassifier(n_neighbors= 10,metric='minkowski') )
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        FAKE       0.75      0.83      0.79       633
        REAL       0.81      0.73      0.77       634

    accuracy                           0.78      1267
   macro avg       0.78      0.78      0.78      1267
weighted avg       0.78      0.78      0.78      1267



kNN (Cosine)

In [58]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

clf = Pipeline([
    ('vectorizer_bow',CountVectorizer(ngram_range = (1,1))),
    ('kNN',KNeighborsClassifier(n_neighbors= 10,metric='cosine') )
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        FAKE       0.78      0.76      0.77       633
        REAL       0.77      0.79      0.78       634

    accuracy                           0.78      1267
   macro avg       0.78      0.78      0.78      1267
weighted avg       0.78      0.78      0.78      1267



RandomForest, cv with trigram

In [59]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

clf = Pipeline([
    ('cv', CountVectorizer(ngram_range=(3,3))),
    ('rff', RandomForestClassifier())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        FAKE       0.81      0.93      0.86       633
        REAL       0.92      0.78      0.84       634

    accuracy                           0.85      1267
   macro avg       0.86      0.85      0.85      1267
weighted avg       0.86      0.85      0.85      1267



Multinomial Naive Bayes

In [60]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([
    ('cv', CountVectorizer(ngram_range = (1,1))),
    ('Multi MB',MultinomialNB(alpha = 0.75))
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        FAKE       0.93      0.85      0.89       633
        REAL       0.86      0.93      0.90       634

    accuracy                           0.89      1267
   macro avg       0.89      0.89      0.89      1267
weighted avg       0.89      0.89      0.89      1267



Using pre-processing

In [61]:
import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess(text) :
  doc = nlp(text)
  filtered_tokens = []
  for token in doc:
    if(token.is_stop or token.is_punct):
      continue
    filtered_tokens.append(token.lemma_)

  return ' '.join(filtered_tokens)

In [None]:
df['preprocessed_txt'] = df['text'].apply(preprocess)

In [None]:
df.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.preprocessed_txt, df.label_num, test_size=0.2,random_state = 2022, stratify = df.label_num)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

clf = Pipeline([
    ('cv', CountVectorizer(ngram_range=(1,3))),
    ('rfc', RandomForestClassifier())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))