In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("Fake_Real_Data.csv")

df.head()

In [5]:
df.shape
df.label.value_counts()

label
Fake    5000
Real    4900
Name: count, dtype: int64

In [14]:
# fake = 0 , Real = 1

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df.label = le.fit_transform(df.label)


In [15]:
print(df.head())

                                                Text  label
0   Top Trump Surrogate BRUTALLY Stabs Him In The...      0
1  U.S. conservative leader optimistic of common ...      1
2  Trump proposes U.S. tax overhaul, stirs concer...      1
3   Court Forces Ohio To Allow Millions Of Illega...      0
4  Democrats say Trump agrees to work on immigrat...      1


In [16]:
X = df.Text
y = df.label

# Modeling without Preprocessing

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42,stratify = y)

In [19]:
y_train.value_counts()

label
0    3500
1    3430
Name: count, dtype: int64

In [20]:
#1. create a pipeline object

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('knn', KNeighborsClassifier(n_neighbors=10,))
])



#2. fit with X_train and y_train
clf.fit(X_train,y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.90      0.95      0.93      1425
           1       0.95      0.91      0.93      1545

    accuracy                           0.93      2970
   macro avg       0.93      0.93      0.93      2970
weighted avg       0.93      0.93      0.93      2970



In [21]:
#1. create a pipeline object

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('rf', RandomForestClassifier())
])



#2. fit with X_train and y_train
clf.fit(X_train,y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1506
           1       1.00      1.00      1.00      1464

    accuracy                           1.00      2970
   macro avg       1.00      1.00      1.00      2970
weighted avg       1.00      1.00      1.00      2970



# With preprocessing Modeling

In [31]:
import spacy

# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 

def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

In [32]:
df.processed_text = df.Text.apply(preprocess)

  df.processed_text = df.Text.apply(preprocess)


In [None]:
X = df.processed_text
y= df.label

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42,stratify = y)

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('knn', KNeighborsClassifier(n_neighbors=10,))
])



#2. fit with X_train and y_train
clf.fit(X_train,y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_pred,y_test))