# Fake news detection

## Imports

In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/gdrive;
# the dataset CSVs loaded further down are read from this mount.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

In [None]:
# Dataset folder inside the Drive mounted at /content/gdrive. The path is
# absolute (anchored at the mount point) rather than relative to the current
# working directory, so the load does not silently depend on the kernel's cwd
# being /content.
DATA_DIR = "/content/gdrive/My Drive/Fake news detection"

# Load the two source files, Fake.csv and True.csv, into separate frames.
df_fake = pd.read_csv(DATA_DIR + "/Fake.csv")
df_true = pd.read_csv(DATA_DIR + "/True.csv")

## Pre-processing

In [None]:
# Tag every row with its ground-truth label before the frames are combined:
# 1 for rows of df_true, 0 for rows of df_fake.
for frame, label in ((df_true, 1), (df_fake, 0)):
    frame['class'] = label

In [None]:
# Stack the fake and true frames row-wise into a single frame. The original
# row indices are kept as-is by this call.
df_merge = pd.concat(objs=[df_fake, df_true], axis="index")

In [None]:
# Discard the title, subject and date columns; the cells below only use the
# "text" and "class" columns.
df = df_merge.drop(columns=["title", "subject", "date"])

In [None]:
# Shuffle the rows so the two classes are interleaved, then renumber the
# index from 0. A fixed random_state makes the shuffle repeatable on
# Restart & Run All (same seed value as the random_state=0 used by the
# ensemble models further down). reset_index(drop=True) discards the old
# index directly instead of first materialising it as an "index" column and
# dropping that column afterwards.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)

In [None]:
# Function for pre-processing texts
def wordopt(text):
    """Normalise one article body before TF-IDF vectorisation.

    Steps, in order: lowercase; remove square-bracketed spans; remove URLs
    and HTML-style tags; replace every remaining non-word character with a
    space; delete underscores; delete every token that contains a digit.

    URL and tag removal has to run *before* the non-word substitution: once
    ``://``, ``.``, ``<`` and ``>`` have been turned into spaces those
    patterns can no longer match, and the URL/tag text would end up in the
    vocabulary.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (for example the NaN pandas produces for
        an empty CSV cell) are treated as empty text.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so removed spans can
        leave runs of spaces behind.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Raw strings throughout: sequences like "\[" or "\S" in a normal string
    # literal are invalid escapes and trigger warnings on recent Python.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Newlines and all punctuation except "_" are non-word characters, so
    # this single substitution also takes care of line breaks.
    text = re.sub(r'\W', ' ', text)
    # After the step above only "_" from string.punctuation can still occur.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Drop whole tokens that contain at least one digit (e.g. "trump2016").
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Run wordopt over every article body and write the cleaned strings back
# into the "text" column.
cleaned_text = df["text"].apply(wordopt)
df["text"] = cleaned_text

In [None]:
# Model input is the "text" column; the target is the 0/1 "class" column.
x, y = df["text"], df["class"]

In [None]:
# Hold out 25% of the rows for evaluation. Fixing random_state makes the
# partition deterministic for a given row order, so the split no longer
# changes on every kernel restart (same seed value as the random_state=0 used
# by the ensemble models further down).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0
)

## Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training texts only and encode them in the
# same call.
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
# The test texts are only transformed with the already-fitted vectorizer
# (no refit), so they do not influence the learned features.
xv_test = vectorization.transform(x_test)

## ML approaches

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the TF-IDF training
# matrix. fit() returns the estimator itself, so construction and training
# are chained; the trailing bare name keeps the fitted model as the cell's
# displayed value.
LR = LogisticRegression().fit(xv_train, y_train)
LR

In [None]:
# Predicted labels from the logistic-regression model for the test matrix;
# stored so the classification report below can reuse them.
pred_lr = LR.predict(xv_test)

In [None]:
# Test-set accuracy of the logistic-regression model, computed from the
# predictions already stored in pred_lr.
accuracy_score(y_test, pred_lr)

0.9876169265033408

In [None]:
# Per-class precision, recall and F1 for the logistic-regression predictions.
report_lr = classification_report(y_test, pred_lr)
print(report_lr)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5786
           1       0.99      0.99      0.99      5439

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree trained on the TF-IDF training matrix. random_state is pinned
# to 0, as for the gradient-boosting and random-forest models below: the tree
# builder permutes features at each split, so without a fixed seed repeated
# runs can grow different trees and report different scores.
DT = DecisionTreeClassifier(random_state=0)
DT.fit(xv_train, y_train)

In [None]:
# Predicted labels from the decision-tree model for the test matrix.
pred_dt = DT.predict(xv_test)

In [None]:
# Test-set accuracy of the decision tree, computed from the predictions
# already stored in pred_dt.
accuracy_score(y_test, pred_dt)

0.996347438752784

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with a fixed seed, trained on the TF-IDF training
# matrix. fit() returns the estimator, so it is chained onto the constructor;
# the trailing bare name keeps the fitted model as the cell's displayed value.
GBC = GradientBoostingClassifier(random_state=0).fit(xv_train, y_train)
GBC

In [None]:
# Predicted labels from the gradient-boosting model for the test matrix.
pred_gbc = GBC.predict(xv_test)

In [None]:
# Test-set accuracy of the gradient-boosting model, computed from the
# predictions already stored in pred_gbc.
accuracy_score(y_test, pred_gbc)

0.9961692650334075

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed, trained on the TF-IDF training matrix.
# fit() returns the estimator, so it is chained onto the constructor; the
# trailing bare name keeps the fitted model as the cell's displayed value.
RFC = RandomForestClassifier(random_state=0).fit(xv_train, y_train)
RFC

In [None]:
# Predicted labels from the random-forest model for the test matrix.
pred_rfc = RFC.predict(xv_test)

In [None]:
# Test-set accuracy of the random forest, computed from the predictions
# already stored in pred_rfc.
accuracy_score(y_test, pred_rfc)

0.9928730512249443