# Fake News Detection

#### Importing the Dependencies

In [None]:
#core python libraries
import pandas as pd
import numpy as np
import re
import string

#NLP Tools(Text Processing)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#Machine Learning(Modelling)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Fetch the NLTK resources this notebook relies on: the stop-word lists
# (read via stopwords.words) and the WordNet data used by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

#### Loading the Data

In [None]:
# Read the two source files (paths are relative to the working directory)
# into separate DataFrames so each can be labelled before merging.
data_fake = pd.read_csv('Fake.csv')
data_true = pd.read_csv('True.csv')

#### Labelling the Data

In [None]:
# Add the target column: 0 for every row loaded from Fake.csv,
# 1 for every row loaded from True.csv.
data_fake['class'] = 0
data_true['class'] = 1

#### Combining and shuffling the data

In [None]:
# Stack the fake and true rows into a single frame, then shuffle all rows so
# the two classes are interleaved.
data = pd.concat([data_fake, data_true], axis = 0)
# Seed the shuffle (same seed as the train/test split) so that re-running the
# notebook produces the same row order and therefore the same metrics.
data = data.sample(frac=1, random_state=2569)

In [None]:
# Preview the shuffled rows; the index still shows each row's original label.
data.head()

In [None]:
# Replace the shuffled index with a fresh 0..n-1 range; drop=True discards the
# old index instead of keeping it as a column.
data = data.reset_index(drop = True)

In [None]:
# Preview again to confirm the index was renumbered.
data.head()

In [None]:
# Keep only the columns not listed here; title, subject and date are not used
# by the rest of the notebook.
unused_columns = ['title', 'subject', 'date']
data = data.drop(columns = unused_columns)

In [None]:
# Preview the frame after the unused columns were dropped.
data.head()

#### Defining the clean_text function and applying it to the data

In [None]:
# Built once at notebook level and reused by clean_text for every document:
# a set of English stop words (set gives fast membership tests) and a single
# shared WordNet lemmatizer instance.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize one raw document into a space-separated string of lemmas.

    Steps, in order: lowercase, strip URLs, strip HTML tags, replace every
    non-letter with a space, collapse whitespace, drop English stop words,
    and lemmatize the remaining words.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example the float NaN
        pandas uses for a missing cell) is treated as an empty document.

    Returns
    -------
    str
        The cleaned words joined by single spaces; '' when nothing is left.
    """
    if not isinstance(text, str):
        # .lower() would raise on NaN/None; an empty document is the safe result.
        return ''

    text = text.lower() #lowercase
    # The previous pattern 'www.\.\S+' required two dots after "www" and so
    # never matched ordinary "www.example.com" links.
    text = re.sub(r'https?://\S+|www\.\S+', '', text) #removes links
    text = re.sub(r'<.*?>+', '', text) # removes HTML tags
    text = re.sub(r'[^a-zA-Z]', ' ', text) #removes special characters and numbers
    text = re.sub(r'\s+', ' ', text).strip() #removes extra spaces

    # Stop words are filtered on the raw word, before lemmatization.
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]

    return ' '.join(words)

In [None]:
# Run clean_text over every row and overwrite the raw text column with the
# cleaned version.
data['text'] = data['text'].apply(clean_text)

#### Creating Dependent and Independent Variables

In [None]:
# x: the cleaned article text (model input); y: the 0/1 class label (target).
x = data['text']
y = data['class']

#### Splitting training and testing data

In [None]:
# Hold out 25% of the rows for testing. stratify=y keeps the class proportions
# the same in both parts, and the fixed random_state makes the split repeatable
# for a given row order.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 2569, stratify = y)

In [None]:
# Print per-class row counts for the training and test labels to check that
# the split is balanced.
print(y_train.value_counts(), '\n') # class counts in the training labels
print(y_test.value_counts()) # class counts in the test labels

#### Text Vectorization with TF-IDF

In [None]:
# Turn the cleaned text into TF-IDF feature matrices. The vectorizer is fitted
# on the training text only; the test text is transformed with that same
# fitted vocabulary, so nothing is learned from the test set.
vectorizer = TfidfVectorizer()
xv_train = vectorizer.fit_transform(x_train)
xv_test = vectorizer.transform(x_test)

#### Defining a function for evaluating the Model

In [None]:
def model_eval(y_pred):
    """Print the accuracy and the classification report for *y_pred*.

    The predictions are scored against the notebook-level ``y_test`` labels.
    Nothing is returned; both metrics are written with ``print``.
    """
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy of the Model: ', accuracy)

    report = classification_report(y_test, y_pred)
    print('Classification Report:\n', report)

#### Defining a function for Visualizing Confusion Matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def visualize_cm(y_pred):
    """Plot the confusion matrix of *y_pred* against ``y_test`` as a heatmap.

    Cells are annotated with integer counts; the y axis is labelled 'Actual'
    and the x axis 'Predicted'. Returns whatever ``plt.show()`` returns.
    """
    matrix = confusion_matrix(y_test, y_pred)
    sns.heatmap(matrix, cmap = 'Blues', fmt = 'd', annot = True)

    # Labels and title are independent of each other, so order does not matter.
    plt.title('Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')

    shown = plt.show()
    return shown

#### Model Training and Visualizing Confusion Matrix

##### Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression classifier with library-default settings on the
# TF-IDF training matrix.
LR_Model = LogisticRegression()
LR_Model.fit(xv_train, y_train)

LR Model Evaluation

In [None]:
# Predict labels for the held-out test matrix and print accuracy plus the
# classification report.
y_pred_lr = LR_Model.predict(xv_test)
model_eval(y_pred_lr)

Visualizing Confusion Matrix for LR Model

In [None]:
# Heatmap of actual vs. predicted labels for the logistic regression model.
visualize_cm(y_pred_lr)

##### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the TF-IDF training matrix. random_state is fixed
# (same seed as the train/test split) so the fitted tree, and the accuracy
# compared against the other models, is the same on every run.
DT = DecisionTreeClassifier(random_state = 2569)
DT.fit(xv_train, y_train)

DT Model Evaluation

In [None]:
# Predict labels for the held-out test matrix and print accuracy plus the
# classification report.
y_pred_dt = DT.predict(xv_test)
model_eval(y_pred_dt)

Visualizing Confusion Matrix for DT Model

In [None]:
# Heatmap of actual vs. predicted labels for the decision tree model.
visualize_cm(y_pred_dt)

##### Gradient Boosting Classifier Model

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient boosting classifier on the TF-IDF training matrix.
# random_state is fixed (same seed as the train/test split) so the fitted
# ensemble, and the accuracy compared against the other models, is repeatable.
GB = GradientBoostingClassifier(random_state = 2569)
GB.fit(xv_train, y_train)

GB Model Evaluation

In [None]:
# Predict labels for the held-out test matrix and print accuracy plus the
# classification report.
y_pred_gb = GB.predict(xv_test)
model_eval(y_pred_gb)

Visualizing Confusion Matrix for GB Model

In [None]:
# Heatmap of actual vs. predicted labels for the gradient boosting model.
visualize_cm(y_pred_gb)

##### Random Forest Classifier Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the TF-IDF training matrix. random_state is fixed
# (same seed as the train/test split) so the bootstrap samples and feature
# choices, and therefore the reported accuracy, are the same on every run.
RF = RandomForestClassifier(random_state = 2569)
RF.fit(xv_train, y_train)

RF Model Evaluation

In [None]:
# Predict labels for the held-out test matrix and print accuracy plus the
# classification report.
y_pred_rf = RF.predict(xv_test)
model_eval(y_pred_rf)

Visualizing Confusion Matrix for RF Model

In [None]:
# Heatmap of actual vs. predicted labels for the random forest model.
visualize_cm(y_pred_rf)

#### Saving the Best Model and Vectorizer

In [None]:
import pickle 

# Persist the fitted classifier and the fitted TF-IDF vectorizer.
# The previous code dumped the undefined name `LR` (NameError); the logistic
# regression model in this notebook is `LR_Model`.
# NOTE(review): the earlier comment cited GradientBoostingClassifier as the most
# accurate model; if that is the one that should be saved, dump `GB` instead.
# `with` guarantees each file is flushed and closed even if pickling fails.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(LR_Model, model_file)

with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file) # saving the TF-IDF vectorizer

In [None]:
# Show the current working directory; model.pkl and vectorizer.pkl are
# written there because they are saved with relative paths.
import os
os.getcwd()

In [None]:
!streamlit run app.py
5+

In [None]:
# Show the current working directory again (same check as the earlier cell).
import os
os.getcwd()