# Loading Necessary Modules

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import matplotlib

# Loading Data

In [None]:
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

In [None]:
PATH='/kaggle/input/fake-and-real-news-dataset'
TRUE_FILE_PATH=os.path.join(PATH,'True.csv')
FAKE_FILE_PATH=os.path.join(PATH,'Fake.csv')

In [None]:
# Read both CSV files, then build one class label per row of each frame.
true_data_df = pd.read_csv(TRUE_FILE_PATH)
fake_data_df = pd.read_csv(FAKE_FILE_PATH)
true_class = ['True'] * len(true_data_df)
fake_class = ['Fake'] * len(fake_data_df)

# Exploratory Data Analysis and Preprocessing

In [None]:
labels=['True','Fake']
class_wise_counts=[true_data_df.shape[0],fake_data_df.shape[0]]

In [None]:
# Bar chart of the number of rows in each class, followed by the fake/true row ratio.
# The figure size is set through rcParams, so it stays in effect for later plots
# until it is changed again.
matplotlib.rcParams['figure.figsize']=(10,10)
plt.bar(labels,class_wise_counts,align='center', alpha=0.5,color='r')
plt.xlabel('Classes')
plt.ylabel('Counts')
plt.title('Count vs Classes')
plt.show()
# Row count of the fake frame divided by the row count of the true frame.
print ("Ratio of fake is to real news:",(fake_data_df.shape[0]/true_data_df.shape[0]))

In [None]:
true_data_df['class']=true_class
fake_data_df['class']=fake_class

In [None]:
fake_data_df['class']=fake_class

In [None]:
true_data_df.head()

In [None]:
fake_data_df.head()

In [None]:
# Stack the true and fake frames vertically. ignore_index=True gives the
# combined frame a fresh 0..n-1 index; without it the index labels of the two
# source frames are carried over, so labels repeat and label-based lookups
# become ambiguous.
data_frame=pd.concat([true_data_df,fake_data_df],axis='index',ignore_index=True)

In [None]:
data_frame.isnull().sum()

In [None]:
data_frame.head()

In [None]:
data_frame.date.value_counts()

## The date column contains a lot of unique values, so not much value can be extracted from it; hence dropping it for now

In [None]:
data_frame.drop('date',axis='columns',inplace=True)

In [None]:
data_frame.head()

## Looking at the subject feature

In [None]:
data_frame.subject.unique()

In [None]:
real_news_df=data_frame[data_frame.subject=='politicsNews']

In [None]:
real_news_df.shape

In [None]:
# For each class, collect the distinct subject values and how many rows carry each one.
# np.unique with return_counts=True returns the sorted unique values and their counts.
(fake_subject_keys,fake_counts)=np.unique(data_frame[data_frame['class']=='Fake'].subject,return_counts=True)
(true_subject_keys,true_counts)=np.unique(data_frame[data_frame['class']=='True'].subject,return_counts=True)

In [None]:
matplotlib.rcParams['figure.figsize']=(10,10)
plt.bar(fake_subject_keys,fake_counts,align='center', alpha=0.5,color='g')
plt.xlabel('Subjects')
plt.ylabel('Counts')
plt.title('FakeNewsCounts vs Subjects')
plt.show()

In [None]:
matplotlib.rcParams['figure.figsize']=(10,7)
plt.bar(true_subject_keys,true_counts,align='center', alpha=0.5,color='b')
plt.xlabel('Subjects')
plt.ylabel('Counts')
plt.title('TrueNewsCounts vs Subjects')
plt.show()

## So only the politicsNews and worldnews subjects contain true news; all the remaining subjects contain fake news

## Converting the subject feature into one hot encoded features

In [None]:
subject_dummies=pd.get_dummies(data_frame.subject)

In [None]:
data_frame2=pd.concat([data_frame,subject_dummies],axis='columns')

## Cleaning the title and text separately

In [None]:
title_column=list(data_frame2.title)
text_column=list(data_frame2.text)

In [None]:
title_column[0]

## Cleaning the title and text columns using NLTK

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

In [None]:
# Keep the stop words in a set: clean_data tests membership once per token,
# and a set lookup is constant time where a list has to be scanned.
# Each punctuation character is added as well so punctuation tokens are filtered out.
stop_words=set(stopwords.words('english'))
stop_words.update(string.punctuation)

In [None]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb
    respectively; every other tag falls back to noun.
    """
    pos_by_prefix = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing (rather than indexing) keeps an empty tag on the noun fallback.
    return pos_by_prefix.get(treebank_tag[:1], wordnet.NOUN)

In [None]:
lemmatizer=WordNetLemmatizer()

def clean_data(text):
    """Tokenize *text* and return its filtered, lemmatized words.

    A token is dropped when its lowercase form is in ``stop_words`` or when it
    consists only of digits. Every remaining token is lemmatized with the
    WordNet part of speech derived from its POS tag.

    Args:
        text: The raw string to clean.

    Returns:
        A list of lemmatized words, in their original order.
    """
    words = word_tokenize(text)
    if not words:
        return []

    # Tag the whole token sequence in a single call. Tagging one word at a
    # time hides the neighbouring words from the tagger and pays the call
    # overhead once per token.
    tagged_words = pos_tag(words)

    return [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in tagged_words
        if word.lower() not in stop_words and not word.isdigit()
    ]

clean_title_column=[clean_data(current_column) for current_column in title_column]


In [None]:
clean_title_column[0]

In [None]:
clean_text_column=[clean_data(current_column) for current_column in text_column]

## Now we have a list of lists, where each item contains the words of the current text that are not stop words.

## Vectorising them so that important words can be extracted from it for converting into features

In [None]:
clean_title_column_list=[" ".join(list_words) for list_words in clean_title_column]
clean_text_column_list=[" ".join(list_words) for list_words in clean_text_column]

In [None]:
data_frame2['title']=clean_title_column_list
data_frame2['text']=clean_text_column_list

## Shuffling the dataframe so that we can split into train and test sets

In [None]:
from sklearn.utils import shuffle
# Fix the seed so the shuffle, and therefore the train/test split and every
# reported metric, is the same on each run of the notebook.
data_frame3 = shuffle(data_frame2, random_state=42)

In [None]:
data_frame3.reset_index(inplace=True, drop=True)

## Splitting the data into 75% for training and 25% for testing

In [None]:
# First 75% of the shuffled rows, selected by position. .loc slicing is
# label-based and includes its end label, so `.loc[:k]` would also take row k,
# which is the first row of the test split. .iloc excludes the end position.
# .copy() makes this an independent frame so the later in-place column drops
# do not operate on a slice of data_frame3.
train_dataframe=data_frame3.iloc[:int(0.75*data_frame3.shape[0]),:].copy()

In [None]:
# Remaining 25% of the shuffled rows, selected by position.
# .copy() makes this an independent frame: the later in-place drop and
# reset_index calls would otherwise operate on a slice of data_frame3.
test_dataframe=data_frame3.iloc[int(0.75*data_frame3.shape[0]):,:].copy()


In [None]:
yTrain=list(train_dataframe['class'])
yTest=list(test_dataframe['class'])

## The 'class' and 'subject' columns have already been taken care of (labels extracted, subject one-hot encoded), so dropping them.

In [None]:
# Rebind to the result of drop() instead of dropping in place: both frames are
# slices of data_frame3, and mutating a slice in place raises pandas'
# SettingWithCopyWarning.
train_dataframe=train_dataframe.drop(columns=['class','subject'])
test_dataframe=test_dataframe.drop(columns=['class','subject'])

In [None]:
test_dataframe.reset_index(inplace=True,drop=True)
test_dataframe.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [None]:
train_title_column=list(train_dataframe['title'])
train_text_column=list(train_dataframe['text'])
test_title_column=list(test_dataframe['title'])
test_text_column=list(test_dataframe['text'])

In [None]:
# The raw text columns were copied into lists for vectorisation, so remove
# them from the feature frames. Rebinding to the result of drop() avoids
# mutating a frame that was sliced from another one.
train_dataframe=train_dataframe.drop(columns=['title','text'])
test_dataframe=test_dataframe.drop(columns=['title','text'])

## Vectorisation for 'title' feature

In [None]:
count_vec=CountVectorizer(max_features=5000,ngram_range=(1,2))

In [None]:
train_title_sparse_matrix=count_vec.fit_transform(train_title_column)

In [None]:
test_title_sparse_matrix=count_vec.transform(test_title_column)

In [None]:
test_title_sparse_matrix.shape

## Converting the sparse matrix to dataframe for train and test set

In [None]:
# Wrap the sparse title count matrix in a sparse DataFrame with one column per
# vocabulary entry. get_feature_names_out() is used because
# get_feature_names() was removed in scikit-learn 1.2.
train_dataframe_title = pd.DataFrame.sparse.from_spmatrix(train_title_sparse_matrix,columns=count_vec.get_feature_names_out())

In [None]:
# Same conversion for the test title matrix, using the vocabulary fitted on
# the training titles. get_feature_names_out() is used because
# get_feature_names() was removed in scikit-learn 1.2.
test_dataframe_title=pd.DataFrame.sparse.from_spmatrix(test_title_sparse_matrix,columns=count_vec.get_feature_names_out())

In [None]:
train_dataframe_title.head()

In [None]:
test_dataframe_title.head()

## Adding the features extracted from the 'title' column as features to the train and test set

In [None]:
train_dataframe1=pd.concat([train_dataframe,train_dataframe_title],axis='columns')

In [None]:
train_dataframe1.head()

In [None]:
test_dataframe1=pd.concat([test_dataframe,test_dataframe_title],axis='columns')

In [None]:
test_dataframe1.head()

## Vectorisation for 'text' Column

In [None]:
count_vec_text=CountVectorizer(max_features=5000,ngram_range=(1,2))

In [None]:
train_text_sparse_matrix=count_vec_text.fit_transform(train_text_column)

In [None]:
test_text_sparse_matrix=count_vec_text.transform(test_text_column)

## Converting the sparse matrix to dataframes for the train and test sets

In [None]:
# Wrap the sparse text count matrix in a sparse DataFrame with one column per
# vocabulary entry. get_feature_names_out() is used because
# get_feature_names() was removed in scikit-learn 1.2.
train_dataframe_text = pd.DataFrame.sparse.from_spmatrix(train_text_sparse_matrix,columns=count_vec_text.get_feature_names_out())

In [None]:
train_dataframe_text.head()

In [None]:
# Same conversion for the test text matrix, using the vocabulary fitted on
# the training texts. get_feature_names_out() is used because
# get_feature_names() was removed in scikit-learn 1.2.
test_dataframe_text=pd.DataFrame.sparse.from_spmatrix(test_text_sparse_matrix,columns=count_vec_text.get_feature_names_out())

In [None]:
test_dataframe_text.head()

## Adding the features extracted from 'text' column to the train and test set

In [None]:
train_dataframe2=pd.concat([train_dataframe1,train_dataframe_text],axis='columns')

In [None]:
test_dataframe2=pd.concat([test_dataframe1,test_dataframe_text],axis='columns')

In [None]:
train_dataframe2.head()

In [None]:
test_dataframe2.head()

In [None]:
train_dataframe2.isnull().sum()

In [None]:
test_dataframe2.isnull().sum()

In [None]:
train_dataframe2.shape

In [None]:
test_dataframe2.shape

In [None]:
# yTrain and yTest are plain Python lists, which have no .shape attribute,
# so their sizes are reported with len().
train_dataframe2.shape,test_dataframe2.shape,len(yTrain),len(yTest)

In [None]:
# Take the NumPy array representation of the feature frames for the estimators below.
# NOTE(review): the title/text columns are sparse, so this presumably
# materialises dense arrays — confirm memory use is acceptable.
xTrain=train_dataframe2.values
xTest=test_dataframe2.values

In [None]:
# yTrain and yTest are plain Python lists, which have no .shape attribute,
# so their sizes are reported with len().
xTrain.shape,xTest.shape,len(yTrain),len(yTest)

# Training model

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(max_iter=1000)
lr.fit(xTrain,yTrain)

In [None]:
yPredicted=lr.predict(xTest)

In [None]:
lr.score(xTest,yTest)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

In [None]:
print (confusion_matrix(yTest,yPredicted))

In [None]:
print (classification_report(yTest,yPredicted))

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seed the forest so its bootstrap samples and feature choices, and therefore
# the reported scores, are the same on every run.
clf_rf=RandomForestClassifier(random_state=42)
clf_rf.fit(xTrain,yTrain)

In [None]:
clf_rf.score(xTest,yTest)

In [None]:
yPredicted_rf=clf_rf.predict(xTest)

In [None]:
print (confusion_matrix(yTest,yPredicted_rf))

In [None]:
print (classification_report(yTest,yPredicted_rf))

## Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
clf_mnb=MultinomialNB()

In [None]:
clf_mnb.fit(xTrain,yTrain)

In [None]:
clf_mnb.score(xTest,yTest)

In [None]:
yPredicted_mnb=clf_mnb.predict(xTest)

In [None]:
# Printed explicitly, like the confusion matrices of the other two models,
# so the output does not depend on this being the last expression in the cell.
print (confusion_matrix(yTest,yPredicted_mnb))

In [None]:
print (classification_report(yTest,yPredicted_mnb))