# Classification of Fake and Real news

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')
import re
import nltk
from nltk.corpus import stopwords
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from scipy.sparse import hstack
from scipy.sparse import save_npz

# reading data

In [None]:
fake_data=pd.read_csv('/content/drive/My Drive/news_dataset/Fake.csv')
real_data=pd.read_csv('/content/drive/My Drive/news_dataset/True.csv')

In [None]:
fake_data.head()

In [None]:
real_data.head()

# Setting the label as a new column "True": its value is 1 if the news is real, otherwise 0 (fake)

In [None]:
real_data['True']=1

In [None]:
fake_data['True']=0

# Concatenating both true and false datasets to create a single dataset consisting of both true and false news

In [None]:
data=pd.concat([real_data,fake_data],axis=0)

# shuffling the data to get True and False news data points in random order

In [None]:
# Shuffle the rows. random_state pins the permutation so that Restart & Run All
# reproduces the same order; reset_index() keeps the old labels in an 'index' column.
data=data.sample(frac=1,random_state=42).reset_index()

In [None]:
data.reset_index()

In [None]:
data.head()

In [None]:
# Remove the helper 'index' column left behind by reset_index().
# errors='ignore' keeps the cell re-runnable once the column is already gone.
data=data.drop(columns=['index'],errors='ignore')

In [None]:
data.head()

In [None]:
data[data['True']==1].shape

In [None]:
data[data['True']==0].shape

In [None]:
data.shape

In [None]:
data.to_csv('complete_data.csv')

**there are 21,417 true news data points, 23,481 false news data points, in total there are 44,898 data points**

In [None]:
data['subject'].value_counts()

# Pre-processing text data

## downloading stopwords from nltk

In [None]:
# Fetch only the stopword corpus used below; a bare nltk.download() launches the
# interactive downloader, which stalls a non-interactive Run All.
nltk.download('stopwords')

In [None]:
stop_words = set(stopwords.words('english')) 

# Functions for preprocessing text data

In [None]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the new string.

    The two irregular forms ("won't", "can't") are rewritten first; the generic
    suffix rules are then applied one after another in a fixed order.
    """
    # (pattern, replacement) pairs -- order matters: the irregular forms must be
    # consumed before the generic "n't" rule sees them.
    rules = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [None]:
from tqdm import tqdm
def preprocess_text(text_data):
    """Clean every document in ``text_data`` and return the cleaned strings as a list.

    For each document: contractions are expanded, the literal backslash
    sequences for r, n and double-quote are blanked out, every run of
    non-alphanumeric characters becomes a single space, tokens found in
    ``stop_words`` (compared in lower case) are removed, and the result is
    lower-cased and stripped.
    """
    def _clean(document):
        cleaned = decontracted(document)
        for literal in ('\\r', '\\n', '\\"'):
            cleaned = cleaned.replace(literal, ' ')
        cleaned = re.sub('[^A-Za-z0-9]+', ' ', cleaned)
        kept = [token for token in cleaned.split() if token.lower() not in stop_words]
        return ' '.join(kept).lower().strip()

    # tqdm only wraps the iterable to display a progress bar
    return [_clean(document) for document in tqdm(text_data)]

## pre-processing 'title' feature

In [None]:
preprocessed_title = preprocess_text(data['title'].values)

In [None]:
preprocessed_title[:5]

## pre-processing 'text' feature

In [None]:
preprocessed_text = preprocess_text(data['text'].values)

In [None]:
preprocessed_text[:5]

In [None]:
data['preprocessed_title']=preprocessed_title
data['preprocessed_text']=preprocessed_text

In [None]:
# Number of words in each cleaned title. split() without an argument returns an empty
# list for an empty string, so an empty title counts as 0 words (split(' ') would give 1).
data['length_of_title']=data['preprocessed_title'].apply(lambda x:len(x.split()))

In [None]:
# Number of words in each cleaned article body. split() without an argument returns an
# empty list for an empty string, so an empty text counts as 0 words (split(' ') would give 1).
data['length_of_text']=data['preprocessed_text'].apply(lambda x:len(x.split()))

In [None]:
data['subject'].value_counts()

In [None]:
data.head()

# doing some exploration

In [None]:
# let us see if 'length_of_title' feature can differentiate between fake(0) and true(1) news
plt.figure(figsize=(6,8))
sns.boxplot(x='True',y='length_of_title',data=data)
plt.show()

**We can clearly see that the 'length_of_title' feature distinguishes between fake news and true news. This might even prove to be the most useful feature. True news has shorter titles than fake news.**

In [None]:
# let us see if 'length_of_text' feature can differentiate between fake(0) and true(1) news
plt.figure(figsize=(6,8))
sns.boxplot(x='True',y='length_of_text',data=data)
plt.show()

**'length_of_text' does not seem to be a good feature for classification**

In [None]:
type(data['date'][0])

In [None]:
data['date'][0]

In [None]:
# Preview of the parsed dates (author's note on the expected format: %m %d, %y -- TODO confirm).
# errors='coerce' turns entries that cannot be parsed into NaT instead of raising, so the
# malformed 'date' strings that are identified and removed further down do not abort this cell.
pd.to_datetime(data['date'],errors='coerce')

### there are some data points for which 'date' feature has some other lengthy non date strings. identifying such rows and removing them

In [None]:
# Index labels of the rows whose 'date' string is longer than 20 characters.
# The vectorised .str.len() mask replaces a row-by-row iterrows() loop and simply
# skips missing values instead of failing on them.
ind=data.index[data['date'].str.len()>20].tolist()


In [None]:
ind

In [None]:
for i in ind:
  print(data.loc[i,'date'])

In [None]:
# Drop the malformed rows and renumber the index 0..n-1. The sentiment frames built later
# get a default RangeIndex and are attached with pd.concat(axis=1), which aligns on index
# labels; gaps left here would pair scores with the wrong rows and introduce NaN rows.
data=data.drop(index=ind).reset_index(drop=True)

In [None]:
# format: %m %d, %y
data['date']=pd.to_datetime(data['date'])

In [None]:
data.head()

In [None]:
data['subject'].unique()

In [None]:
plt.figure(figsize=(14,6))
sns.countplot(x='subject',hue='True',data=data)

**So fake news appears under the subjects 'politics', 'left-news', 'News', 'US_News', 'Government News' and 'Middle-east', while true news appears under the subjects 'politicsNews' and 'worldnews'.**

**So we may come up with a feature such that if subject is 'politicsNews' or 'worldnews' then there is a high probability that news is true and if subject is other than these two then there is high probability that the news is false. But we do not want to be deterministic about this.**

In [None]:
plt.figure(figsize=(8,8))
sns.scatterplot(x='length_of_title',y='length_of_text',hue='True',data=data)

**There does seem to be some clustering, but the points also overlap to some extent; fake and true data points seem to be partly separable based on the lengths of title and text.**

## Writing a function to calculate average length of word in a document/sentence

In [None]:
def avg_length_of_words(sent):
    """Return the mean character count per space-separated token of ``sent``, rounded to 2 decimals."""
    tokens = sent.split(' ')
    # split(' ') always yields at least one (possibly empty) token, so the division is safe
    total_chars = sum(len(token) for token in tokens)
    return np.round(total_chars / len(tokens), 2)


## adding columns 'avg_length_title' and 'avg_length_text'

In [None]:
data['avg_length_title']=data['title'].apply(avg_length_of_words)
data['avg_length_text']=data['text'].apply(avg_length_of_words)

In [None]:
data.head()

## let us see if these new features are useful

In [None]:
plt.figure(figsize=(8,6))
sns.boxplot(x='True',y='avg_length_title',data=data)
plt.show()

In [None]:
plt.figure(figsize=(8,6))
sns.boxplot(x='True',y='avg_length_text',data=data)
plt.show()

**not too useful but in case of 'avg_length_text' there seem to be too many outliers i.e. too many false news text have average length of words very high. this might be useful**

## getting sentiment scores

In [None]:
nltk.download('vader_lexicon')

In [None]:
pip install twython

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

## 'title' sentiment scores

In [None]:
# Score every raw title with VADER, then split the four score components into separate lists
sid = SentimentIntensityAnalyzer()
title_scores = [sid.polarity_scores(title) for title in data['title'].values]
title_neg_sent = [scores['neg'] for scores in title_scores]
title_pos_sent = [scores['pos'] for scores in title_scores]
title_neu_sent = [scores['neu'] for scores in title_scores]
title_compound_sent = [scores['compound'] for scores in title_scores]

In [None]:
title_neg_sent=np.array(title_neg_sent)
title_pos_sent=np.array(title_pos_sent)
title_neu_sent=np.array(title_neu_sent)
title_compound_sent=np.array(title_compound_sent)

## 'text' sentiment scores

In [None]:
# Score every raw article body with VADER, then split the four score components into separate lists
sid = SentimentIntensityAnalyzer()
text_scores = [sid.polarity_scores(body) for body in data['text'].values]
text_neg_sent = [scores['neg'] for scores in text_scores]
text_pos_sent = [scores['pos'] for scores in text_scores]
text_neu_sent = [scores['neu'] for scores in text_scores]
text_compound_sent = [scores['compound'] for scores in text_scores]

In [None]:
text_neg_sent=np.array(text_neg_sent)
text_pos_sent=np.array(text_pos_sent)
text_neu_sent=np.array(text_neu_sent)
text_compound_sent=np.array(text_compound_sent)

### Adding these to the main data dataframe

#### adding text sentiments to 'data' dataframe(main data frame)

In [None]:
text_sentiment=np.vstack((text_pos_sent,text_neg_sent,text_neu_sent,text_compound_sent)).T

In [None]:
text_sentiment

In [None]:
text_sentiment_df=pd.DataFrame(data=text_sentiment,columns=['text_pos_sent','text_neg_sent','text_neu_sent','text_compound_sent'])

In [None]:
# pd.concat(axis=1) aligns on index labels. text_sentiment_df has a default 0..n-1 index,
# while rows dropped from `data` earlier may have left gaps in its index, so renumber
# `data` first to pair every row with its own scores.
data=pd.concat([data.reset_index(drop=True),text_sentiment_df],axis=1)

#### adding title sentiments to 'data' dataframe(main data frame)

In [None]:
title_sentiment=np.vstack((title_pos_sent,title_neg_sent,title_neu_sent,title_compound_sent)).T

In [None]:
title_sentiment_df=pd.DataFrame(data=title_sentiment,columns=['title_pos_sent','title_neg_sent','title_neu_sent','title_compound_sent'])

In [None]:
data=pd.concat([data,title_sentiment_df],axis=1)

In [None]:
data.head(1)

## let us explore these sentiment score features a little

In [None]:
plt.figure(figsize=(8,6))
sns.boxplot(x='True',y='title_pos_sent',data=data)
plt.show()

In [None]:
plt.figure(figsize=(8,6))
sns.boxplot(x='True',y='title_neg_sent',data=data)
plt.show()

In [None]:
plt.figure(figsize=(8,6))
sns.boxplot(x='True',y='title_neu_sent',data=data)
plt.show()

In [None]:
plt.figure(figsize=(8,6))
sns.boxplot(x='True',y='title_compound_sent',data=data)
plt.show()

In [None]:
plt.figure(figsize=(8,6))
sns.boxplot(x='True',y='text_pos_sent',data=data)
plt.show()

In [None]:
plt.figure(figsize=(8,6))
sns.boxplot(x='True',y='text_neg_sent',data=data)
plt.show()

In [None]:
plt.figure(figsize=(8,6))
sns.boxplot(x='True',y='text_neu_sent',data=data)
plt.show()

In [None]:
plt.figure(figsize=(8,6))
sns.boxplot(x='True',y='text_compound_sent',data=data)
plt.show()

**So basic boxplot analysis shows that these sentiment scores may not be very useful for classification. Still, we will keep them: we will train a model with these features, then train a model without them, and compare the performance of the two to see which is better.**

## let us see if the 'date' feature helps us to classify false and true news

In [None]:
# Publication month (1-12) per class, read through the datetime accessor
plt.figure(figsize=(8,6))
sns.boxplot(x=data['True'],y=data['date'].dt.month)

In [None]:
# Day of the week (Monday=0) per class, read through the datetime accessor
plt.figure(figsize=(8,6))
sns.boxplot(x=data['True'],y=data['date'].dt.weekday)

In [None]:
# Day of the month per class, read through the datetime accessor
plt.figure(figsize=(8,6))
sns.boxplot(x=data['True'],y=data['date'].dt.day)

**month might be a little useful but not much**

In [None]:
data.to_csv('data_stage_1.csv')

loading data


In [None]:
# Reload the stage-1 feature table, using its 'date' column as the index.
# NOTE(review): the preceding cell writes 'data_stage_1.csv' to the working directory,
# whereas this cell reads it from the Drive folder -- confirm the file was copied there.
data=pd.read_csv('/content/drive/My Drive/news_dataset/data_stage_1.csv',index_col='date')

In [None]:
data.head()

In [None]:
# 'Unnamed: 0' is the old row index that to_csv wrote as an extra column.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
data=data.drop(columns=['Unnamed: 0'],errors='ignore')

In [None]:
data.head(1)

In [None]:
sns.heatmap(data.isna())

In [None]:
data.dropna(inplace=True)

In [None]:
sns.heatmap(data.isnull())

# Featurization of text data

Using pretrained global vectors model for getting word embeddings

In [None]:
import pickle
# NOTE: pickle.load can execute arbitrary code -- only load a file you created or trust.
with open('/content/drive/My Drive/news_dataset/glove_vectors', 'rb') as glove_file:
    model = pickle.load(glove_file)
# Keys of the loaded mapping (presumably the words that have a pretrained vector),
# kept as a set for fast membership checks
glove_words = set(model.keys())

We need to build the vocabulary from the train data only, to avoid data leakage. Hence we split the data into train and test sets first.

## Train-Test Split

In [None]:
Y=data['True'].values

In [None]:
X=data.drop(columns=['True'])

In [None]:
from sklearn.model_selection import train_test_split
# Stratified 70/30 split; random_state fixes the partition so every run of this cell
# yields the same train/test rows.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, stratify=Y, random_state=42)

In [None]:
# Stratified 70/30 split; random_state fixes the partition so re-running this cell
# cannot silently replace the train/test rows with a different random split.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, stratify=Y, random_state=42)

## TF-IDF weighted W2V vectorization of 'title' feature

### train data

In [None]:
# Fit TF-IDF on the training titles only, so the vocabulary and IDF weights never see test data
tfidf_model = TfidfVectorizer()
tfidf_model.fit(x_train['preprocessed_title'])
# Map each vocabulary word to its IDF weight. vocabulary_ (word -> column index) is used
# rather than get_feature_names(), which newer scikit-learn releases no longer provide.
dictionary = {word: tfidf_model.idf_[col] for word, col in tfidf_model.vocabulary_.items()}
tfidf_words = set(dictionary)

In [None]:
# Computing the TF-IDF weighted average word vector for each training title.
from collections import Counter
from tqdm import tqdm
tfidf_w2v_vectors_title_tr = []  # one weighted-average vector per training title
for sentence in tqdm(x_train['preprocessed_title']):
    words = sentence.split()  # tokenise once instead of once per word
    word_counts = Counter(words)  # whole-token counts; str.count would also match substrings
    vector = np.zeros(300)  # accumulator (assumes 300-dimensional word vectors); stays zero if no word qualifies
    tf_idf_weight = 0  # running sum of the TF-IDF weights that were applied
    for word in words:
        # only words with a pretrained vector AND an IDF weight from the train vocabulary contribute
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word]
            # tf = occurrences of this token / number of tokens in the title; idf = dictionary[word]
            tf_idf = dictionary[word]*(word_counts[word]/len(words))
            vector += (vec * tf_idf)
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight  # normalise to a weighted average
    tfidf_w2v_vectors_title_tr.append(vector)


### test data

Using vocabulary from train data only 

In [None]:
# Computing the TF-IDF weighted average word vector for each test title,
# using the vocabulary and IDF weights fitted on the training titles.
from collections import Counter
from tqdm import tqdm
tfidf_w2v_vectors_title_te = []  # one weighted-average vector per test title
for sentence in tqdm(x_test['preprocessed_title']):
    words = sentence.split()  # tokenise once instead of once per word
    word_counts = Counter(words)  # whole-token counts; str.count would also match substrings
    vector = np.zeros(300)  # accumulator (assumes 300-dimensional word vectors); stays zero if no word qualifies
    tf_idf_weight = 0  # running sum of the TF-IDF weights that were applied
    for word in words:
        # only words with a pretrained vector AND an IDF weight from the train vocabulary contribute
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word]
            # tf = occurrences of this token / number of tokens in the title; idf = dictionary[word]
            tf_idf = dictionary[word]*(word_counts[word]/len(words))
            vector += (vec * tf_idf)
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight  # normalise to a weighted average
    tfidf_w2v_vectors_title_te.append(vector)


In [None]:
len(tfidf_w2v_vectors_title_tr[0])

## TF-IDF weighted W2V vectorization of 'text' feature

getting tfidf vocab for 'text' based on train data only

In [None]:
# Fit TF-IDF on the training article bodies only, so the vocabulary and IDF weights never see test data
tfidf_model = TfidfVectorizer()
tfidf_model.fit(x_train['preprocessed_text'])
# Map each vocabulary word to its IDF weight. vocabulary_ (word -> column index) is used
# rather than get_feature_names(), which newer scikit-learn releases no longer provide.
dictionary = {word: tfidf_model.idf_[col] for word, col in tfidf_model.vocabulary_.items()}
tfidf_words = set(dictionary)

Vectorizing 'text' of train data

In [None]:
# Computing the TF-IDF weighted average word vector for each training article body.
from collections import Counter
from tqdm import tqdm
tfidf_w2v_vectors_text_tr = []  # one weighted-average vector per training article
for sentence in tqdm(x_train['preprocessed_text']):
    words = sentence.split()  # tokenise once; re-splitting per word is quadratic on long articles
    word_counts = Counter(words)  # whole-token counts; str.count would also match substrings
    vector = np.zeros(300)  # accumulator (assumes 300-dimensional word vectors); stays zero if no word qualifies
    tf_idf_weight = 0  # running sum of the TF-IDF weights that were applied
    for word in words:
        # only words with a pretrained vector AND an IDF weight from the train vocabulary contribute
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word]
            # tf = occurrences of this token / number of tokens in the article; idf = dictionary[word]
            tf_idf = dictionary[word]*(word_counts[word]/len(words))
            vector += (vec * tf_idf)
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight  # normalise to a weighted average
    tfidf_w2v_vectors_text_tr.append(vector)


vectorizing 'text' of test data using vocab of train data only

In [None]:
# Computing the TF-IDF weighted average word vector for each test article body,
# using the vocabulary and IDF weights fitted on the training articles.
from collections import Counter
from tqdm import tqdm
tfidf_w2v_vectors_text_te = []  # one weighted-average vector per test article
for sentence in tqdm(x_test['preprocessed_text']):
    words = sentence.split()  # tokenise once; re-splitting per word is quadratic on long articles
    word_counts = Counter(words)  # whole-token counts; str.count would also match substrings
    vector = np.zeros(300)  # accumulator (assumes 300-dimensional word vectors); stays zero if no word qualifies
    tf_idf_weight = 0  # running sum of the TF-IDF weights that were applied
    for word in words:
        # only words with a pretrained vector AND an IDF weight from the train vocabulary contribute
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word]
            # tf = occurrences of this token / number of tokens in the article; idf = dictionary[word]
            tf_idf = dictionary[word]*(word_counts[word]/len(words))
            vector += (vec * tf_idf)
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight  # normalise to a weighted average
    tfidf_w2v_vectors_text_te.append(vector)


The resulting feature lists are `tfidf_w2v_vectors_title_tr`, `tfidf_w2v_vectors_title_te`,
`tfidf_w2v_vectors_text_tr` and `tfidf_w2v_vectors_text_te`.

In [None]:
x_train.head(2)

# One hot encoding 'Subject' feature

In [None]:
# Treat each subject string as a single token so every distinct subject gets exactly one
# column. The default tokenizer would lower-case the value and split multi-word subjects
# (e.g. 'Government News') into several tokens, which is not a one-hot encoding.
vectorizer = CountVectorizer(token_pattern=r'^.+$', lowercase=False)
# Fitted on the training subjects only; fit() returns the vectorizer itself
text_bow = vectorizer.fit(x_train['subject'])

In [None]:
subject_onehot_train=vectorizer.transform(x_train['subject'])

In [None]:
subject_onehot_test=vectorizer.transform(x_test['subject'])

# getting the required features for modelling and discarding the rest

# getting X_train data ready in sparse matrix form

In [None]:
updated_x_train=x_train.drop(columns=['title','text','subject','preprocessed_title','preprocessed_text'])

In [None]:
updated_x_train.head(2)

In [None]:
updated_x_train=updated_x_train.values

In [None]:
updated_x_train=np.hstack((updated_x_train,tfidf_w2v_vectors_title_tr,tfidf_w2v_vectors_text_tr))

In [None]:
updated_x_train=sparse.csr_matrix(updated_x_train)

In [None]:
updated_x_train

In [None]:
X_train=hstack([updated_x_train,subject_onehot_train])

# getting X_test data ready in sparse matrix form  

In [None]:
updated_x_test=x_test.drop(columns=['title','text','subject','preprocessed_title','preprocessed_text'])

In [None]:
updated_x_test.head(2)

In [None]:
updated_x_test=updated_x_test.values

In [None]:
updated_x_test=np.hstack((updated_x_test,tfidf_w2v_vectors_title_te,tfidf_w2v_vectors_text_te))

In [None]:
updated_x_test=sparse.csr_matrix(updated_x_test)

In [None]:
updated_x_test

In [None]:
X_test=hstack([updated_x_test,subject_onehot_test])

# checking the lengths of final X_train and X_test data matrices

In [None]:
X_test.shape

In [None]:
X_train.shape

In [None]:
y_test.shape

In [None]:
y_train.shape

In [None]:
y_test

In [None]:
y_train

# saving the model ready data for future use

In [None]:
save_npz('X_train.npz', X_train)
save_npz('X_test.npz', X_test)
np.save('y_test.npy',y_test)
np.save('y_train.npy',y_train)

to load sparse matrix: sparse_matrix = scipy.sparse.load_npz('/tmp/sparse_matrix.npz')

# training GaussianNB 

In [None]:
X_train=X_train.toarray()
X_test=X_test.toarray()

## RandomSearchCV

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score,confusion_matrix,classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import make_scorer
# make_scorer wraps exactly one metric function; ROC AUC is the model-selection metric.
# By default the scorer calls predict(), so the AUC is computed from hard class labels.
scoring = make_scorer(roc_auc_score)

In [None]:
# Candidate var_smoothing values for GaussianNB
params={'var_smoothing':[10**-10,10**-9,10**-8,10**-11,10**-7]}
nb=GaussianNB()
# n_iter is set to the number of candidates (the default of 10 exceeds the 5 available);
# random_state makes the candidate sampling reproducible.
clf=RandomizedSearchCV(estimator=nb,param_distributions=params,n_iter=len(params['var_smoothing']),cv=5,scoring=scoring,random_state=42)
clf.fit(X_train,y_train)

In [None]:
clf.best_estimator_

## training model with best parameters

In [None]:
# Build the final model from the parameters the search selected, rather than a
# hard-coded value that can drift from the search result.
nb=GaussianNB(priors=None,**clf.best_params_)
nb.fit(X_train,y_train)

## predictions on test data and checking performance using confusion matrix and accuracy score

In [None]:
pred=nb.predict(X_test)

In [None]:
cm=confusion_matrix(y_test,pred)

In [None]:
print(cm)

In [None]:
# annot/fmt print the integer count inside every cell so the matrix is readable without a colour bar
sns.heatmap(cm,annot=True,fmt='d',cbar=False)

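# All data points in test data correctly classified. 100% accuracy

**Caveat: the encoded 'subject' feature is part of the model input, and the subject count plot above shows that fake and true news fall under disjoint subject values. A perfect score is therefore expected from 'subject' alone and most likely reflects label leakage rather than a model that generalizes; re-train without 'subject' to get a meaningful estimate.**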

In [None]:
from sklearn.metrics import accuracy_score


In [None]:
accuracy_score(y_test,pred)

# saving the model in a binary file

In [None]:
import pickle

In [None]:
Pkl_Filename = "news_model.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(nb, file)

to load model

In [None]:
with open("news_model.pkl", 'rb') as file:  
    nb = pickle.load(file)

nb