In [None]:
import numpy as np
import pandas as pd

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
!pip install pyspark

In [None]:
import pyspark
from pyspark import SQLContext

# Start a SparkContext with the application name 'Fake News Detection'.
# NOTE(review): re-running this cell in a live kernel presumably fails because a
# SparkContext already exists — consider SparkContext.getOrCreate; confirm.
sc = pyspark.SparkContext(appName='Fake News Detection')

In [None]:
# Wrap the SparkContext `sc` in a SQLContext so CSV files can be read as Spark DataFrames.
sql = pyspark.SQLContext(sc)

# Load the real and fake news CSV files, with the "header" option enabled.
true_df = sql.read.format("com.databricks.spark.csv").option("header", "True").load("../input/fake-and-real-news-dataset/True.csv")
false_df = sql.read.format("com.databricks.spark.csv").option("header", "True").load("../input/fake-and-real-news-dataset/Fake.csv")

# Print a preview of each Spark DataFrame.
# NOTE(review): true_df / false_df are not referenced again in the cells visible here;
# the same files are re-read with pandas further down — confirm the Spark load is still needed.
true_df.show()
false_df.show()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import string

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import load_model


seed = 4353

In [None]:
# Load the real (True.csv) and fake (Fake.csv) news articles into pandas DataFrames.
true = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/True.csv')
fake = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/Fake.csv')

In [None]:
# Inspect 5 randomly sampled rows of the real-news frame.
true.sample(5)

In [None]:
# Inspect the first rows of the fake-news frame.
fake.head()

In [None]:
# Add the target label column to both dataframes: 1 for real news, 0 for fake news.

true['category']=1
fake['category']=0

In [None]:
# Stack the real and fake frames row-wise into a single dataframe.
# No ignore_index is passed, so each frame keeps its own row labels in data_raw.

data_raw = pd.concat([true, fake], axis=0)
# Preview 10 random rows of the combined frame.
data_raw.sample(10)

In [None]:
# Combine title and text into a single string column, `fulltext`,
# then drop the original title and text columns.
# NOTE: the drop happens in place, so re-running this cell on the same data_raw
# fails because `title` and `text` no longer exist.

data_raw['fulltext'] = data_raw.title + ' ' + data_raw.text
data_raw.drop(['title','text'], axis=1, inplace=True)

In [None]:
# Keep only the model input (fulltext) and the target (category),
# renumbering the rows 0..n-1 without keeping the old index as a column.
data = data_raw.loc[:, ['fulltext', 'category']].reset_index(drop=True)

In [None]:
# Count missing values per column of the combined frame (data_raw, not the reduced `data`).

data_raw.isnull().sum()

In [None]:
# Report the size of the modelling frame (rows, columns).
print('The dataset contains {} rows and {} columns'.format(data.shape[0], data.shape[1]))

In [None]:
# Number of articles per subject.
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a bare Series is no longer read as `x`.
plt.figure(figsize =(15,10))
sns.countplot(x='subject', data=data_raw)

In [None]:
# Split the combined texts by label: category 1 is real news, category 0 is fake news.

true_text = data.loc[data.category == 1, 'fulltext']
# The fake texts end up as a one-column DataFrame with a fresh 0..n-1 index.
fake_text = data.loc[data.category == 0, 'fulltext'].reset_index(drop=True).to_frame()

In [None]:
# Function to extract major words from true and fake news

def wordcloud_words(X_data_full):
    """Clean a collection of documents for word-cloud rendering.

    Each document is lower-cased, has punctuation replaced by spaces, has
    English stop words removed, is tokenized, and has every token
    lemmatized. The surviving tokens are joined back into one
    space-separated string. Each stage consumes the output of the
    previous stage.

    Parameters
    ----------
    X_data_full : iterable of str
        Raw documents (for example a pandas Series of article texts).
        Items are iterated in order, so index labels are irrelevant.

    Returns
    -------
    list of str
        One cleaned string per input document, in input order.
    """
    # Build the loop-invariant helpers once instead of once per document.
    punct_table = dict.fromkeys(map(ord, string.punctuation), ' ')
    stopword_pattern = re.compile(
        r'\b(' + r'|'.join(map(re.escape, stopwords.words('english'))) + r')\b\s*'
    )
    lemmatizer = WordNetLemmatizer()

    # function for removing punctuations (also lower-cases the text)
    def remove_punct(X_data_func):
        return X_data_func.lower().translate(punct_table)

    # function to remove stopwords
    def remove_stopwords(X_data_func):
        return stopword_pattern.sub(' ', X_data_func)

    # function for tokenizing
    def tokenize_words(X_data_func):
        return nltk.word_tokenize(X_data_func)

    # function for lemmatizing: the lemmatizer works on single words,
    # so it is applied token by token
    def lemmatize_words(X_data_func):
        return [lemmatizer.lemmatize(token) for token in X_data_func]

    X_data_full_lemmatized_words = []
    for document in X_data_full:
        cleaned_text = remove_stopwords(remove_punct(document))
        tokens = lemmatize_words(tokenize_words(cleaned_text))
        X_data_full_lemmatized_words.append(' '.join(tokens))

    return X_data_full_lemmatized_words

In [None]:
# Run the word-cloud preprocessing on the real and the fake article texts.
true_words = wordcloud_words(true_text)
fake_words = wordcloud_words(fake_text.fulltext)

In [None]:
def plot_wordcloud(text):
    """Draw a word cloud of `text` on the current matplotlib figure.

    The cloud is generated at 1600x800 on a black background with at most
    3000 words. The current figure is cleared before drawing and the axes
    are hidden.
    """
    cloud_options = {
        'background_color': 'black',
        'max_words': 3000,
        'width': 1600,
        'height': 800,
    }
    wordcloud = WordCloud(**cloud_options).generate(text)

    plt.clf()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [None]:
# Word cloud of the real-news texts, drawn on a 20x18 inch figure.
plt.figure(figsize=(20,18))
plot_wordcloud(' '.join(true_words))

In [None]:
# Word cloud of the fake-news texts, drawn on a 20x18 inch figure.
plt.figure(figsize=(20,18))
plot_wordcloud(' '.join(fake_words))

In [None]:
# Concatenate the real and fake frames again under the name `datas`.
# NOTE(review): apart from the following cell, which reshapes it, `datas` does not
# appear to be read by any cell visible here — confirm whether it is still needed.
datas=pd.concat([true,fake])

In [None]:
# Build `fulltext` (title + space + text) on `datas` and drop the two source columns in place.
datas['fulltext'] = datas.title + ' ' + datas.text
datas.drop(['title','text'], axis=1, inplace=True)

In [None]:
# Modelling frame: text and label only, with a fresh 0..n-1 index.
# Selection and index reset are chained so the column selection is really
# applied, and `final` is a new frame that later cells can add columns to.
final = data[['fulltext', 'category']].reset_index(drop=True)

In [None]:
#PREPROCESSING

# Planned cleaning steps:
#   - remove repeated data
#   - remove stop-words
#   - remove punctuation and a limited set of special characters (, . # etc.)
#   - Snowball-stem each word
#   - convert each word to lowercase

# Look for the first article whose text contains an HTML-like tag and print
# its position followed by its content.
import re
html_tag = re.compile('<.*?>')
i = 0
for sent in final['fulltext'].values:
    if html_tag.search(sent):
        print(i)
        print(sent)
        break
    i += 1

In [None]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
# English stop words, stored as a set for membership tests.
stop = set(stopwords.words('english')) 
# Snowball stemmer configured for English.
sno = nltk.stem.SnowballStemmer('english')

def cleanhtml(sentence):
    """Replace every HTML-like tag (shortest ``<...>`` span) in `sentence` with one space."""
    return re.sub('<.*?>', ' ', sentence)
def cleanpunc(sentence):
    """Delete a fixed set of punctuation characters from `sentence`.

    The removed characters are: question mark, pipe, exclamation mark,
    single quote, double quote, hash, period, comma, both parentheses and
    forward slash. Nothing is inserted in their place.
    """
    removal_table = str.maketrans('', '', '?|!\'"#.,)(/')
    return sentence.translate(removal_table)

In [None]:
# PREPROCESSING STEP BY STEP
# For every article: strip HTML tags and punctuation, keep purely alphabetic
# words longer than two characters that are not stop words, lower-case and
# stem them. This takes a while because it visits every word of every article.
import re

final_string = []    # one cleaned, space-joined string per article
all_true_words = []  # stemmed words from real articles (category == 1)
all_fake_words = []  # stemmed words from fake articles (category == 0)

# Iterate text and label together instead of indexing the label column per word.
for sent, category in zip(final['fulltext'].values, final['category'].values):
    filtered_sentence = []
    sent = cleanhtml(sent)  # remove HTML tags
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            lowered = cleaned_words.lower()
            if not cleaned_words.isalpha() or len(cleaned_words) <= 2 or lowered in stop:
                continue
            # Stems are kept as str; the utf8-encoded bytes of the earlier
            # version are not needed by any consumer visible in this notebook.
            s = sno.stem(lowered)
            filtered_sentence.append(s)
            # `category` holds the integers 1/0, so compare against integers;
            # comparing against the strings '1'/'0' never matches.
            if category == 1:
                all_true_words.append(s)
            elif category == 0:
                all_fake_words.append(s)

    # final string of cleaned words for this article
    final_string.append(" ".join(filtered_sentence))

In [None]:
# Attach the cleaned strings to the modelling frame as the `CleanedText` column.
final['CleanedText']=final_string

In [None]:
# Target labels and the cleaned text used as model input.
label=final["category"]
sample=final['CleanedText']

In [None]:
#TRAIN and TEST Split
# Hold out 30% of the rows for testing; random_state=0 fixes the shuffle.

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(sample, label, test_size=0.30, random_state=0)

In [None]:
#IMPORT

from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
import seaborn as sb
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import gensim
from gensim.models import Word2Vec, KeyedVectors
from sklearn.metrics import f1_score

In [None]:
# TFIDF Vectorizer
# Fit TF-IDF over unigrams and bigrams on the training texts only, then apply
# the fitted vectorizer to the test texts.
# NOTE: X_train / X_test are overwritten with the transformed matrices, so
# re-running this cell without re-running the split passes matrices, not text,
# to the vectorizer.

tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
X_train = tf_idf_vect.fit_transform(X_train)
X_test= tf_idf_vect.transform(X_test)

In [None]:
# Model Multinomial NB

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

# Candidate smoothing values: a geometric grid that starts at 1e-4 and grows
# by a factor of 3 while it stays <= 1e4. The values are kept unrounded,
# because rounding to 3 decimals collapses the smallest candidates to
# alpha = 0.0 (no smoothing at all) and duplicates them.
alphas = []
alpha = 0.0001
while alpha <= 10000:
    alphas.append(alpha)
    alpha *= 3
neighbors = alphas  # earlier name of this list, kept as an alias

# mean cross-validated macro-F1 for each candidate alpha
cv_scores = []

# perform 10-fold cross validation
for k in alphas:
    bn = MultinomialNB(alpha = k)
    scores = cross_val_score(bn, X_train, Y_train, cv=10, scoring='f1_macro', n_jobs=-1)
    cv_scores.append(scores.mean())

# determining best value of alpha (the first maximum wins on ties)
optimal_alpha = alphas[cv_scores.index(max(cv_scores))]
# %g keeps small values such as 0.0003 visible instead of printing 0.000
print('\nThe optimal value of alpha is %g.' % optimal_alpha)

In [None]:
# Fit the final Multinomial Naive Bayes model on the training matrix with the selected alpha.
bn_optimal = MultinomialNB(alpha = optimal_alpha)
bn_optimal.fit(X_train, Y_train)

In [None]:
# Show the class labels known to the fitted model.
bn_optimal.classes_

In [None]:
# Now we can find log probabilities of different features for both the classes
class_features = bn_optimal.feature_log_prob_

#  row_0 is for 'Fake' class and row_1 is for 'True' class
Fake_features = class_features[0]
True_features = class_features[1]

# Getting all feature names. get_feature_names() was removed in scikit-learn 1.2,
# so prefer get_feature_names_out() and fall back only on releases that predate it.
if hasattr(tf_idf_vect, 'get_feature_names_out'):
    feature_names = tf_idf_vect.get_feature_names_out()
else:
    feature_names = tf_idf_vect.get_feature_names()

# Sorting 'Fake_features' and 'True_features' in descending order using argsort() function
sorted_Fake_features = np.argsort(Fake_features)[::-1]
sorted_True_features = np.argsort(True_features)[::-1]

print("Top 20 Important Features and their log probabilities For Fake News :\n\n")
for i in list(sorted_Fake_features[0:20]):
    print("%s\t -->\t%f  "%(feature_names[i],Fake_features[i]))

print("\n\nTop 20 Important Features and their log probabilities For true news :\n\n")
for i in list(sorted_True_features[0:20]):
    print("%s\t -->\t%f  "%(feature_names[i],True_features[i]))

In [None]:
%time
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint as sp_randint

depths=[1,5,50,100]
estimators=[1,5,50,100]
clf = RandomForestClassifier()

params = {'max_depth' : depths,
          'n_estimators':estimators  
          }

grid = GridSearchCV(estimator = clf,param_grid=params ,cv = 2,n_jobs = 3,scoring='roc_auc')
grid.fit(X_train, Y_train)
print("best depth = ", grid.best_params_)
print("AUC value on train data = ", grid.best_score_*100)
a1 = grid.best_params_

In [None]:
# Pull the tuned depth and number of trees out of the best-parameter dict from the grid search.
optimal_depth1 = a1.get('max_depth')
optimal_bases1 = a1.get('n_estimators')

In [None]:
# Refit a random forest with the tuned parameters on the full training matrix.
# The notebook-level `seed` is passed as random_state so the fitted forest, and
# every metric derived from it, is repeatable across runs.
clf = RandomForestClassifier(max_depth=optimal_depth1,n_estimators=optimal_bases1,random_state=seed)

clf.fit(X_train,Y_train)

# Predicted labels for the held-out test matrix
pred = clf.predict(X_test)

In [None]:
# Code for drawing seaborn heatmaps
class_names = ['Fake','True']
# confusion_matrix(y_true, y_pred): rows are the true labels, columns the predictions
df_heatmap = pd.DataFrame(confusion_matrix(Y_test, pred), index=class_names, columns=class_names )
fig = plt.figure(figsize=(10,7))
heatmap = sb.heatmap(df_heatmap, annot=True, fmt="d")

# Setting tick labels for heatmap
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)
heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)
# The y axis follows the frame's rows (true labels) and the x axis its columns (predictions)
plt.ylabel('True label',size=18)
plt.xlabel('Predicted label',size=18)
plt.title("Confusion Matrix\n",size=24)
plt.show()

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

# Score the test-set predictions; every metric is expressed as a percentage.
acc1, pre1, rec1, f11 = (
    metric(Y_test, pred) * 100
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Print each metric on its own line, preceded by a blank line.
for template, value in (
    ('\nAccuracy=%f%%', acc1),
    ('\nprecision=%f%%', pre1),
    ('\nrecall=%f%%', rec1),
    ('\nF1-Score=%f%%', f11),
):
    print(template % (value))

In [None]:
# Calculate feature importances from decision trees
importances = clf.feature_importances_

# Indices of the (at most) 25 most important features, most important first
top_n = 25
indices = np.argsort(importances)[::-1][:top_n]

# Feature names in the column order of the TF-IDF matrix. get_feature_names()
# was removed in scikit-learn 1.2, so prefer get_feature_names_out() and fall
# back only on releases that predate it.
if hasattr(tf_idf_vect, 'get_feature_names_out'):
    names = tf_idf_vect.get_feature_names_out()
else:
    names = tf_idf_vect.get_feature_names()
# An array allows the names to be picked with the index array above
names = np.array(names)

sb.set(rc={'figure.figsize':(11.7,8.27)})

# Create plot
plt.figure()

# Create plot title
plt.title("Feature Importance")

# Add bars; positions follow the number of selected features, so the plot
# still works when fewer than top_n features exist
positions = range(len(indices))
plt.bar(positions, importances[indices])

# Add feature names as x-axis labels
plt.xticks(positions, names[indices], rotation=90)

# Show plot
plt.show()

In [None]:
# Names of the top-ranked features selected by `indices`.
# NOTE(review): despite its name, `df` is an array of feature names, not a DataFrame.
df=names[indices]
print(df)

In [None]:
from wordcloud import WordCloud

# Join the feature names into plain space-separated text. str(df) would pass
# the array's printed form (brackets and quote characters) to the word cloud.
wordcloud = WordCloud(width = 800, height = 600,background_color ='white').generate(' '.join(df))
plt.imshow(wordcloud)
plt.title("Frequent words")
plt.show()