## Data Cleaning

### Imports

In [None]:
import pandas as pd
import numpy as np
import nltk
import os
import re
import nltk.corpus
from nltk import word_tokenize
# Importing Lemmatizer library from nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
#importing chunk library from nltk
#from nltk import ne_chunk
from string import punctuation

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)


### Load the data

In [None]:
true_news = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/True.csv')
fake_news = pd.read_csv('/kaggle/input/fake-and-real-news-dataset/Fake.csv')

In [None]:
true_news['label'] = 'TRUE'
fake_news['label'] = 'FAKE'

In [None]:
true_news.info()
fake_news.info()

In [None]:
# Stack the TRUE and FAKE frames. ignore_index gives the result a fresh 0..n-1 index;
# without it every label would appear twice (once per source frame), which makes
# label-based selection and alignment ambiguous.
df = pd.concat([true_news, fake_news], ignore_index=True)

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
df.describe()

### Removing tags

In [None]:
#function to remove tags
def remove_tags(string):
    """Return ``string`` with every ``<...>`` span deleted.

    The pattern is non-greedy, so each tag is removed on its own and the text
    between two tags is kept. ``.`` does not match a newline, so a ``<`` and
    ``>`` that sit on different lines are left untouched.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', string)
# Strip tags from each of the three text columns (the function is passed
# directly; no wrapper lambda is needed).
df['title'] = df['title'].apply(remove_tags)
df['text'] = df['text'].apply(remove_tags)
df['subject'] = df['subject'].apply(remove_tags)

### Removing number

In [None]:
#remove number
def remove_number(data):
    """Return ``data`` without the characters for which ``str.isnumeric`` is true.

    Besides the ASCII digits this also drops other numeric characters such as
    vulgar fractions and superscript digits.
    """
    kept = []
    for char in data:
        if char.isnumeric():
            continue
        kept.append(char)
    return ''.join(kept)

# Drop numeric characters from each of the three text columns.
df['title'] = df['title'].apply(remove_number)
df['text'] = df['text'].apply(remove_number)
df['subject'] = df['subject'].apply(remove_number)

### Removing contractions

In [None]:
# Lookup table mapping an English contraction (key) to its expanded form (value).
# The keys are joined into the regular expression used by expand_contractions and
# the values are the replacement strings.
# Note that the bare "'s" entry expands every "'s" to " is", possessives included.
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

In [None]:
# remove contractions
# Regex alternation takes the FIRST alternative that matches at a position, so the
# keys are ordered longest-first. In plain dict order "can't" would win over
# "can't've" and leave a stray "'ve" behind. re.escape keeps every key a literal.
contractions_re = re.compile(
    '(%s)' % '|'.join(
        re.escape(key) for key in sorted(contractions_dict, key=len, reverse=True)
    )
)


def expand_contractions(text, contractions_dict=contractions_dict):
    """Replace every contraction found in ``text`` with its expanded form.

    Parameters
    ----------
    text : str
        Text to expand.
    contractions_dict : dict, optional
        Mapping used to look up the replacement. Matching always uses the
        module-level ``contractions_re`` (built from the default table), so a
        custom mapping must contain every key of that table.

    Returns
    -------
    str
        ``text`` with all matches substituted. Matching is case-sensitive.

    Raises
    ------
    KeyError
        If a matched contraction is missing from ``contractions_dict``.
    """
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)


df['title'] = df['title'].apply(expand_contractions)
df['text'] = df['text'].apply(expand_contractions)
df['subject'] = df['subject'].apply(expand_contractions)

### Removing white spaces

In [None]:
#remove white spaces
df['title'] = df['title'].apply(lambda x: re.sub(' +',' ',x))
df['text'] = df['text'].apply(lambda x: re.sub(' +',' ',x))
df['subject'] = df['subject'].apply(lambda x: re.sub(' +',' ',x))

### Lemmatization with stopwords removal and tokenization

In [None]:
# Keep the stop-word collection under its own name: rebinding `stopwords` would
# shadow the NLTK corpus reader imported above, so re-running this cell would fail
# with AttributeError. A frozenset makes each membership test O(1).
english_stopwords = frozenset(stopwords.words('english'))
# One shared lemmatizer instead of constructing a new one for every row.
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Tokenize ``text``, drop English stop words and lemmatize what is left.

    Parameters
    ----------
    text : str
        Raw text of one cell.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order and original case.
    """
    # The text is only lowercased in a later step, so compare the lowercased
    # token; otherwise capitalized stop words ("The", "And") would survive.
    # (Assumes the NLTK list itself is lowercase.)
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token.lower() not in english_stopwords
    ]


df['title'] = df['title'].apply(lemmatize_text)
df['text'] = df['text'].apply(lemmatize_text)
df['subject'] = df['subject'].apply(lemmatize_text)

### Converting to lowercase

In [None]:
#changing to lowercase
# After lemmatization each cell holds a list of tokens. Join the tokens with spaces
# before lowercasing: astype(str) would store the list's repr() instead, and repr()
# escapes non-printable characters (a non-breaking space becomes the four characters
# \xa0), which end up in the cleaned text as junk such as "xa0" once the
# punctuation is stripped.
def _tokens_to_lower_text(value):
    """Return a lowercase string for a token list (or for any other cell value)."""
    if isinstance(value, (list, tuple)):
        return ' '.join(value).lower()
    return str(value).lower()


df['title'] = df['title'].apply(_tokens_to_lower_text)
df['text'] = df['text'].apply(_tokens_to_lower_text)
df['subject'] = df['subject'].apply(_tokens_to_lower_text)

### Removing punctuation

In [None]:
#remove punctuation
def strip_punctuation(s):
    """Return ``s`` without any character listed in ``string.punctuation``."""
    kept = []
    for char in s:
        if char in punctuation:
            continue
        kept.append(char)
    return ''.join(kept)
# Remove punctuation from each of the three text columns, then preview the result.
df['title'] = df['title'].apply(strip_punctuation)
df['text'] = df['text'].apply(strip_punctuation)
df['subject'] = df['subject'].apply(strip_punctuation)
df.head()

In [None]:
df.to_csv('cleaned_data.csv',index=False)

In [None]:
d = pd.read_csv('cleaned_data.csv')
d.info()
df.info()

## Exploratory Data Analysis

### Imports

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize  
import nltk.corpus
import seaborn as sns
from matplotlib import rcParams
from wordcloud import WordCloud
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer, TfidfVectorizer

### Load the cleaned Data<a id='3.4_Load the Data'></a>

In [None]:
# Load the cleaned dataset for the exploratory analysis.
# NOTE(review): the cleaning section writes 'cleaned_data.csv' with a relative path,
# while this reads from an absolute '/output/kaggle/working/' location — confirm
# that both refer to the same file in the target environment.
df = pd.read_csv('/output/kaggle/working/cleaned_data.csv',index_col=None)

### Summary Statistics<a id='3.4.1_Summary Statistics'></a>

In [None]:
df.describe()

### Remove NaN values<a id='3.4.2_Remove_NaN_Values'></a>

In [None]:
df.label.value_counts()

### Map categorical features to numeric values<a id='3.5.1_Map_categorical_features_to_numeric_values'></a>

In [None]:
df['label'] = df['label'].map({'FAKE':0, 'TRUE':1, np.nan:2} )

In [None]:
df = df.dropna()

In [None]:
df.isnull().sum()

### Explore the data<a id='3.5_Explore_the_data'></a>

In [None]:
df.head()

In [None]:
def generate_word_cloud(text):
    """Draw a word cloud for ``text`` and show it with matplotlib.

    Parameters
    ----------
    text : str, pandas.Series or pandas.DataFrame
        Corpus to visualise. For a DataFrame every non-numeric column is used;
        for a Series every value is used; anything else is converted with
        ``str()``.

    Returns
    -------
    None
        The figure is displayed as a side effect.
    """
    # str(DataFrame) / str(Series) is only the truncated console preview (a few
    # rows, shortened cells), so the cloud would be built from that preview and
    # not from the corpus. Concatenate the actual cell contents instead.
    if isinstance(text, pd.DataFrame):
        cells = text.select_dtypes(exclude='number').astype(str).to_numpy().ravel()
        corpus = ' '.join(cells)
    elif isinstance(text, pd.Series):
        corpus = ' '.join(text.astype(str))
    else:
        corpus = str(text)

    wordcloud = WordCloud(
        width = 1000,
        height = 500,
        background_color = 'black').generate(corpus)
    plt.figure(
        figsize = (40, 30),
        facecolor = 'k',
        edgecolor = 'k')
    plt.imshow(wordcloud, interpolation = 'bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()

### WordCloud by True Label<a id='3.5.1_WordCloud_by_True_Label'></a>

In [None]:
df_true = df[df['label'] == 1 ]

generate_word_cloud(df_true)

### WordCloud by Fake Label<a id='3.5.2_WordCloud_by_Fake Label'></a>

In [None]:
df_fake = df[df['label'] == 0 ]

generate_word_cloud(df_fake)

### Histograms of True and Fake labels in the data<a id='3.5.3_Histograms_of_True_and_Fake_labels_in_the_data'></a>

In [None]:
plt.figure(figsize=(7, 7))
sns.set(style="darkgrid")

color = sns.color_palette("Set2")
# Fix the bar order explicitly and derive the tick labels from that same order.
# Taking the labels from df.label.unique() (order of first appearance) can swap the
# names, because the bars themselves are not drawn in that order.
label_names = {0: 'FAKE', 1: 'TRUE', 2: 'MISSING'}  # inverse of the mapping applied to df['label']
order = sorted(df.label.unique())
ax = sns.countplot(x="label", data=df, palette=color, order=order)

ax.set(xticklabels=[label_names.get(code, code) for code in order])

plt.title("Data distribution of fake and true news")
plt.show()

### Pie-chart showing various data sources<a id='3.5.4_Pie-chart_showing_various_data_sources'></a>

In [None]:
# Take sizes and labels from the SAME value_counts() result. unique() returns the
# subjects in order of appearance while value_counts() sorts by frequency, so
# mixing the two attaches the wrong name to each slice.
subject_counts = df.subject.value_counts()
slices = subject_counts.to_list()
label = subject_counts.index.to_list()
# Pull out the three largest slices; sized to the data so that the chart still
# renders when the number of subjects is not exactly eight.
explode = [0.1 if rank < 3 else 0 for rank in range(len(slices))]
plt.pie(slices, labels = label, startangle = 30, shadow = True, explode=explode,autopct='%1.1f%%')
plt.title('Pie Chart of News Subject')
plt.show()

### Number of characters in each sentence<a id='3.5.5_Number_of_characters_in_each_sentence'></a>

Here we will explore the number of characters in each sentence and analyze if there is a difference in the character count for Fake and True news. 

In [None]:
df_true['title'].str.len().hist(bins=10,range=[20,200])
plt.show()

In [None]:
df_fake['title'].str.len().hist(bins=10,range=[20,200])
plt.show()

In [None]:
df_true['text'].str.len().hist(bins=10,range=[0,30000])
plt.show()

In [None]:
df_fake['text'].str.len().hist(bins=10,range=[0,30000])
plt.show()

### Average length of word in each sentence<a id='3.5.6_Average_length_of_word_in_each_sentence'></a>

In [None]:
df_true['title'].str.split().\
   apply(lambda x : [len(i) for i in x]). \
   map(lambda x: np.mean(x)).hist(bins=10, range=[3.0, 10])
plt.show()

In [None]:
###try binning it not based on max min and make the plots similar for comparison
df_fake['title'].str.split().\
   apply(lambda x : [len(i) for i in x]). \
   map(lambda x: np.mean(x)).hist(bins=10, range=[3.0, 10])
plt.show()

In [None]:
df_true['text'].str.split().\
   apply(lambda x : [len(i) for i in x]). \
   map(lambda x: np.mean(x)).hist(bins=10, range=[3.5, 6.5])
plt.show()

In [None]:
df_fake['text'].str.split().\
   apply(lambda x : [len(i) for i in x]). \
   map(lambda x: np.mean(x)).hist(bins=10, range=[3.5, 6.5])
plt.show()

### Visualizing high dimensional data<a id='3.6_Visualizing_high_dimensional_data'></a>

### Creating unigrams from vectorized data<a id='3.6.1_Creating _unigrams_from_vectorized_data'></a>

In [None]:
#top 20 unigrams (TRUE news)
# Set the plotting context before drawing so that it applies to this figure.
sns.set_context("poster")
word_vectorizer = CountVectorizer(ngram_range=(1,1), analyzer='word')
sparse_matrix = word_vectorizer.fit_transform(df_true['text'])
# Column totals computed by scipy in one call; the builtin sum() would add the
# sparse matrix row by row in Python.
frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when present.
get_terms = getattr(word_vectorizer, 'get_feature_names_out', None) or word_vectorizer.get_feature_names
word = pd.DataFrame(frequencies, index=get_terms(), columns=['frequency'])
word = word.nlargest(columns = "frequency", n = 20)
plt.figure(figsize=(12,8))
ax = sns.barplot(data = word, y = word.index, x = "frequency", palette=("Blues_d"))
# The counts are on the x-axis; the y-axis lists the terms.
ax.set(xlabel = "Count", ylabel = "Unigram")
plt.show()

In [None]:
#top 20 unigrams (FAKE news)
# Set the plotting context before drawing so that it applies to this figure.
sns.set_context("poster")
word_vectorizer = CountVectorizer(ngram_range=(1,1), analyzer='word')
sparse_matrix = word_vectorizer.fit_transform(df_fake['text'])
# Column totals computed by scipy in one call; the builtin sum() would add the
# sparse matrix row by row in Python.
frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when present.
get_terms = getattr(word_vectorizer, 'get_feature_names_out', None) or word_vectorizer.get_feature_names
word = pd.DataFrame(frequencies, index=get_terms(), columns=['frequency'])
word = word.nlargest(columns = "frequency", n = 20)
plt.figure(figsize=(12,8))
ax = sns.barplot(data = word, y = word.index, x = "frequency", palette=("Reds_r"))
# The counts are on the x-axis; the y-axis lists the terms.
ax.set(xlabel = "Count", ylabel = "Unigram")
plt.show()

### Creating bigrams from vectorized data<a id='3.6.2_Creating _bigrams_from_vectorized_data'></a>

In [None]:
##top 20 bigrams (TRUE news)
# Set the plotting context before drawing so that it applies to this figure.
sns.set_context("poster")
word_vectorizer = CountVectorizer(ngram_range=(2,2), analyzer='word')
sparse_matrix = word_vectorizer.fit_transform(df_true.text)
# Column totals computed by scipy in one call; the builtin sum() would add the
# sparse matrix row by row in Python.
frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when present.
get_terms = getattr(word_vectorizer, 'get_feature_names_out', None) or word_vectorizer.get_feature_names
word = pd.DataFrame(frequencies, index=get_terms(), columns=['frequency'])
word = word.nlargest(columns = "frequency", n = 20)
plt.figure(figsize=(12,8))
ax = sns.barplot(data = word, y = word.index, x = "frequency", palette=("Blues_d"))
# The counts are on the x-axis; the y-axis lists the terms.
ax.set(xlabel = "Count", ylabel = "Bigram")
plt.show()

In [None]:
##top 20 bigrams (FAKE news)
# Set the plotting context before drawing so that it applies to this figure.
sns.set_context("poster")
word_vectorizer = CountVectorizer(ngram_range=(2,2), analyzer='word')
sparse_matrix = word_vectorizer.fit_transform(df_fake.text)
# Column totals computed by scipy in one call; the builtin sum() would add the
# sparse matrix row by row in Python.
frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when present.
get_terms = getattr(word_vectorizer, 'get_feature_names_out', None) or word_vectorizer.get_feature_names
word = pd.DataFrame(frequencies, index=get_terms(), columns=['frequency'])
word = word.nlargest(columns = "frequency", n = 20)
plt.figure(figsize=(12,8))
ax = sns.barplot(data = word, y = word.index, x = "frequency", palette=("Reds_d"))
# The counts are on the x-axis; the y-axis lists the terms.
ax.set(xlabel = "Count", ylabel = "Bigram")
plt.show()

### Creating trigrams from vectorized data<a id='3.6.3_Creating _trigrams_from_vectorized_data'></a>

In [None]:
##top 20 trigrams (TRUE news)
# Set the plotting context before drawing so that it applies to this figure.
sns.set_context("poster")
word_vectorizer = CountVectorizer(ngram_range=(3,3), analyzer='word')
sparse_matrix = word_vectorizer.fit_transform(df_true.text)
# Column totals computed by scipy in one call; the builtin sum() would add the
# sparse matrix row by row in Python.
frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when present.
get_terms = getattr(word_vectorizer, 'get_feature_names_out', None) or word_vectorizer.get_feature_names
word = pd.DataFrame(frequencies, index=get_terms(), columns=['frequency'])
word = word.nlargest(columns = "frequency", n = 20)
plt.figure(figsize=(12,8))
ax = sns.barplot(data = word, y = word.index, x = "frequency", palette=("Blues_d"))
# The counts are on the x-axis; the y-axis lists the terms.
ax.set(xlabel = "Count", ylabel = "Trigram")
plt.show()

In [None]:
##top 20 trigrams (FAKE news)
# Set the plotting context before drawing so that it applies to this figure.
sns.set_context("poster")
word_vectorizer = CountVectorizer(ngram_range=(3,3), analyzer='word')
sparse_matrix = word_vectorizer.fit_transform(df_fake.text)
# Column totals computed by scipy in one call; the builtin sum() would add the
# sparse matrix row by row in Python.
frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when present.
get_terms = getattr(word_vectorizer, 'get_feature_names_out', None) or word_vectorizer.get_feature_names
word = pd.DataFrame(frequencies, index=get_terms(), columns=['frequency'])
word = word.nlargest(columns = "frequency", n = 20)
plt.figure(figsize=(12,8))
ax = sns.barplot(data = word, y = word.index, x = "frequency", palette=("Reds_d"))
# The counts are on the x-axis; the y-axis lists the terms.
ax.set(xlabel = "Count", ylabel = "Trigram")
plt.show()

### EDA Summary<a id='3.7_Summary'></a>

The dataset has a balanced number of TRUE and FAKE examples. A few things can be observed in the text that distinguish the two labels:
 * The number of characters in 'title' for True labels is mostly between 50 and 100. For Fake labels it varies over a much wider range, suggesting that the data were collected from different sources.
 * The average word length is very similar for both classes.
 
Looking at the n-grams and the word clouds, it is evident that the news mostly contains political content and that President Trump is the most common topic across both labels.

# Pre-Processing and Training Data<a id='4_Pre-Processing_and_Training_Data'></a>

### Imports<a id='4.3_Imports'></a>

In [None]:
import pandas as pd
import numpy as np
from nltk.util import ngrams
import nltk.corpus
from stop_words import get_stop_words

from matplotlib import rcParams
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.impute import SimpleImputer

from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score, precision_score, recall_score, precision_recall_curve
import time
from sklearn import __version__ as sklearn_version
from sklearn.pipeline import make_pipeline
import os, time
from sklearn.pipeline import Pipeline

### Load the cleaned Data<a id='4.4_Load the Data'></a>

In [None]:
# Load the cleaned dataset for pre-processing and model training.
# NOTE(review): the cleaning section writes 'cleaned_data.csv' with a relative path,
# while this reads from an absolute '/output/kaggle/working/' location — confirm
# that both refer to the same file in the target environment.
df = pd.read_csv('/output/kaggle/working/cleaned_data.csv',index_col=None)

In [None]:
df.describe()

### Map categorical features to numeric values<a id='4.4.1_Map_categorical_features_to_numeric_values'></a>

In [None]:
df['label'] = df['label'].map({'FAKE':0, 'TRUE':1, np.nan: 2} )

In [None]:
df.isnull().sum()

### Remove Missing Values<a id='4.4.1_Remove_missing_values'></a>

In [None]:
df = df.dropna()

### Feature Engineering with Count Vectorizer and Tfidf Vectorizer<a id='4.5_Feature_Engineering_with_Count_Vectorizer_and_Tfidf_Vectorizer'></a>

In [None]:
df['allwords'] = df['title']+" "+df['text']
df_all = df[['allwords','label']]

In [None]:
df_all = df_all.drop_duplicates(subset=['allwords', 'label'], keep=False)

In [None]:
df_all.shape

In [None]:
df_all.label.value_counts()

In [None]:
# Down-sample the TRUE class (label 1) to 12530 rows with a fixed seed; every FAKE
# row (label 0) is kept.
# NOTE(review): 12530 is hard-coded — presumably the number of FAKE rows left after
# de-duplication. sample() raises ValueError if fewer TRUE rows are available;
# confirm the value against the value_counts() output.
true = df_all[df_all['label']==1].sample(n=12530, random_state=42)
fake = df_all[df_all['label']==0]

In [None]:
# Recombine the two classes and shuffle the rows. The shuffle is seeded like the
# sampling and the train/test split, so the whole pipeline is reproducible from
# run to run.
df_all = pd.concat([true,fake],ignore_index=True)
df_all = df_all.sample(frac=1, random_state=42).reset_index(drop=True)
df_all.head()

In [None]:
df_all.label.value_counts()

### Train/Test Split<a id='4.6_Train/Test_Split'></a>

In [None]:
y = df_all['label']

In [None]:
# Source-revealing tokens that are removed from every document.
selected_words = ['reuter','reuters','reutersus','image','via']


def f(x):
    """Return ``x`` with every whitespace-separated token found in ``selected_words`` removed.

    The remaining tokens are re-joined with single spaces; the comparison is
    case-sensitive.
    """
    kept = []
    for item in x.split():
        if item in selected_words:
            continue
        kept.append(item)
    return ' '.join(kept)
X = df_all["allwords"].apply(f)

### Stopwords removal<a id='4.6_Stopwords_removal'></a>

In [None]:
# Remove stop words from INSIDE each document. Iterating over X yields whole
# documents, so comparing those against the stop-word list removes nothing, and a
# document that happened to equal a stop word would be dropped, leaving X shorter
# than y. The list is fetched once and held in a frozenset for O(1) lookups.
stop_words = frozenset(get_stop_words('english'))
X = [
    ' '.join(token for token in document.split() if token not in stop_words)
    for document in X
]

In [None]:
X_train, X_test, y_train,y_test= train_test_split(X, y, test_size=0.30,random_state=42,stratify=y)

In [None]:
columns = ['Model Name', 'accuracy','precision','recall','ROC AUC score','run time']
results = pd.DataFrame(columns=columns)

### Models and Metrics<a id='4.7_Models_and_Metrics'></a>

### Metrics<a id='4.7.1_Metrics'></a>

In a classification task there are a number of things we can look at to understand how well a model is performing. Precision, recall, F1 score, and ROC-AUC score are typical metrics for classification problems. `sklearn.metrics` provides many commonly used metrics, including the ones mentioned here.

In [None]:
 def metrics(model_name,y_test,y_pred,run_time=None):
    """Report classification metrics for one model and return them as a one-row frame.

    Prints the classification report, then shows a confusion-matrix heatmap and
    a ROC plot.

    Parameters
    ----------
    model_name : str
        Name used in the plot titles and in the returned row.
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels. ``roc_auc_score`` and ``roc_curve`` receive these
        hard predictions, not probabilities.
    run_time : float, optional
        Training time to store in the 'run time' column; left empty when omitted.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns 'Model Name', 'accuracy', 'precision',
        'recall', 'ROC AUC score' and 'run time', ready to be added to the
        results table. (Previously the scores were computed but nothing was
        returned, so the results table stayed empty.)
    """
    accuracy = accuracy_score(y_test,y_pred)
    roc_auc =roc_auc_score(y_test, y_pred)
    precision = precision_score(y_pred=y_pred, y_true=y_test,zero_division=1)
    recall = recall_score(y_pred=y_pred, y_true=y_test,zero_division=1)

    print(classification_report(y_test, y_pred,zero_division=1))

    cf_matrix = confusion_matrix(y_test, y_pred)
    sns.heatmap(cf_matrix, annot=True,fmt='3', cmap='Blues')
    # The classes are the FAKE/TRUE news labels, not a "user status".
    plt.xlabel('Predicted label',fontsize=12)
    plt.ylabel('True label',fontsize=12)
    plt.title('%s Confusion Matrix' % model_name,fontsize=20)
    plt.show()

    fpr, tpr, threshold = roc_curve(y_test, y_pred)
    plt.plot([0,1], [0,1], 'k--')  # chance diagonal for reference
    plt.plot(fpr, tpr, label=model_name)
    plt.xlabel('False Positive Rate',fontsize=12)
    plt.ylabel('True Positive Rate',fontsize=12)
    plt.title('%s ROC Curve'% model_name,fontsize=20)
    plt.legend(fontsize=12)
    plt.show()

    return pd.DataFrame(
        [[model_name, accuracy, precision, recall, roc_auc, run_time]],
        columns=['Model Name', 'accuracy', 'precision', 'recall', 'ROC AUC score', 'run time'],
    )

### Dummy Model<a id='4.7.2_Dummy_Model'></a>

In [None]:
model_name = 'Dummy Model'
       
pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer='word',lowercase=False,ngram_range=(2,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', DummyClassifier()),
])

parameters = {
    'vect__max_df': (0.25,0.5, 0.75),
    # 'vect__max_features': (None, 5000, 10000, 50000),
}


if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    print(parameters)
    t0 = time.time()
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time.time() - t0))
    print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name])) 

# Score the refitted best estimator on the held-out split and record the outcome.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; None
# entries are skipped by concat.
y_pred = grid_search.predict(X_test)
results = pd.concat([results, metrics(model_name, y_test, y_pred)], ignore_index=True)

### MultiNomial Naive Bayes<a id='4.7.3_MultiNomial_Naive_Bayes'></a>

In [None]:
model_name = 'Naive Bayes'
       
pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer='word',lowercase=False,ngram_range=(2,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

parameters = {
    'vect__max_df': (0.25,0.5, 0.75),
    # 'vect__max_features': (None, 5000, 10000, 50000),
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    print(parameters)
    t0 = time.time()
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time.time() - t0))
    print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name])) 

# Score the refitted best estimator on the held-out split and record the outcome.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; None
# entries are skipped by concat.
y_pred = grid_search.predict(X_test)
results = pd.concat([results, metrics(model_name, y_test, y_pred)], ignore_index=True)

### Logistic Regression<a id='4.7.4_Logistic_Regression'></a>

In [None]:
model_name = 'Logistic Regression'

# Bigram counts -> TF-IDF weights -> logistic regression.
# The grid searches both 'l1' and 'l2' penalties. The default lbfgs solver does not
# support 'l1', so every l1 candidate would fail to fit; liblinear handles both
# penalties for this binary problem.
pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer='word',lowercase=False,ngram_range=(2,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(solver='liblinear')),
])

parameters = {
    'vect__max_df': (0.25, 0.5, 0.75,),
    'clf__penalty': ('l1','l2'),
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    print(parameters)
    t0 = time.time()
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time.time() - t0))
    print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name])) 

# Score the refitted best estimator on the held-out split and record the outcome.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; None
# entries are skipped by concat.
y_pred = grid_search.predict(X_test)
results = pd.concat([results, metrics(model_name, y_test, y_pred)], ignore_index=True)

### Stochastic Gradient Descent<a id='4.7.5_Stochastic_Gradient_Descent'></a>

In [None]:
model_name = 'Stochastic Gradient Descent'

# Bigram counts -> TF-IDF weights -> linear model trained with SGD.
pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer='word',lowercase=False,ngram_range=(2,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss' and 1.3
# removed the old spelling, so pick the name the installed version understands.
sklearn_major_minor = tuple(int(part) for part in sklearn_version.split('.')[:2])
logistic_loss = 'log_loss' if sklearn_major_minor >= (1, 1) else 'log'

parameters = {
    'vect__max_df': (0.25,0.5, 0.75),
    'clf__penalty': ('elasticnet',),
    'clf__loss': (logistic_loss,)
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    print(parameters)
    t0 = time.time()
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time.time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name])) 

# Score the refitted best estimator on the held-out split and record the outcome.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; None
# entries are skipped by concat.
y_pred = grid_search.predict(X_test)
results = pd.concat([results, metrics(model_name, y_test, y_pred)], ignore_index=True)

### Random Forest Classifier<a id='4.7.6_Random_Forest_Classifier'></a>

In [None]:
model_name = 'Random Forest'

# Bigram counts -> TF-IDF weights -> random forest.
# The forest is seeded like the sampling and the train/test split so that the
# saved model and its feature importances are reproducible.
pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer='word',lowercase=False, ngram_range=(2,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(random_state=42)),
])

parameters = {
    'vect__max_df': (0.25,0.5, 0.75),
    'clf__criterion': ('gini','entropy'),
    # For a classifier 'auto' means the same as 'sqrt' (and is rejected by newer
    # scikit-learn releases), so searching both only repeats identical fits.
    # 'log2' makes the second candidate a genuinely different setting.
    'clf__max_features': ('sqrt', 'log2'),
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    print(parameters)
    t0 = time.time()
    grid_search.fit(X_train, y_train)
    print("done in %0.3fs" % (time.time() - t0))
    print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name])) 

# Score the refitted best estimator on the held-out split and record the outcome.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; None
# entries are skipped by concat.
y_pred = grid_search.predict(X_test)
results = pd.concat([results, metrics(model_name, y_test, y_pred)], ignore_index=True)

### Data quantity assessment<a id='4.9_Data_quantity_assessment'></a>

In [None]:
fractions = [0.25, 0.35, 0.5, 0.75, 1.0]
train_size, train_scores, test_scores = learning_curve(grid_search.best_estimator_, X_train, y_train, train_sizes=fractions)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

In [None]:
plt.subplots(figsize=(10, 5))
plt.errorbar(train_size, test_scores_mean, yerr=test_scores_std)
plt.xlabel('Training set size')
plt.ylabel('CV scores')
plt.title('Cross-validation score as training set size increases');

### Save best model object from pipeline<a id='4.10_Save_best_model_object_from_pipeline'></a>

In [None]:
#import sklearn.externals.joblib as extjoblib
import joblib
import pickle
joblib.dump(grid_search.best_estimator_, 'best_model_version1.2.pkl', compress = 1)

### Feature Importance

In [None]:
pipeline = joblib.load('best_model_version1.2.pkl')

In [None]:
imps = pipeline.named_steps['clf'].feature_importances_
fi = {'Importance':imps}
Importance = pd.DataFrame(fi,index=None).sort_values('Importance',ascending=False).head(25)

In [None]:
# Vocabulary terms of the fitted vectorizer, indexable by feature position.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it.
vectorizer = pipeline.named_steps['vect']
a = (getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names)()

In [None]:
index = []
for i in Importance.index:
    term = a[i]
    index.append(term)

In [None]:
terms = pd.DataFrame({'Term': index, 'Position': Importance.index,'Importance': Importance.Importance})

In [None]:
# Horizontal bar chart of the most important terms.
plt.figure(figsize=(15,8))
_ = sns.barplot(y = 'Term', x='Importance',data=terms)
# Importance is plotted on the x-axis and the terms on the y-axis.
plt.xlabel('importance')
plt.ylabel('features')
# The fitted estimator is a classifier, not a regressor.
plt.title('Best random forest classifier feature importances');







### Model Summary<a id='4.11_Summary'></a>

From the 4 models it is observed that, with the exception of Naive Bayes, all of them have very few misclassifications. The ROC-AUC score is 0.964, which is exceptionally good for text classification.
    
   * The top three models are Logistic Regression, Stochastic Gradient Descent and Random Forest

   * Random Forest, SGD and Logistic Regression have comparable ROC-AUC scores. Although Logistic Regression has the highest ROC-AUC score, its best configuration used max_df = 0.25. This means the model ignored terms whose document frequency is strictly higher than the given threshold, i.e. terms that appear in more than 25% of the documents. However, Random Forest is more scalable and interpretable, and it also performs better with noisy data. Considering these, we chose Random Forest as the best model.

   * Feature importance of the model shows that 'President Donald Trump', 'Washington' and 'President Obama' were given the highest importance. This is in line with the fact that this dataset is indeed a representation of the news during the 2016 US Presidential Election.

Although the model performance is very good, looking at both the feature importance of the model and the n-gram analysis from the EDA, it can be inferred that the data is biased towards the US Presidential Election, and any news outside this scope might be difficult for the model to predict. We need a bigger dataset covering a wide range of news for both TRUE and FAKE labels to build a more general model.