In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
import nltk
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

In [None]:
# Load the two source CSVs: Fake.csv into df_fake and True.csv into df_true.
df_fake, df_true = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/fake-and-real-news-dataset/Fake.csv',
        '../input/fake-and-real-news-dataset/True.csv',
    )
)

In [None]:
# Preview the first rows of the frame loaded from Fake.csv.
df_fake.head()

In [None]:
# Preview the first rows of the frame loaded from True.csv.
df_true.head()

In [None]:
# Plot df_acc1.mean_test_score against param_C.
# Neither `param_C` nor `df_acc1` is created anywhere above this cell in the
# notebook as reviewed, so on a fresh kernel (Restart & Run All) the cell
# raised NameError and halted the run. Guard on their presence so the rest of
# the notebook still executes; the plot is drawn whenever both names exist.
if 'param_C' in globals() and 'df_acc1' in globals():
    plt.plot(param_C, df_acc1.mean_test_score.astype(float), marker='o')
    Title = 'Model Accuracy vs C Parameter with l1 Regulation'
    Xlab = "log10 of C"
    Ylab = "Accuracy"
    plt.title(Title)
    plt.xlabel(Xlab)
    plt.ylabel(Ylab)
    plt.show()
else:
    print('Skipping accuracy-vs-C plot: param_C / df_acc1 are not defined.')

In [None]:
# Tag each source frame with its class: 0 for rows from True.csv, 1 for rows from Fake.csv.
df_true['label'], df_fake['label'] = 0, 1

In [None]:
# Stack the real and fake frames row-wise into a single dataset.
df = pd.concat(objs=(df_true, df_fake), axis='index')

In [None]:
# Shuffle the rows (the concat above puts every real article before every fake
# one) and renumber the index from 0.
# A fixed random_state makes the ordering reproducible, so positional look-ups
# such as processed_text[1] and the contents of the train/test split are the
# same on every Restart & Run All. 12 is the seed value this notebook already
# uses for the train/test split.
df = df.sample(frac=1, random_state=12).reset_index(drop=True)

In [None]:
# Preview the combined, shuffled dataset.
df.head()

In [None]:
# Preparing the target and predictors for modeling
# Keep the title and body text separated for different models
#X_body_text = df['clean_text'].values
#X_title_text = df['clean_title'].values
#y = df['label'].values

In [None]:
# Distinct values of the 'subject' column together with how often each occurs.
np.unique(df['subject'],return_counts = True)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Switch seaborn to its "ticks" style for the plots drawn after this cell.
sns.set(style="ticks", color_codes=True)

In [None]:
# Default figure size and font family for the plots that follow.
plt.rcParams.update({
    'figure.figsize': (14.0, 6.0),
    'font.family': "serif",
})

In [None]:
# Bar chart of article counts per subject over the combined dataset.
news_cat_count = sns.countplot(x='subject', data=df)
news_cat_count.set_ylabel("Count", fontsize=12)
news_cat_count.tick_params(labelsize=12)

In [None]:
# Subject distribution of the articles loaded from True.csv.
plt.rcParams.update({
    'figure.figsize': (8.0, 4.0),
    'font.family': "serif",
})
news_true_cat_count = sns.countplot(x='subject', data=df_true)

In [None]:
# Subject distribution of the articles loaded from Fake.csv.
plt.rcParams.update({
    'figure.figsize': (9.0, 4.0),
    'font.family': "serif",
})
news_fake_cat_count = sns.countplot(x='subject', data=df_fake)

In [None]:
# Number of non-null titles per class label.
df.groupby('label')[['title']].count()

In [None]:
# Character length of every article body.
df['text_len'] = df['text'].map(len)

In [None]:
# Histogram of article character lengths over the whole dataset.
df.hist(column='text_len',bins=50,figsize=(8,5),grid=False)

In [None]:
# Same histogram, one panel per class label (0 = True.csv rows, 1 = Fake.csv rows).
df.hist(column='text_len',by='label',bins=50,figsize=(14,4))

In [None]:
import re

# Word count per article: the number of maximal runs of word characters (\w+).
df['num_words'] = [len(re.findall(r'\w+', article)) for article in df['text']]

In [None]:
# The 1000 largest word counts, in descending order.
df['num_words'].sort_values(ascending=False).head(1000)

In [None]:
#df.sort_values(by='num_words',ascending = False)
df[df['label']==0].hist(column='num_words',bins=50,figsize=(8,6),xlabelsize=12, ylabelsize=12)
plt.title('Real News', fontdict=None, loc='center', pad=None, fontsize=16)
plt.xlabel("Number of Words", fontsize=14)
plt.ylabel("Number of Articles",fontsize=14)
plt.xlim([0,2000])
plt.ylim([0,6000])

In [None]:
#df.sort_values(by='num_words',ascending = False)
df[df['label']==1].hist(column='num_words',bins=80,figsize=(8,6),xlabelsize=12, ylabelsize=12)
plt.title('Fake News', fontdict=None, loc='center', pad=None, fontsize=16)
plt.xlabel("Number of Words", fontsize=14)
plt.ylabel("Number of Articles",fontsize=14)
plt.xlim([0,2000])
plt.ylim([0,6000])

In [None]:
# Work on a copy so the text-processing steps below leave `df` untouched.
df2 = df.copy()

In [None]:
# Preview the working copy.
df2.head()

In [None]:
# Raw article bodies; the input to the cleaning pipeline below.
news_text = df2['text']

In [None]:
# Show the raw article stored under index label 1 as a reference sample.
print(news_text[1])

In [None]:
#use regular expression to replace some specific text
# replace email address
# Normalise the raw article text with a series of regex substitutions.
# Every pattern here is a regular expression, so regex=True is passed
# explicitly: from pandas 2.0 on, Series.str.replace treats the pattern as a
# literal string by default, which would silently turn each step into a no-op.

# Replace e-mail addresses with a placeholder token. The pattern is not
# anchored with ^...$: anchored, it could only match when an entire article
# consisted of a single address, so addresses inside the text were never hit.
processed_text = news_text.str.replace(
    r'([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)',
    'email_address',
    regex=True,
)

# Optional steps, currently disabled:
#   replace 10 digit phone number
#   processed_text = processed_text.str.replace(r'^[^0-9]*(?:(\d)[^0-9]*){10}$', 'phone_number', regex=True)
#   replace normal number with numbr
#   processed_text = processed_text.str.replace(r'\d+(\.\d+)?', 'numbr', regex=True)

# Replace every character that is neither a word character nor whitespace
# (i.e. punctuation and symbols) with a space.
processed_text = processed_text.str.replace(r'[^\w\d\s]', ' ', regex=True)

# Collapse each run of whitespace into a single space.
processed_text = processed_text.str.replace(r'\s+', ' ', regex=True)

# Remove leading and trailing whitespace.
processed_text = processed_text.str.strip()

In [None]:
# Same sample article after the regex clean-up.
print(processed_text[1])

In [None]:
# Lower-case every article.
processed_text = processed_text.str.lower()

In [None]:
# Sample article after lower-casing.
print(processed_text[1])

In [None]:
# Drop English stop words (NLTK's list) from every article.
# The NLTK 'stopwords' corpus must be available; fetch it once with
# nltk.download('stopwords') if it is not.
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
processed_text = processed_text.map(
    lambda doc: ' '.join([tok for tok in doc.split() if tok not in stop_words])
)

In [None]:
# Sample article after stop-word removal.
print(processed_text[1])

In [None]:
# Keep a handle on the unstemmed text; it is used later for the word counts
# and word clouds. `processed_text` is rebound (not mutated) by the stemming
# step, so this reference keeps pointing at the unstemmed Series.
text_before_stemming  = processed_text

In [None]:
# Reduce every token to its Porter stem.
porter_stemmer = nltk.PorterStemmer()
processed_text = processed_text.map(
    lambda doc: ' '.join(map(porter_stemmer.stem, doc.split()))
)

In [None]:
# Sample article after stemming.
print(processed_text[1])

In [None]:
# Size of the processed corpus, followed by a preview of its first entries.
print(processed_text.shape)
processed_text.head()

In [None]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

# Frequency distribution over every token of every processed article.
all_words = nltk.FreqDist(
    token
    for article in processed_text
    for token in word_tokenize(article)
)

In [None]:
#word_tokenize(processed_text[1])

In [None]:
# Frequency plot of the 20 most frequent tokens (non-cumulative).
all_words.plot(20,cumulative=False)

In [None]:
#print the total number of words and the 20 most common words
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(20)))

In [None]:
# Use the 2000 most frequent tokens as the feature vocabulary.
# FreqDist is a Counter subclass, so keys() come back in first-seen order, not
# by frequency; slicing keys() would pick whichever 2000 tokens happened to
# appear first in the corpus. most_common() returns (token, count) pairs
# sorted by descending count.
word_features = [word for word, _count in all_words.most_common(2000)]

In [None]:
def find_features(news):
    """Build the presence/absence feature dict for one article.

    Parameters
    ----------
    news : str
        Article text; it is tokenised with ``word_tokenize``.

    Returns
    -------
    dict
        Maps every entry of the module-level ``word_features`` list to ``True``
        if that word occurs among the article's tokens, otherwise ``False``.
    """
    # A set gives constant-time membership tests; a list of tokens would be
    # rescanned once for each of the feature words.
    words = set(word_tokenize(news))
    return {word: (word in words) for word in word_features}

In [None]:
# The processed sample article that is fed to find_features() below.
processed_text[1]

In [None]:
# List the feature words that actually occur in the sample article.
features = find_features(processed_text[1])
for token, present in features.items():
    if present:
        print (token)

In [None]:
# Display the full feature dict for the sample (one entry per feature word).
features

In [None]:
# Target vector: the 'label' column (0 = True.csv rows, 1 = Fake.csv rows).
y = df2['label']

In [None]:
# Pair every processed article with its label.
news = list(zip(processed_text, y))

# Seed NumPy's global RNG for reproducibility. This has to be a *call*:
# assigning to np.random.seed replaces the function with an int, seeds
# nothing, and breaks any later np.random.seed(...) call in the session.
seed = 12
np.random.seed(seed)
#np.random.shuffle(news)

# Build the (feature dict, label) pair for each article.
featuresets = [(find_features(text), label) for (text, label) in news]

In [None]:
from sklearn import model_selection

# Hold out 25% of the feature sets for testing; the split is seeded with `seed`.
training, testing = model_selection.train_test_split(
    featuresets,
    test_size=0.25,
    random_state=seed,
)

In [None]:
# Shape of the feature-set list as NumPy sees it (the list is converted to an
# array to obtain the shape).
print(np.shape(featuresets))
# Uncomment to inspect a single (feature dict, label) entry:
#featuresets[3]

In [None]:
# Report (rows, fields per row) for `news` without going through NumPy.
# np.shape() first converts the list of (text, label) tuples into an array, and
# NumPy stores such data as fixed-width unicode sized for the longest article
# in every cell, which can take gigabytes for a corpus of long texts.
(len(news), len(news[0]) if news else 0)

In [None]:
# Two-column frame of (processed text, label) pairs.
df_news = pd.DataFrame(news, columns=['text', 'label'])

In [None]:
# Preview the (text, label) frame.
df_news.head()

## Scikit-learn Classifiers with NLTK

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
# Candidate scikit-learn models to train, as (display name, estimator) pairs.
models = [
    ('KNeighbors', KNeighborsClassifier()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Logistic Regression', LogisticRegression()),
    ('SGD Classifier', SGDClassifier(max_iter=100)),
    ('Naive Bayes', MultinomialNB()),
    ('SVM Linear', SVC(kernel='linear')),
]
# Parallel views of the same pairs, kept under their original names.
model_names = [name for name, _ in models]
sklearn_classifiers = [estimator for _, estimator in models]

print(models)

In [None]:
import time

from nltk.classify.scikitlearn import SklearnClassifier

# Wrap each scikit-learn estimator in NLTK's adapter, train it on the training
# feature sets, and report test accuracy (in percent) plus wall-clock time.
for model_name, model in models:
    start = time.time()

    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = 100 * nltk.classify.accuracy(nltk_model, testing)
    print ('{}: Accuracy: {}'.format(model_name, accuracy))

    stop = time.time()
    elapsed = stop - start
    print("Model run time: {}s".format(elapsed))

In [None]:
# Ensemble method: hard-voting classifier over six base estimators.
from sklearn.ensemble import VotingClassifier

# (display name, estimator) pairs handed to the VotingClassifier.
models = [
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Logistic Regression', LogisticRegression()),
    ('SGD Classifier', SGDClassifier(max_iter=100)),
    ('Naive Bayes', MultinomialNB()),
    ('SVM Linear', SVC(kernel='linear')),
]
# Parallel views of the same pairs, kept under their original names.
model_names = [name for name, _ in models]
sklearn_classifiers = [estimator for _, estimator in models]

nltk_ensemble = SklearnClassifier(
    VotingClassifier(estimators=models, voting='hard', n_jobs=4)
)
nltk_ensemble.train(training)
accuracy = 100 * nltk.classify.accuracy(nltk_ensemble, testing)
print('Ensemble Method Accuracy: {}'.format(accuracy))

In [None]:
# Predict class labels for the testing set with the ensemble.
# zip(*testing) splits the (feature dict, label) pairs into two parallel tuples.
text_features, labels = zip(*testing)

prediction = nltk_ensemble.classify_many(text_features)

In [None]:
# Print a classification report and show the confusion matrix.
print(classification_report(labels, prediction))

# confusion_matrix() orders rows (actual) and columns (predicted) by the given
# label order, here 0 then 1. In this notebook 0 marks real and 1 marks fake
# articles, so the axes are named by class; the earlier 'Positive'/'Negative'
# captions put 'Positive' on label 0. Passing labels=[0, 1] also keeps the
# matrix 2x2 if one class happens to be absent.
class_names = ['Real (0)', 'Fake (1)']
pd.DataFrame(confusion_matrix(labels, prediction, labels=[0, 1]),
            index=[['actual', 'actual'], class_names],
            columns=[['predicted', 'predicted'], class_names])

### Word Counts and Wordcloud

In [None]:
# Pair the unstemmed (cleaned, lower-cased, stop-word-free) text with its label.
y = df2['label']
news_before_stemming = list(zip(text_before_stemming, y))

In [None]:
# Rebinds `df_news`: from here on it holds the unstemmed text, not the stemmed
# text it was built from earlier in the notebook.
df_news = pd.DataFrame(news_before_stemming, columns = ['text','label'])

In [None]:
# Rows labelled 1 (fake).
fake_news = df_news[df_news['label']==1]

In [None]:
# Preview the fake-news subset.
fake_news.head()

In [None]:
# Make sure NLTK's 'punkt' resource is available before tokenising below.
# (nltk is already imported at the top of the notebook; repeating it is harmless.)
import nltk
nltk.download('punkt')

In [None]:
# Tokenise the whole fake-news corpus as one space-joined string.
fake_news_words = nltk.word_tokenize(" ".join(fake_news['text']))

In [None]:
# Token frequencies for fake news; print the 50 most frequent.
fake_counter = Counter(fake_news_words)
print(fake_counter.most_common(50))

In [None]:
# Render a word cloud of the fake-news tokens on a black 8x8-inch figure.
fake_wordcloud = WordCloud(
    width=800,
    height=800,
    random_state=42,
).generate(" ".join(fake_news_words))

fig = plt.figure(figsize=(8, 8), facecolor='k')
plt.imshow(fake_wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Adjacent token pairs over the fake-news token list.
# NOTE: nltk.bigrams() yields lazily, so `fake_bigrams` can be iterated only
# once; re-run this cell before iterating it again. Pairs are formed over the
# concatenated corpus, so some of them straddle article boundaries.
fake_bigrams = nltk.bigrams(fake_news_words)

In [None]:
# Count adjacent token pairs in the fake-news corpus and print the 10 most
# frequent. nltk.bigrams() returns a one-shot generator, so a fresh one is
# built here instead of consuming `fake_bigrams`: that generator is exhausted
# after its first use, which made a re-run of this cell print an empty list.
# NOTE: this rebinds `fake_counter`, which held single-token counts before.
fake_counter = Counter(nltk.bigrams(fake_news_words))
print(fake_counter.most_common(10))

In [None]:
# Rows labelled 0 (real news), followed by a preview.
true_news = df_news[df_news['label']==0]
true_news.head()

In [None]:
# Tokenise the whole real-news corpus as one space-joined string.
true_news_words = nltk.word_tokenize(" ".join(true_news['text']))

In [None]:
# Token frequencies for real news; print the 50 most frequent.
true_news_counter = Counter(true_news_words)
print(true_news_counter.most_common(50))

In [None]:
# Render a word cloud of the real-news tokens on a black 8x8-inch figure.
true_news_wordcloud = WordCloud(
    width=800,
    height=800,
    random_state=42,
).generate(" ".join(true_news_words))

fig = plt.figure(figsize=(8, 8), facecolor='k')
plt.imshow(true_news_wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Adjacent token pairs over the real-news token list.
# NOTE: nltk.bigrams() yields lazily, so `true_news_bigrams` can be iterated
# only once; re-run this cell before iterating it again.
true_news_bigrams = nltk.bigrams(true_news_words)

In [None]:
# Count adjacent token pairs in the real-news corpus and print the 10 most
# frequent. nltk.bigrams() returns a one-shot generator, so a fresh one is
# built here instead of consuming `true_news_bigrams`: that generator is
# exhausted after its first use, which made a re-run of this cell print an
# empty list.
# NOTE: this rebinds `true_news_counter`, which held single-token counts before.
true_news_counter = Counter(nltk.bigrams(true_news_words))
print(true_news_counter.most_common(10))