In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS, TfidfVectorizer
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize, regexp_tokenize
from nltk.corpus import stopwords
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Preprocessing Steps

1. Prepare dataset: add a `class` label indicating whether each article is fake (1) or real (0)

2. Tokenization to create a bag of words: keep only lowercase alphabetic words

3. Lemmatization/Stemming: reduce words to their base or root form

4. Removing stopwords

## Prepare dataset

In [2]:
# Load each corpus and attach its target label: 1 = fake news, 0 = real news.
fake = pd.read_csv('Fake.csv')
fake['class'] = 1
real = pd.read_csv('True.csv')
real['class'] = 0
# Stack the fake rows on top of the real rows. reset_index() (without drop)
# keeps each frame's original row position as an extra `index` column.
data = pd.concat([fake, real]).reset_index()

In [3]:
# Preview the first five rows of the combined dataset (head() defaults to n=5).
data.head()

Unnamed: 0,index,title,text,subject,date,class
0,0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


## Train Test split

In [4]:
# Hold out 33% of the articles for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['class'],
    test_size=0.33,
    random_state=7406,
)

## Preprocessing

1. lowercase
2. keep only words; remove numbers

In [5]:
def preprocessing(text):
    """Normalise one raw document before it is tokenised.

    The text is lower-cased and every run of digits is deleted (replaced by
    the empty string), so the characters on either side of a number end up
    adjacent.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The lower-cased text with all digit runs removed.
    """
    return re.sub(r'\d+', '', text.lower())

### Word Vectorization using count

In [6]:
# Stop-word list shared by the vectorizers. scikit-learn documents `stop_words`
# as 'english', a list, or None, and newer releases validate the type strictly,
# so the built-in frozenset is materialised as a (sorted) list.
stop_words = sorted(ENGLISH_STOP_WORDS)
# Bag-of-words vectorizer limited to 7000 vocabulary terms; `preprocessing`
# (lower-casing + digit removal) replaces the default preprocessor.
count_vectorizer = CountVectorizer(max_features=7000, stop_words=stop_words, preprocessor=preprocessing)
# Learn the vocabulary from the training texts only and build their count matrix.
count_train = count_vectorizer.fit_transform(X_train)

# Encode the test texts with the vocabulary learned from the training texts.
count_test = count_vectorizer.transform(X_test)

In [7]:
# `get_feature_names()` is deprecated in newer scikit-learn in favour of
# `get_feature_names_out()`; use whichever method this installation provides.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    count_feature_names = count_vectorizer.get_feature_names_out()
else:
    count_feature_names = count_vectorizer.get_feature_names()
# Dense copy of the sparse training matrix with one column per vocabulary term.
# NOTE: .toarray() materialises the full matrix in memory.
count_df = pd.DataFrame(count_train.toarray(), columns=count_feature_names)

In [8]:
# Preview the first five rows of the dense count matrix (head() defaults to n=5).
count_df.head()

Unnamed: 0,aaron,abadi,abandon,abandoned,abandoning,abbas,abbott,abc,abdel,abdullah,...,zealand,zero,zika,zimbabwe,zinke,zone,zones,zor,zuckerberg,zuma
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### Word Vectorization using Term Frequency - Inverse Document Frequency

In [9]:
# TF-IDF vectorizer configured like the count vectorizer: same vocabulary cap,
# same stop words and the same custom preprocessor.
tfidf_vectorizer = TfidfVectorizer(
    preprocessor=preprocessing,
    stop_words=stop_words,
    max_features=7000,
)

# Fit on the training texts and produce their TF-IDF matrix.
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

# Encode the test texts with the fitted vocabulary and IDF weights.
tfidf_test = tfidf_vectorizer.transform(X_test)

In [10]:
# `get_feature_names()` is deprecated in newer scikit-learn in favour of
# `get_feature_names_out()`; use whichever method this installation provides.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# Dense copy of the sparse TF-IDF training matrix, one column per vocabulary term.
# NOTE: .toarray() materialises the full matrix in memory.
tfidf_df = pd.DataFrame(tfidf_train.toarray(), columns=tfidf_feature_names)

In [11]:
# Preview the first five rows of the dense TF-IDF matrix (head() defaults to n=5).
tfidf_df.head()

Unnamed: 0,aaron,abadi,abandon,abandoned,abandoning,abbas,abbott,abc,abdel,abdullah,...,zealand,zero,zika,zimbabwe,zinke,zone,zones,zor,zuckerberg,zuma
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.161358,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.12795,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042478,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Check whether count vectorization returns the same features as TF-IDF

In [12]:
# Terms that appear in exactly one of the two vocabularies. A one-sided
# difference (count - tfidf) would miss terms that only the TF-IDF vocabulary
# contains, so use the symmetric difference; an empty set means identical features.
difference = set(count_df.columns) ^ set(tfidf_df.columns)
print(difference)

set()


## Dimensionality Reduction & Classifiers on Count

In [16]:
# Reduce the count features to 100 SVD components, then fit logistic regression.
# TruncatedSVD's default solver is randomized, so its seed is fixed (same value
# as the train/test split) to make the reported accuracy repeatable.
lg_pipe = Pipeline([
    ('svd', TruncatedSVD(n_components=100, random_state=7406)),
    ('lg', LogisticRegression(max_iter=500)),
])
lg_pipe.fit(count_train, y_train)
pred_lg_count = lg_pipe.predict(count_test)
# Fraction of test articles classified correctly.
lg_count_acc = accuracy_score(y_test, pred_lg_count)

In [18]:
# Test-set accuracy of the logistic-regression pipeline fitted on the count features.
lg_count_acc

0.9784031855301343

In [21]:
# Reduce the count features to 100 SVD components, then fit a support-vector classifier.
# TruncatedSVD's default solver is randomized, so its seed is fixed (same value
# as the train/test split) to make the reported accuracy repeatable.
svm_pipe = Pipeline([
    ('svd', TruncatedSVD(n_components=100, random_state=7406)),
    ('svm', SVC(random_state=123)),
])
svm_pipe.fit(count_train, y_train)
pred_svm_count = svm_pipe.predict(count_test)
# Fraction of test articles classified correctly.
svm_count_acc = accuracy_score(y_test, pred_svm_count)

In [22]:
# Test-set accuracy of the SVM pipeline fitted on the count features.
svm_count_acc

0.9826550583788891

In [23]:
# Reduce the count features to 100 SVD components, then fit a random forest.
# Both the randomized SVD solver and the forest draw random numbers, so both
# are seeded (same value as the train/test split) to make the accuracy repeatable.
rf_pipe = Pipeline([
    ('svd', TruncatedSVD(n_components=100, random_state=7406)),
    ('rf', RandomForestClassifier(random_state=7406)),
])
rf_pipe.fit(count_train, y_train)
pred_rf_count = rf_pipe.predict(count_test)
# Fraction of test articles classified correctly.
rf_count_acc = accuracy_score(y_test, pred_rf_count)

In [24]:
# Test-set accuracy of the random-forest pipeline fitted on the count features.
rf_count_acc

0.9599784031855302

In [None]:
# Reduce the count features to 100 SVD components, then fit a 50-nearest-neighbours classifier.
# TruncatedSVD's default solver is randomized, so its seed is fixed (same value
# as the train/test split) to make the reported accuracy repeatable.
knn_pipe = Pipeline([
    ('svd', TruncatedSVD(n_components=100, random_state=7406)),
    ('knn', KNeighborsClassifier(n_neighbors=50)),
])
knn_pipe.fit(count_train, y_train)
pred_knn_count = knn_pipe.predict(count_test)
# Fraction of test articles classified correctly.
knn_count_acc = accuracy_score(y_test, pred_knn_count)

In [None]:
# Test-set accuracy of the k-nearest-neighbours pipeline fitted on the count features.
knn_count_acc

## Dimensionality Reduction & Classifiers on tfidf

In [25]:
# Same SVD + logistic-regression pipeline, now fitted on the TF-IDF features.
# NOTE: this rebinds `lg_pipe`, replacing the pipeline fitted on count features.
# TruncatedSVD's default solver is randomized, so its seed is fixed (same value
# as the train/test split) to make the reported accuracy repeatable.
lg_pipe = Pipeline([
    ('svd', TruncatedSVD(n_components=100, random_state=7406)),
    ('lg', LogisticRegression(max_iter=500)),
])
lg_pipe.fit(tfidf_train, y_train)
# Stored under a TF-IDF-specific name so these predictions do not overwrite
# the count-based predictions held in `pred_lg_count`.
pred_lg_tfidf = lg_pipe.predict(tfidf_test)
lg_tfidf_acc = accuracy_score(y_test, pred_lg_tfidf)

In [26]:
# Test-set accuracy of the logistic-regression pipeline fitted on the TF-IDF features.
lg_tfidf_acc

0.9680097185665114

In [27]:
# Same SVD + SVM pipeline, now fitted on the TF-IDF features.
# NOTE: this rebinds `svm_pipe`, replacing the pipeline fitted on count features.
# The SVD seed is fixed (same value as the train/test split) for repeatability,
# and the SVC is seeded like its count-feature counterpart for consistency.
svm_pipe = Pipeline([
    ('svd', TruncatedSVD(n_components=100, random_state=7406)),
    ('svm', SVC(random_state=123)),
])
svm_pipe.fit(tfidf_train, y_train)
# Stored under a TF-IDF-specific name so these predictions do not overwrite
# the count-based predictions held in `pred_svm_count`.
pred_svm_tfidf = svm_pipe.predict(tfidf_test)
svm_tfidf_acc = accuracy_score(y_test, pred_svm_tfidf)

In [28]:
# Test-set accuracy of the SVM pipeline fitted on the TF-IDF features.
svm_tfidf_acc

0.9780657353040426

In [29]:
# Same SVD + random-forest pipeline, now fitted on the TF-IDF features.
# NOTE: this rebinds `rf_pipe`, replacing the pipeline fitted on count features.
# Both the randomized SVD solver and the forest draw random numbers, so both
# are seeded (same value as the train/test split) to make the accuracy repeatable.
rf_pipe = Pipeline([
    ('svd', TruncatedSVD(n_components=100, random_state=7406)),
    ('rf', RandomForestClassifier(random_state=7406)),
])
rf_pipe.fit(tfidf_train, y_train)
# Stored under a TF-IDF-specific name so these predictions do not overwrite
# the count-based predictions held in `pred_rf_count`.
pred_rf_tfidf = rf_pipe.predict(tfidf_test)
rf_tfidf_acc = accuracy_score(y_test, pred_rf_tfidf)

In [30]:
# Test-set accuracy of the random-forest pipeline fitted on the TF-IDF features.
rf_tfidf_acc

0.96321792535601

In [34]:
# Accuracy summary: one row per vectorisation scheme, one column per classifier.
model_compare = pd.DataFrame(
    {
        'Logistic Regression': [lg_count_acc, lg_tfidf_acc],
        'SVM': [svm_count_acc, svm_tfidf_acc],
        'Random Forest': [rf_count_acc, rf_tfidf_acc],
    },
    index=['count', 'tfidf'],
)
model_compare

Unnamed: 0,Logistic Regression,SVM,Random Forest
count,0.978403,0.982655,0.959978
tfidf,0.96801,0.978066,0.963218


In [None]:
# Disabled cell: the plotting code is wrapped in a string literal, so nothing is drawn.
# NOTE(review): `nb_classifier` is not defined in any cell shown here — confirm before re-enabling.
'''plot_confusion_matrix(nb_classifier, count_test, y_test, cmap=plt.cm.Blues)
plt.show()'''

In [None]:
# Disabled cell (string literal, not executed): would seed NumPy and draw one
# random article text from the rows labelled class 0 (real news).
'''# test on sample fake and real news
np.random.seed(7406)

sample_text_real = np.random.choice(data.loc[data['class'] == 0]['text'], 1)[0]'''

In [None]:
# Disabled cell (string literal, not executed): would tokenise a sample fake
# article, keep lower-cased alphabetic tokens, drop NLTK English stop words and
# count the remaining tokens.
# NOTE(review): `sample_text_fake` is not assigned in any cell shown here — confirm before re-enabling.
'''sample_text_fake_token = word_tokenize(sample_text_fake)
# Retain alphabetic lower case words: alpha_only
alpha_only_fake = [t.lower() for t in sample_text_fake_token if t.isalpha()]

# Remove all stop words: no_stops
no_stops_fake = [t for t in alpha_only_fake if t not in stopwords.words('english')]
Counter(no_stops_fake)'''