In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

# Preprocessing Steps

1. Prepare dataset: add a `class` column indicating whether each article is fake news (1) or real news (0)

2. Tokenization to create a bag of words: retain only alphabetic words, converted to lower case

3. Lemmatization/Stemming: reduce words to their base (root) form

4. Removing stopwords

## Prepare dataset

In [2]:
# Load each source file and tag every row with its label: 1 = fake, 0 = real.
fake = pd.read_csv('Fake.csv').assign(**{'class': 1})
real = pd.read_csv('True.csv').assign(**{'class': 0})

# Stack fake on top of real, then renumber the rows 0..n-1.
# NOTE(review): reset_index() without drop=True keeps each file's original
# row number as an extra 'index' column -- confirm that column is wanted.
stacked = pd.concat([fake, real], axis=0)
data = stacked.reset_index()

In [3]:
# Preview the first five rows of the combined frame (head() defaults to n=5).
data.head()

Unnamed: 0,index,title,text,subject,date,class
0,0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [24]:
# Draw one fake and one real article to experiment on.
# The seed is fixed so the same two articles are picked on every run;
# the fake article is drawn first, then the real one.
np.random.seed(7406)
fake_texts = data.loc[data['class'] == 1, 'text']
real_texts = data.loc[data['class'] == 0, 'text']
sample_text_fake = np.random.choice(fake_texts, 1)[0]
sample_text_real = np.random.choice(real_texts, 1)[0]

In [7]:
# Display the text of the sampled fake-news article.
sample_text_fake

'The contrast between the Obama White House and the Trump White House seems to grow more and more stark every single day. While you d struggle to find a real scandal during President Obama s entire two terms, it would be nearly impossible to count the number of scandals currently plaguing the Trump White House.More than that, though, it seems that the staff has given up trying to be professional. During a conference call on Monday, about one of the most pressing issues in international relations, Iran, staffers were heard in the background chatting about things like lesbian blowup dolls. I kid you not.Two reporters, Tara McKelvey of the BBC and Jackie Alemany of CBS both tweeted about the background chatter.During the briefing call on Iran, you could hear an ad in the background: a man seemed to be saying  inflatable dolls  and  a lesbian.  Tara McKelvey (@Tara_Mckelvey) July 18, 2017I may never know what was really said during that call or if I heard things wrong. But I ll certainly r

In [8]:
# Display the text of the sampled real-news article.
sample_text_real

'KIGALI (Reuters) - A critic of Rwandan President Paul Kagame will appear in court on Friday charged with inciting insurrection and forgery, the prosecutor s office said on Thursday.  Diane Shima Rwigara, a 35-year-old accountant, is the latest political opponent of Kagame to face criminal charges. She was barred from contesting the presidency in an August election he won with 98.8 percent of the vote.  She has repeatedly accused him of stifling dissent and criticized his Rwandan Patriotic Front s tight grip on the country since it fought its way to power to end a genocide that killed more than 800,000 people in 1994.  The Court will examine the serious grounds justifying provisional detention of the suspects,  the prosecutor s office said on Twitter. The office noted that the other two suspects to appear in court are Rwigara s mother Adeline and sister Anne.    The three women have been in detention for around two weeks. They were first taken from their home in the Rwandan capital on 

In [15]:
# Split the sampled fake article into a list of tokens with NLTK's word_tokenize.
sample_text_fake_token = word_tokenize(sample_text_fake)

In [21]:
# Retain purely alphabetic tokens, converted to lower case: alpha_only_fake
# (tokens containing digits or punctuation are dropped by isalpha()).
alpha_only_fake = [t.lower() for t in sample_text_fake_token if t.isalpha()]

# Build the stop-word lookup once, as a set. Calling stopwords.words('english')
# inside the comprehension would rebuild the list for every token and then
# scan it linearly; a set gives the same result with O(1) membership tests.
english_stops = set(stopwords.words('english'))

# Remove all stop words: no_stops_fake
no_stops_fake = [t for t in alpha_only_fake if t not in english_stops]

In [22]:
# Tally how many times each remaining token occurs in the sampled fake article.
Counter(no_stops_fake)

Counter({'contrast': 1,
         'obama': 2,
         'white': 3,
         'house': 2,
         'trump': 3,
         'seems': 2,
         'grow': 1,
         'stark': 1,
         'every': 1,
         'single': 1,
         'day': 1,
         'struggle': 1,
         'find': 1,
         'real': 1,
         'scandal': 1,
         'president': 1,
         'entire': 1,
         'two': 1,
         'terms': 1,
         'would': 1,
         'nearly': 1,
         'impossible': 1,
         'count': 1,
         'number': 1,
         'scandals': 1,
         'currently': 1,
         'plaguing': 1,
         'though': 1,
         'staff': 2,
         'given': 1,
         'trying': 1,
         'professional': 1,
         'conference': 4,
         'call': 5,
         'monday': 1,
         'one': 1,
         'pressing': 1,
         'issues': 1,
         'international': 1,
         'relations': 1,
         'iran': 3,
         'staffers': 2,
         'heard': 2,
         'background': 3,
         'chattin

In [28]:
# Lower-case every article body and write it back over the 'text' column.
lowered_text = data['text'].str.lower()
data['text'] = lowered_text

In [29]:
# Inspect the combined frame after the 'text' column has been lower-cased.
data

Unnamed: 0,index,title,text,subject,date,class
0,0,Donald Trump Sends Out Embarrassing New Year’...,donald trump just couldn t wish all americans ...,News,"December 31, 2017",1
1,1,Drunk Bragging Trump Staffer Started Russian ...,house intelligence committee chairman devin nu...,News,"December 31, 2017",1
2,2,Sheriff David Clarke Becomes An Internet Joke...,"on friday, it was revealed that former milwauk...",News,"December 30, 2017",1
3,3,Trump Is So Obsessed He Even Has Obama’s Name...,"on christmas day, donald trump announced that ...",News,"December 29, 2017",1
4,4,Pope Francis Just Called Out Donald Trump Dur...,pope francis used his annual christmas day mes...,News,"December 25, 2017",1
...,...,...,...,...,...,...
44893,21412,'Fully committed' NATO backs new U.S. approach...,brussels (reuters) - nato allies on tuesday we...,worldnews,"August 22, 2017",0
44894,21413,LexisNexis withdrew two products from Chinese ...,"london (reuters) - lexisnexis, a provider of l...",worldnews,"August 22, 2017",0
44895,21414,Minsk cultural hub becomes haven from authorities,minsk (reuters) - in the shadow of disused sov...,worldnews,"August 22, 2017",0
44896,21415,Vatican upbeat on possibility of Pope Francis ...,moscow (reuters) - vatican secretary of state ...,worldnews,"August 22, 2017",0


In [38]:
# Baseline bag-of-words: vectorize the article text directly, without the
# manual tokenization / stop-word filtering explored on the samples above.
# The vocabulary is capped at the 5000 most frequent terms.
#
# stop_words='english' selects scikit-learn's built-in English list (the same
# words as ENGLISH_STOP_WORDS). Recent scikit-learn releases validate this
# parameter and accept only 'english', a list, or None, so passing the
# frozenset directly raises an error there.
vect = CountVectorizer(max_features=5000, stop_words='english')

# Learn the vocabulary and build the sparse document-term matrix in one pass.
X = vect.fit_transform(data['text'])

In [39]:
# Materialize the sparse document-term matrix as a dense array.
# NOTE(review): this allocates n_documents x n_features integers in memory;
# keep X sparse for modelling if memory becomes a problem.
my_array = X.toarray()

# get_feature_names() was deprecated and later removed from scikit-learn;
# prefer get_feature_names_out() and fall back only on old installations.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# One row per document, one column per vocabulary term, values are counts.
X_df = pd.DataFrame(my_array, columns=feature_names)

In [40]:
# Display the document-term count table (rows: documents, columns: vocabulary terms).
X_df

Unnamed: 0,00,000,10,100,11,12,120,13,14,15,...,youtube,ypg,zealand,zero,zika,zimbabwe,zone,zones,zor,zuma
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44893,0,2,0,0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
44894,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44895,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44896,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
