## Import Libraries

In [1]:
import re
import string
import warnings

import nltk
import pandas as pd
from bs4 import BeautifulSoup
from nltk import tokenize
from nltk.stem import WordNetLemmatizer
# Suppress every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings('ignore')

## Load Dataset

In [2]:
# Read the raw dataset from the CSV file in the working directory,
# then display its (rows, columns) shape.
data = pd.read_csv("FakeReal.csv")
data.shape

(106239, 5)

## Deleting rows with NaN/Empty cells

In [3]:
# Remove incomplete ROWS (axis=0). The previous axis=1 call dropped whole
# columns that contained a NaN, which could delete the text column itself.
data = data.dropna(axis=0, how="any")
# Discard rows whose text is empty or made up only of whitespace
# (a plain comparison with " " would miss "", "  ", "\n", ...).
data = data[data["text"].str.strip() != ""].reset_index(drop=True)
data

Unnamed: 0,title,text,target,date,subject
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,real,"March 07, 2018",politics
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",real,"May 14, 2016",news
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,fake,"September 11, 2018",politics
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",real,"September 14, 2017",politicsNews
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,real,"July 08, 2018",news
...,...,...,...,...,...
105141,HA-HA! RETIRED MARINE TROLLS LIBERAL NEW YORKE...,It s always a win-win when you can find a hobb...,real,"January 13, 2018",news
105142,House Republican Busted For Illegally Using C...,It is a violation of federal election laws to ...,real,"April 28, 2016",politicsNews
105143,UNHINGED MIKA Called President Trump “Not Well...,Hour one of the MSNBC morning show began typic...,real,"April 20, 2016",politics
105144,WATCH: HILLARY CALLS PARENT Of Benghazi Victim...,Apparently being the wife of a former Democrat...,real,"June 21, 2016",politicsNews


## Dropping unnecessary columns

In [4]:
# Drop the date and subject columns in one call (they are not used in the analysis)
data.drop(columns=["date", "subject"], inplace=True)
data

Unnamed: 0,title,text,target
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,real
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",real
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,fake
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",real
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,real
...,...,...,...
105141,HA-HA! RETIRED MARINE TROLLS LIBERAL NEW YORKE...,It s always a win-win when you can find a hobb...,real
105142,House Republican Busted For Illegally Using C...,It is a violation of federal election laws to ...,real
105143,UNHINGED MIKA Called President Trump “Not Well...,Hour one of the MSNBC morning show began typic...,real
105144,WATCH: HILLARY CALLS PARENT Of Benghazi Victim...,Apparently being the wife of a former Democrat...,real


In [5]:
# Drop the title column; only the text column is used from here on
data.drop(columns=["title"], inplace=True)
data

Unnamed: 0,text,target
0,No comment is expected from Barack Obama Membe...,real
1,"Now, most of the demonstrators gathered last ...",real
2,A dozen politically active pastors came here f...,fake
3,"The RS-28 Sarmat missile, dubbed Satan 2, will...",real
4,All we can say on this one is it s about time ...,real
...,...,...
105141,It s always a win-win when you can find a hobb...,real
105142,It is a violation of federal election laws to ...,real
105143,Hour one of the MSNBC morning show began typic...,real
105144,Apparently being the wife of a former Democrat...,real


## Remove HTML content

In [6]:
def strip_html(text):
    """Return ``text`` with its HTML markup removed.

    The input is parsed by ``BeautifulSoup`` using the ``"html.parser"``
    backend, and the result of ``get_text()`` on the parsed document is
    returned.

    Parameters
    ----------
    text : str
        Raw article text that may contain HTML tags.

    Returns
    -------
    str
        The text content extracted from the parsed markup.
    """
    # BeautifulSoup must be imported at the top of the notebook
    # (``from bs4 import BeautifulSoup``); otherwise this raises NameError.
    return BeautifulSoup(text, "html.parser").get_text()

# Run strip_html on every entry of the text column, then display the frame.
data['text'] = data['text'].apply(strip_html)
data

Unnamed: 0,text,target
0,No comment is expected from Barack Obama Membe...,real
1,"Now, most of the demonstrators gathered last ...",real
2,A dozen politically active pastors came here f...,fake
3,"The RS-28 Sarmat missile, dubbed Satan 2, will...",real
4,All we can say on this one is it s about time ...,real
...,...,...
105141,It s always a win-win when you can find a hobb...,real
105142,It is a violation of federal election laws to ...,real
105143,Hour one of the MSNBC morning show began typic...,real
105144,Apparently being the wife of a former Democrat...,real


## Remove URLs

In [7]:
def remove_url(text):
    """Delete every URL-like token from ``text``.

    A token is removed when it starts with ``http`` and runs up to the next
    whitespace character; it is replaced by the empty string, so the
    surrounding spaces are left as they were.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

# Run remove_url on every entry of the text column, then display the frame.
data['text'] = data['text'].apply(remove_url)
data

Unnamed: 0,text,target
0,No comment is expected from Barack Obama Membe...,real
1,"Now, most of the demonstrators gathered last ...",real
2,A dozen politically active pastors came here f...,fake
3,"The RS-28 Sarmat missile, dubbed Satan 2, will...",real
4,All we can say on this one is it s about time ...,real
...,...,...
105141,It s always a win-win when you can find a hobb...,real
105142,It is a violation of federal election laws to ...,real
105143,Hour one of the MSNBC morning show began typic...,real
105144,Apparently being the wife of a former Democrat...,real


## Convert to lower case

In [8]:
# Lower-case every entry of the text column, then display the frame.
data['text'] = data['text'].apply(str.lower)
data

Unnamed: 0,text,target
0,no comment is expected from barack obama membe...,real
1,"now, most of the demonstrators gathered last ...",real
2,a dozen politically active pastors came here f...,fake
3,"the rs-28 sarmat missile, dubbed satan 2, will...",real
4,all we can say on this one is it s about time ...,real
...,...,...
105141,it s always a win-win when you can find a hobb...,real
105142,it is a violation of federal election laws to ...,real
105143,hour one of the msnbc morning show began typic...,real
105144,apparently being the wife of a former democrat...,real


## Remove punctuation

In [9]:
def punctuation_removal(text, extra='“”’'):
    """Remove ASCII punctuation and extra quote characters from ``text``.

    Parameters
    ----------
    text : str
        The string to clean.
    extra : str, optional
        Additional characters to delete on top of ``string.punctuation``.
        Defaults to the curly double quotes and the right single quote.

    Returns
    -------
    str
        ``text`` with every listed character deleted (nothing is inserted in
        their place, so ``"win-win"`` becomes ``"winwin"``).
    """
    # Concatenate the character sets. ``str.join`` returns a new string and
    # does not extend its receiver, so calling it and discarding the result
    # left the curly quotes in the text.
    unwanted = string.punctuation + extra
    # A single translate pass replaces one ``str.replace`` scan per character.
    return text.translate(str.maketrans('', '', unwanted))

# Run punctuation_removal on every entry of the text column, then display the frame.
data['text'] = data['text'].apply(punctuation_removal)
data

Unnamed: 0,text,target
0,no comment is expected from barack obama membe...,real
1,now most of the demonstrators gathered last n...,real
2,a dozen politically active pastors came here f...,fake
3,the rs28 sarmat missile dubbed satan 2 will re...,real
4,all we can say on this one is it s about time ...,real
...,...,...
105141,it s always a winwin when you can find a hobby...,real
105142,it is a violation of federal election laws to ...,real
105143,hour one of the msnbc morning show began typic...,real
105144,apparently being the wife of a former democrat...,real


## Tokenize the text column cells

In [10]:
def tokenization(text):
    """Split ``text`` into tokens and return what ``nltk.word_tokenize`` yields."""
    return nltk.word_tokenize(text)

# Replace each text entry with its token list from tokenization, then display the frame.
data['text'] = data['text'].apply(tokenization)
data

Unnamed: 0,text,target
0,"[no, comment, is, expected, from, barack, obam...",real
1,"[now, most, of, the, demonstrators, gathered, ...",real
2,"[a, dozen, politically, active, pastors, came,...",fake
3,"[the, rs28, sarmat, missile, dubbed, satan, 2,...",real
4,"[all, we, can, say, on, this, one, is, it, s, ...",real
...,...,...
105141,"[it, s, always, a, winwin, when, you, can, fin...",real
105142,"[it, is, a, violation, of, federal, election, ...",real
105143,"[hour, one, of, the, msnbc, morning, show, beg...",real
105144,"[apparently, being, the, wife, of, a, former, ...",real


## Remove stopwords and words shorter than five characters

In [11]:
# English stop words from NLTK, kept in a set so that each per-token
# membership check is a hash lookup instead of a scan through a list.
# NOTE(review): needs the NLTK "stopwords" corpus to be available locally — confirm
# it has been downloaded in the target environment.
stopwords = set(nltk.corpus.stopwords.words('english'))
def remove_stopwords(text, min_length=5):
    """Filter stop words and short tokens out of a token sequence.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.
    min_length : int, optional
        Minimum number of characters a token needs in order to be kept.
        The default of 5 keeps exactly the tokens with ``len(w) > 4``.

    Returns
    -------
    list of str
        The tokens, in their original order, that are at least
        ``min_length`` characters long and are not contained in the
        module-level ``stopwords`` collection.
    """
    # The length test runs first because it is cheaper than the membership
    # lookup; the set of kept tokens is the same either way.
    return [w for w in text if len(w) >= min_length and w not in stopwords]

# Filter each token list with remove_stopwords, then display the frame.
data['text'] = data['text'].apply(remove_stopwords)
data

Unnamed: 0,text,target
0,"[comment, expected, barack, obama, members, fy...",real
1,"[demonstrators, gathered, night, exercising, c...",real
2,"[dozen, politically, active, pastors, private,...",fake
3,"[sarmat, missile, dubbed, satan, replace, flie...",real
4,"[someone, southern, poverty, centeron, tuesday...",real
...,...,...
105141,"[always, winwin, hobby, enjoy, especially, all...",real
105142,"[violation, federal, election, campaign, funds...",real
105143,"[msnbc, morning, began, typically, enough, ass...",real
105144,"[apparently, former, democrat, president, seri...",real


## Lemmatize the words

In [12]:
# Single shared WordNetLemmatizer instance, used by the lemmatizer function below.
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizer(text):
    """Lemmatize each token in ``text`` and join the results into one string.

    Every token is passed on its own to ``wordnet_lemmatizer.lemmatize``
    (no part-of-speech argument is given), and the lemmas are joined with
    single spaces.
    """
    lemmas = map(wordnet_lemmatizer.lemmatize, text)
    return ' '.join(lemmas)

# Replace each token list with the space-joined string returned by lemmatizer, then display the frame.
data['text'] = data['text'].apply(lemmatizer)
data

Unnamed: 0,text,target
0,comment expected barack obama member fyf911 fu...,real
1,demonstrator gathered night exercising constit...,real
2,dozen politically active pastor private dinner...,fake
3,sarmat missile dubbed satan replace fly mile r...,real
4,someone southern poverty centeron tuesday jame...,real
...,...,...
105141,always winwin hobby enjoy especially allows ex...,real
105142,violation federal election campaign fund perso...,real
105143,msnbc morning began typically enough assembled...,real
105144,apparently former democrat president serial pe...,real


## Save the preprocessed dataset

In [13]:
# Write the preprocessed frame to CSV; index=False leaves the row index out of the file.
data.to_csv("PreprocessedRealFake.csv", index=False)