In [2]:
#importing libraries
import pandas as pd
import re
import string
import nltk

import warnings
warnings.filterwarnings('ignore')

print("Setup Complete")

Setup Complete


In [3]:
df = pd.read_csv("Final dataset.csv")
df = df[['content', 'label']]
df.head()

Unnamed: 0,content,label
0,Giuliani: Tax report proves Trump is a 'genius...,0
1,"#Trump in 2016 Temp:Crab Orchard, Ky."":62.1°F ...",0
2,Iconic Charcoaler hamburger brand can be yours...,1
3,New audio: Clinton refers to Sanders fans as f...,0
4,Can I make money sleeping please @SYeckering @...,1


In [4]:
#Converting to lower case
df['content'] = df['content'].str.lower()
df

Unnamed: 0,content,label
0,giuliani: tax report proves trump is a 'genius...,0
1,"#trump in 2016 temp:crab orchard, ky."":62.1°f ...",0
2,iconic charcoaler hamburger brand can be yours...,1
3,new audio: clinton refers to sanders fans as f...,0
4,can i make money sleeping please @syeckering @...,1
...,...,...
17574,.@cavs defeat boston in the eastern conference...,1
17575,so how do you explain trump’s continued popula...,0
17576,time is running short for opponents of gmo lab...,1
17577,#igetdepressedwhen i have to pay rent https://...,1


In [5]:
#Removing digits and words containing digits
df['content'] = df['content'].str.replace('\w*\d\w*', '')
df

Unnamed: 0,content,label
0,giuliani: tax report proves trump is a 'genius...,0
1,"#trump in temp:crab orchard, ky."":.°f wind:. ...",0
2,iconic charcoaler hamburger brand can be yours...,1
3,new audio: clinton refers to sanders fans as f...,0
4,can i make money sleeping please @syeckering @...,1
...,...,...
17574,.@cavs defeat boston in the eastern conference...,1
17575,so how do you explain trump’s continued popula...,0
17576,time is running short for opponents of gmo lab...,1
17577,#igetdepressedwhen i have to pay rent https://...,1


In [6]:
#Removing URLs
url_pattern = re.compile(r'https?://\S+|www\.\S+')
df['content'] = df['content'].str.replace(url_pattern, '')
df

Unnamed: 0,content,label
0,giuliani: tax report proves trump is a 'genius...,0
1,"#trump in temp:crab orchard, ky."":.°f wind:. ...",0
2,iconic charcoaler hamburger brand can be yours...,1
3,new audio: clinton refers to sanders fans as f...,0
4,can i make money sleeping please @syeckering @...,1
...,...,...
17574,.@cavs defeat boston in the eastern conference...,1
17575,so how do you explain trump’s continued popula...,0
17576,time is running short for opponents of gmo lab...,1
17577,#igetdepressedwhen i have to pay rent,1


In [7]:
#Removing punctuations
df['content'] = df['content'].str.replace("[^a-zA-Z]", " ")
df

Unnamed: 0,content,label
0,giuliani tax report proves trump is a genius...,0
1,trump in temp crab orchard ky f wind ...,0
2,iconic charcoaler hamburger brand can be yours...,1
3,new audio clinton refers to sanders fans as f...,0
4,can i make money sleeping please syeckering ...,1
...,...,...
17574,cavs defeat boston in the eastern conference...,1
17575,so how do you explain trump s continued popula...,0
17576,time is running short for opponents of gmo lab...,1
17577,igetdepressedwhen i have to pay rent,1


In [8]:
#Removing emojis

df['content'] = df['content'].astype(str)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"
                               u"\U0001F300-\U0001F5FF"
                               u"\U0001F680-\U0001F6FF"
                               u"\U0001F1E0-\U0001F1FF"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags = re.UNICODE)
    return emoji_pattern.sub(r'', text)

df['content'] = df['content'].apply(lambda text: remove_emoji(text))
df

Unnamed: 0,content,label
0,giuliani tax report proves trump is a genius...,0
1,trump in temp crab orchard ky f wind ...,0
2,iconic charcoaler hamburger brand can be yours...,1
3,new audio clinton refers to sanders fans as f...,0
4,can i make money sleeping please syeckering ...,1
...,...,...
17574,cavs defeat boston in the eastern conference...,1
17575,so how do you explain trump s continued popula...,0
17576,time is running short for opponents of gmo lab...,1
17577,igetdepressedwhen i have to pay rent,1


In [9]:
#Removing stopwords

df['content'] = df['content'].astype(str)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    filtered_sentence = []
    
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
            
    return filtered_sentence


df['content'] = df['content'].apply(lambda text: remove_stopwords(text))
df

Unnamed: 0,content,label
0,"[giuliani, tax, report, proves, trump, genius,...",0
1,"[trump, temp, crab, orchard, ky, f, wind, pres...",0
2,"[iconic, charcoaler, hamburger, brand]",1
3,"[new, audio, clinton, refers, sanders, fans, f...",0
4,"[make, money, sleeping, please, syeckering, le...",1
...,...,...
17574,"[cavs, defeat, boston, eastern, conference, fi...",1
17575,"[explain, trump, continued, popularity, stephe...",0
17576,"[time, running, short, opponents, gmo, labelin...",1
17577,"[igetdepressedwhen, pay, rent]",1


In [10]:
#Stemming

df['content'] = df['content'].astype(str)

from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
df['content'] = df['content'].apply(lambda x: " ".join([porter.stem(word) for word in x.split()]))

df

Unnamed: 0,content,label
0,"['giuliani', 'tax', 'report', 'proves', 'trump...",0
1,"['trump', 'temp', 'crab', 'orchard', 'ky', 'f'...",0
2,"['iconic', 'charcoaler', 'hamburger', 'brand']",1
3,"['new', 'audio', 'clinton', 'refers', 'sanders...",0
4,"['make', 'money', 'sleeping', 'please', 'syeck...",1
...,...,...
17574,"['cavs', 'defeat', 'boston', 'eastern', 'confe...",1
17575,"['explain', 'trump', 'continued', 'popularity'...",0
17576,"['time', 'running', 'short', 'opponents', 'gmo...",1
17577,"['igetdepressedwhen', 'pay', 'rent']",1


In [12]:
#Lemmatization

df['content'] = df['content'].astype(str)

from nltk.stem.wordnet import WordNetLemmatizer
wordnet = WordNetLemmatizer()
df['content'] = df['content'].apply(lambda x: " ".join([wordnet.lemmatize(word) for word in x.split()]))

df

Unnamed: 0,content,label
0,"['giuliani', 'tax', 'report', 'proves', 'trump...",0
1,"['trump', 'temp', 'crab', 'orchard', 'ky', 'f'...",0
2,"['iconic', 'charcoaler', 'hamburger', 'brand']",1
3,"['new', 'audio', 'clinton', 'refers', 'sanders...",0
4,"['make', 'money', 'sleeping', 'please', 'syeck...",1
...,...,...
17574,"['cavs', 'defeat', 'boston', 'eastern', 'confe...",1
17575,"['explain', 'trump', 'continued', 'popularity'...",0
17576,"['time', 'running', 'short', 'opponents', 'gmo...",1
17577,"['igetdepressedwhen', 'pay', 'rent']",1


In [11]:
df.to_csv("Final dataset 2.csv")
df

Unnamed: 0,content,label
0,"['giuliani', 'tax', 'report', 'proves', 'trump...",0
1,"['trump', 'temp', 'crab', 'orchard', 'ky', 'f'...",0
2,"['iconic', 'charcoaler', 'hamburger', 'brand']",1
3,"['new', 'audio', 'clinton', 'refers', 'sanders...",0
4,"['make', 'money', 'sleeping', 'please', 'syeck...",1
...,...,...
17574,"['cavs', 'defeat', 'boston', 'eastern', 'confe...",1
17575,"['explain', 'trump', 'continued', 'popularity'...",0
17576,"['time', 'running', 'short', 'opponents', 'gmo...",1
17577,"['igetdepressedwhen', 'pay', 'rent']",1
