Module Imports

In [58]:
# Cleaning the Text Data
import nltk
import pandas as pd 
import numpy as np
import matplotlib.pyplot as  plt
import seaborn as sns
import re 
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet

# Model building
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Model evaluation
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score

Loading Train Dataset

In [59]:
#Loading and assigning a variable to the train dataset
df_train = pd.read_csv('train_6.csv', skipinitialspace = True)

In [60]:
#Checking the train dataset.
df_train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


Cleaning Train Dataset

In [61]:
#Dataset Cleaning Function
def CleanTweets(tweets):
    # Converts from upper case to lower case
    tweets = tweets.lower()
    # removes the rt and wired strings at the begining of tweets
    tweets = tweets.replace('rt','')
    tweets = tweets.replace('wired','')
    # remove emojis
    tweets = tweets.replace('[^A-Za-z0-9]','')
    # removing numbers
    tweets = re.sub(r'\d+','',tweets)
    # removes @ mentions 
    tweets = re.sub('@[\w]*','',tweets)
    # removes urls
    tweets = re.sub(r'https?:\/\/.*\/\w*','',tweets)
    # removes hashtags
    tweets = re.sub(r'#\w*','',tweets)
    # removes punctuation
    tweets = ''.join([l for l in tweets if l not in string.punctuation])   
    # removes funny diamond
    tweets = re.sub(r"U+FFFD ",'', tweets)
    # Removes extra white space
    tweets = re.sub(r'\s\s+','',tweets)
    # removes the newline characters [\n] from pandas column
    tweets = tweets.replace('\n', ' ')
    # removes space infront of tweet
    tweets = tweets.strip()

    return tweets

In [62]:
# Applying the Dataset Cleaning Function
df_train['message'] = df_train['message'].apply(CleanTweets)

In [63]:
# Checking the Train Dataset after Cleaning
df_train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,polyscimajor epa chief doesnt think carbon dio...,625221
1,1,its not like we lack evidence of anthropogenic...,126103
2,2,researchers say we have three years to act on ...,698562
3,1,was a pivotal year in the war on climate change,573736
4,1,itsand a racist sexist climate change denying ...,466954


In [64]:
# Checking the Messages Column after Cleaning
df_train['message'].head(60)

0     polyscimajor epa chief doesnt think carbon dio...
1     its not like we lack evidence of anthropogenic...
2     researchers say we have three years to act on ...
3       was a pivotal year in the war on climate change
4     itsand a racist sexist climate change denying ...
5     woh a read whether you do or dont believe in c...
6     mike pence doesn’t believe in global warming o...
7     six big things we can all do today to fight cl...
8     my yo nephew is inconsolable he wants to die o...
9     no offense… but like… how do you just not beli...
10    shes thinking about how shes going to die beca...
11    i do hope people who are vocal about climate c...
12    we only have apercent chance of avoiding ‘dang...
13    oh my godtrumps government removes climate cha...
14    fossil fuel giant exxonmobil ‘misled’ the publ...
15    i dont wanna live forever – and nothing will b...
16    issues scrubbed fromtoday civil rights climate...
17    if our elected leaders fail to approach th

Tokenization

In [65]:
# Tokenizing the messages column and adding the result to new [Tokenized messages] column
df_train['Tokenized messages'] = df_train['message'].apply(word_tokenize)

In [66]:
# Checking the Tokenized messages column
df_train['Tokenized messages'].head()

0    [polyscimajor, epa, chief, doesnt, think, carb...
1    [its, not, like, we, lack, evidence, of, anthr...
2    [researchers, say, we, have, three, years, to,...
3    [was, a, pivotal, year, in, the, war, on, clim...
4    [itsand, a, racist, sexist, climate, change, d...
Name: Tokenized messages, dtype: object

Adding New Featuers

In [67]:
# Adding word count feature to the train dataset
df_train['word count'] = df_train['message'].str.len()

In [68]:
# Adding Part of Speech tag to Tokenized messages and assigning value to new [message POS] featue
df_train['message POS'] = df_train['Tokenized messages'].apply(nltk.tag.pos_tag)

In [69]:
# Assigning part of speech tag to tokens
def POS_wordnet(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    

Stop Words

In [70]:
# Creting Stop Word removal feature
def no_stop(row):
    return[word for word in row if word not in stopwords.words('english')]

In [71]:
# Applying stop word removal function to Tokenized messages
df_train['Tokenized messages'] = df_train['Tokenized messages'].apply(lambda x: no_stop(x))

In [72]:
# Cheking the Tokenized messages column
df_train['Tokenized messages'].head(60)

0     [polyscimajor, epa, chief, doesnt, think, carb...
1     [like, lack, evidence, anthropogenic, global, ...
2     [researchers, say, three, years, act, climate,...
3                 [pivotal, year, war, climate, change]
4     [itsand, racist, sexist, climate, change, deny...
5     [woh, read, whether, dont, believe, climate, c...
6     [mike, pence, ’, believe, global, warming, smo...
7     [six, big, things, today, fight, climate, chan...
8     [yo, nephew, inconsolable, wants, die, old, ag...
9       [offense…, like…, believe…, global, warming………]
10    [shes, thinking, shes, going, die, husband, do...
11    [hope, people, vocal, climate, change, also, p...
12    [apercent, chance, avoiding, ‘, dangerous, ’, ...
13    [oh, godtrumps, government, removes, climate, ...
14    [fossil, fuel, giant, exxonmobil, ‘, misled, ’...
15    [dont, wan, na, live, forever, –, nothing, cli...
16    [issues, scrubbed, fromtoday, civil, rights, c...
17    [elected, leaders, fail, approach, environ

Lemmatization

In [73]:
#Assiging the WordNetLemmatizer to a variable [wnl]
wnl = nltk.WordNetLemmatizer()

In [74]:
# Creating lematizer function to apply to messages column
def lemmatize_text(row):
    return [wnl.lemmatize(word) for word in row] 

In [75]:
# Lemmatizing the Tokenized messages column and assigning the value to a new feature [lemma]
df_train['lemma'] = df_train['Tokenized messages'].apply(lambda x: lemmatize_text(x))

In [76]:
# Checking the lemma column
df_train['lemma'].head()

0    [polyscimajor, epa, chief, doesnt, think, carb...
1    [like, lack, evidence, anthropogenic, global, ...
2    [researcher, say, three, year, act, climate, c...
3                [pivotal, year, war, climate, change]
4    [itsand, racist, sexist, climate, change, deny...
Name: lemma, dtype: object

Model Building 

In [77]:
# Checking the df_train dataset
df_train.head()

Unnamed: 0,sentiment,message,tweetid,Tokenized messages,word count,message POS,lemma
0,1,polyscimajor epa chief doesnt think carbon dio...,625221,"[polyscimajor, epa, chief, doesnt, think, carb...",99,"[(polyscimajor, JJ), (epa, NN), (chief, NN), (...","[polyscimajor, epa, chief, doesnt, think, carb..."
1,1,its not like we lack evidence of anthropogenic...,126103,"[like, lack, evidence, anthropogenic, global, ...",61,"[(its, PRP$), (not, RB), (like, IN), (we, PRP)...","[like, lack, evidence, anthropogenic, global, ..."
2,2,researchers say we have three years to act on ...,698562,"[researchers, say, three, years, act, climate,...",83,"[(researchers, NNS), (say, VBP), (we, PRP), (h...","[researcher, say, three, year, act, climate, c..."
3,1,was a pivotal year in the war on climate change,573736,"[pivotal, year, war, climate, change]",47,"[(was, VBD), (a, DT), (pivotal, JJ), (year, NN...","[pivotal, year, war, climate, change]"
4,1,itsand a racist sexist climate change denying ...,466954,"[itsand, racist, sexist, climate, change, deny...",75,"[(itsand, VB), (a, DT), (racist, NN), (sexist,...","[itsand, racist, sexist, climate, change, deny..."


In [79]:
df_train['lemma'].head()

0    [polyscimajor, epa, chief, doesnt, think, carb...
1    [like, lack, evidence, anthropogenic, global, ...
2    [researcher, say, three, year, act, climate, c...
3                [pivotal, year, war, climate, change]
4    [itsand, racist, sexist, climate, change, deny...
Name: lemma, dtype: object

Vectorizing the lemma feature 

In [80]:
def token(text):
    return text

In [81]:
vect = TfidfVectorizer(tokenizer = token, lowercase = False)
vect_tmessages = vect.fit_transform(df_train['lemma'])

In [82]:
# Train test splitting
X = vect_tmessages
Y = df_train['sentiment']

In [83]:
#Setting the train test ratio and assiging X,y(train, test) values
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25, random_state = 42)

In [87]:

forest = RandomForestClassifier(n_estimators=100, random_state=50)
forest.fit(X_train, y_train)

In [88]:
# Generating predictions using the random forest model
prediction = forest.predict(X_test) 

Model Evaluation

In [89]:
# Print of the model performance
print(metrics.classification_report(y_test, prediction))

              precision    recall  f1-score   support

          -1       0.87      0.23      0.36       336
           0       0.58      0.31      0.40       547
           1       0.68      0.91      0.78      2178
           2       0.76      0.57      0.65       894

    accuracy                           0.69      3955
   macro avg       0.72      0.50      0.55      3955
weighted avg       0.70      0.69      0.66      3955

