In [None]:
# BDA_102 - Stan Lee's Sentiment Model - Group 3 - December 2018
# Iwona Rozco, Jeongseon Lee, Tatiana Khayrullina, Qasim Ali, Roberto Hidalgo

In [28]:
#First step was to gather information from tweetr, save it into a json file and transform it into a csv.
# Modifications will be performed in three directions
# Exclusion of irrelevant records: Only English, fresh no retweets that include references to the compund 'Stan Lee' were 
#           included 
# Exclusion of irrelevant characters: regex was used to clean the text to its very readable bone (off the uctuation).
# Exclusion of irrelevant words: NTLK was used to exclude the more frequent but irrelevant words of the english language.
# In addition to that the text was standardized to lower letters.

#The file to import is 'C:/Users/hidalgo/Documents/BDA_102_FInal_Group_Project/Data_4_model_FromAliQuasim.csv'
#Tweets & Class, tweets text is still dirty, needs to be cleaned but includes the identification of the cases as 
# negative(0), positive(1) or neutral(2)
# In our case: 
# Negative sentiment (offensive, ironic, derogatory and generally negative comments about Stan Lee), labelled as 0
# Affirmative sentiment (sad, nostalgic, celebratory, and generally positive comments about Stan Lee), labelled as 1
# Neutral sentiment (fact based, Stan Lee can be a reference but no judgment is made as so justify a -/+ label), labelled as 2


In [29]:
#Lets try to use Pandas to bring the information into computer memory and then turn it into a manageable format
import pandas as pd
stan_lee_raw_00 = pd.read_csv('C:/Users/hidalgo/Documents/BDA_102_FInal_Group_Project/Data_4_model_FromAliQuasim.csv', sep=',')

In [32]:
print(stan_lee_raw_00.shape)
print(stan_lee_raw_00.columns)
# The file has 4 variables but only 2 are relevant for the analysis: 
# Sentiments -labeled as 0,1,2 as negative, positive, neutral
# text - the actual twitter content that needs further cleaning

#all records are in englisg so the variable can be excluded , unnamed (0) is redundant.

(1804, 4)
Index(['order', 'Sentiments', 'text', 'lang'], dtype='object')


In [34]:
stan_lee_raw_01 = stan_lee_raw_00.drop(['order','lang'],axis=1)
print(stan_lee_raw_01.shape)
print(stan_lee_raw_01.columns)
#Things are in order now

(1804, 2)
Index(['Sentiments', 'text'], dtype='object')


In [35]:
#Lets' clean the file for unwanted characters
stan_lee_raw_01['text'].replace(to_replace='[+âœœïºâ€™ðÿâ*$€™&@ðŸ˜”ðŸ˜­ðŸ˜✒️👀✨😤🙏🏼!#?,.:";]', value= '', regex=True, inplace=True)
stan_lee_raw_01['text'].replace(to_replace='[\r\n]', value= '', regex=True, inplace=True)
stan_lee_raw_01['text'].replace(to_replace='|', value= '', regex=True, inplace=True)


print (stan_lee_raw_01)

      Sentiments                                               text
0              1  Enough said - Stan the man Lee https//tco/zHTL...
1              2  RIP in Peace Stan Lee from MiddleUrinalPod sou...
2              1  StanLeeForever Remix EDM StanLee marvelcomics ...
3              1  Stan Lee Inspired Us All Celebrate His 95 Year...
4              2  Now for something breathtaking A new episode f...
5              2  Rap De Stan Lee EN ESPAÑOL || Frikirap || CriC...
6              2  ZoomTV wreckitralph - Stan Lee RalphWithZoomJo...
7              2  my birthday wish listcarbon monoxide scented c...
8              1  Marvel Studios chief Kevin Feige wrote a tribu...
9              2  Kevin Smith Reacts to Bill Maher Taking a Shot...
10             2  Marvel Studios chief Kevin Feige recalled his ...
11             1  Marvel Studios chief Kevin Feige wrote a tribu...
12             2  ShinGokaiRed I love marvel and Stan Lee but I ...
13             2  I liked a YouTube video https/

In [36]:
#Now let's standarize everything into lower letters
stan_lee_raw_01['text'] = stan_lee_raw_01['text'].str.lower()
print (stan_lee_raw_01)

      Sentiments                                               text
0              1  enough said - stan the man lee https//tco/zhtl...
1              2  rip in peace stan lee from middleurinalpod sou...
2              1  stanleeforever remix edm stanlee marvelcomics ...
3              1  stan lee inspired us all celebrate his 95 year...
4              2  now for something breathtaking a new episode f...
5              2  rap de stan lee en español || frikirap || cric...
6              2  zoomtv wreckitralph - stan lee ralphwithzoomjo...
7              2  my birthday wish listcarbon monoxide scented c...
8              1  marvel studios chief kevin feige wrote a tribu...
9              2  kevin smith reacts to bill maher taking a shot...
10             2  marvel studios chief kevin feige recalled his ...
11             1  marvel studios chief kevin feige wrote a tribu...
12             2  shingokaired i love marvel and stan lee but i ...
13             2  i liked a youtube video https/

In [38]:
#Let's split the file now so the tweet information is separated from the class
class_list = stan_lee_raw_01['Sentiments']
feature_text = stan_lee_raw_01['text']
print(class_list.shape)
print(feature_text.shape)

(1804,)
(1804,)


In [40]:
print(type(feature_text))

<class 'pandas.core.series.Series'>


In [41]:
#Now the information is clean, it is time to introduce the nltk tools  
#The CountVectorizer provides not only a simple way to both tokenize a collection of text documents 
#and build a vocabulary of known words, but also to encode new documents using that vocabulary
 
import nltk
from sklearn.feature_extraction.text import CountVectorizer
#that's a safe way to exclude from the list of a good chunk of words that are frequent but not relevant
vectorizer =CountVectorizer(stop_words='english') 

# To actually create the vectorizer, we need to call fit on the text
# data that we wish to fix
#vectorizer.fit(sample_text) ..In the case sample_text = feature_text
vectorizer.fit(feature_text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [42]:
# Now, we can inspect how the vectorizer 'vectorized' the text
# This will print out a list of words used, and their index in the vectors
#The vocabulary has a dictionary format {'text': frequency}
#The function is: print(vectorizer.vocabulary_)
print(vectorizer.vocabulary_)

{'said': 4396, 'stan': 4728, 'man': 3154, 'lee': 2950, 'https': 2430, 'tco': 4937, 'zhtlfsj2vw': 5747, 'rip': 4276, 'peace': 3772, 'middleurinalpod': 3302, 'sound': 4668, 'amazing': 485, 'doesn': 1496, 'know': 2858, 'want': 5434, 'click': 1094, 'link': 3020, 'wckl8xdxje': 5457, 'stanleeforever': 4734, 'remix': 4191, 'edm': 1638, 'stanlee': 4733, 'marvelcomics': 3193, 'dubstep': 1570, 'youtube': 5704, 'finally': 1899, 'complete': 1159, 'dccomics': 1345, 'byebiuaifq': 902, 'inspired': 2572, 'celebrate': 986, '95': 329, 'years': 5671, 'a11vlkmjdq': 359, 'breathtaking': 839, 'new': 3505, 'episode': 1709, 'dinnerpodcast': 1460, 'news': 3511, 'kimhwj0lcs': 2837, 'rap': 4107, 'en': 1677, 'español': 1724, 'frikirap': 1986, 'cricri': 1253, 'no9fu32f30': 3541, 'thanks': 4984, 'fun': 2007, 'zoomtv': 5759, 'wreckitralph': 5567, 'ralphwithzoomjoin': 4099, 'winimmacristy': 5506, 'imtophey': 2531, 'imbertjr': 2512, 'dyajahkim': 1588, 'imcarlc': 2513, 'birthday': 758, 'wish': 5512, 'listcarbon': 3023,

In [45]:
print(type(vectorizer.vocabulary))

<class 'NoneType'>


In [51]:
#Here a matrix is created for all the data
#print(vectorizer.fit_transform(list_data).todense())
bag_wd=vectorizer.fit_transform(feature_text).toarray()
print(bag_wd)


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
#Here a matrix is created for all the data
#print(vectorizer.fit_transform(list_data).todense())
bag_wd=vectorizer.fit_transform(tweet).toarray()
print(bag_wd)

In [52]:
#Pandas intervention: A DataFrame is created with the bag of words information
# the column names correspond to the words (features)
bagw_df=pd.DataFrame(bag_wd,columns=vectorizer.get_feature_names())
bagw_df.shape

(1804, 5767)

In [55]:
#no need to do anythin with class_list since it is a panda series already with a perfect match (no sort in any of the components)
# to the three classes 0,1,2)
class_list.value_counts()

2    917
1    579
0    308
Name: Sentiments, dtype: int64

In [56]:
#For building a model to predict sentiment it is necessary to create the X,Y sets, split them into train and test,
# proceed to select an specific model given the nature of the data, fit the data, create a prediction and find out how accurate 
# the model is at predicting.

#So, for X and Y
X = bagw_df
Y = class_list

In [57]:
#Now we model using sklearn: The option will be for the Naive Bayesian model which is pretty straight forward in terms 
# of estimation and appropiate in terms of the data profile. The multinomial version adjusts well to the existence of more than
# two possible outcomes. In our case 
# Negative sentiment (offensive, ironic, derogatory and generally negative comments about Stan Lee), labelled as 0
# Affirmative sentiment (sad, nostalgic, celebratory, and generally positive comments about Stan Lee), labelled as 1
# Neutral sentiment (fact based, Stan Lee can be a reference but no judgment is made as so justify a -/+ label), labelled as 2

from sklearn.model_selection import train_test_split
from sklearn import  metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [58]:
#The syntaxis for the estimation is 
#x_train, x_test, y_train, y_test = train_test_split(  X, Y, test_size='', random_state=)
#x_train, x_test, y_train, y_test = train_test_split(df, df_class, test_size=0.3, random_state=58)

#what is happening here is that df and df class are used as x, y for the estimation which makes sense

#y is mainly a vector of 2,1,0 values based on the attribute neutral,pos,neg, x gets the words that might predict neutral,pos,neg

#the test_size is 0.3 meaning that the train set is 70% of the data randomly selected, with the rermaining 30% reserved for test.

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=58)

#The Gaussian option is selected but the model is still not run
stan_lee_model = MultinomialNB()

In [59]:
# We proceed to estimate. 
#No prior probabilities defined for the classes (priors=None)

stan_lee_model.fit(x_train, y_train)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [61]:
#Here the model is used to estimate a prediction vector using x_test
stan_lee_pred=stan_lee_model.predict(x_test)

In [62]:
# Now the model is put to the test ... Is it good? 
# The accuracy score provides an answer
# metrics.accuracy_score(desired_output,output)
# produces a value between 0 (bad) and 1 (too good) that indicates how good the model performs
# it is applied for the test set, new data y_test and stan_lee_pred, which is the prediction of the stan_lee_model using y_test

print(metrics.accuracy_score(y_test,stan_lee_pred))


0.5129151291512916


In [None]:
# As expected the model does not show much accuracy. It is better than using a dice to predict sentiment but yet, 
# not a clear classification between the good and bad sentiment.

In [63]:
# Another metric that defines the performance of the Stan_lee model is the CONFUSION Matrix, whose 
# main diagonal indicates the number of matches between prediction and reality (the model stating negative, positive or 
# neutral sentiment versus the actual labeling of the tweet.)
print(metrics.confusion_matrix(y_test, stan_lee_pred))

[[ 15  43  35]
 [ 23  87  67]
 [ 33  63 176]]


In [None]:
# The main row is 15+87+176 =  278/542 ==> 51.29% as stated before.  

In [64]:
#Summary of indicators 
#The information in the confusion matrix is properly summarized using two criteria:

# Precision: How many of the predicted by the model cases are actually good predictions
# Recall: How many of the actual cases are actually captured by the model.

print(classification_report(y_test, output))

             precision    recall  f1-score   support

          0       0.21      0.16      0.18        93
          1       0.45      0.49      0.47       177
          2       0.63      0.65      0.64       272

avg / total       0.50      0.51      0.51       542



In [None]:
# The model faces some challenges when predicting negatives ... understandable given the scenario the data was pulled 
# in which the general tone was not a negative one and the mood was basically positive and there was a positive tone even
# in the negative tweets detected. There is an interesting lesson here which is that the actual dictionary of ANY text project
# might need to be a contruction of words built on extraction at different times and circumstances. 
# Seems to us the best way to prevent bias and create missing cases by simply design.
# In this current Stan Lee related dictionary,  really offensive words will not be recognized as they were not part of the corpus at mourning time.
# The result here I think is independent of the statistical technique and simply reflects the nature of the data as 
# in the time it was captured. 

In [None]:
# BDA_102 - Stan Lee's Sentiment Model - Group 3 - December 2018
# Iwona Rozco, Jeongseon Lee, Tatiana Khayrullina, Qasim Ali, Roberto Hidalgo