# Imports / Setup

In [24]:
import pandas as pd
# pd.options.display.max_columns = 50
# pd.options.display.max_colwidth = 150

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate,\
                                    GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

import nltk
from nltk import pos_tag
from nltk.corpus import wordnet, stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
# English stop-word list from NLTK; prepare_doc() filters tokens against it.
sw = stopwords.words('english')

# Helper Functions/Code

Much of this code comes courtesy of lectures and Saad.

In [25]:
def prepare_doc(doc, stem=False):
    """Tokenize a raw document into a list of cleaned, lowercase word tokens.

    Steps, in order: regex tokenization (alphabetic runs only), lowercasing,
    stop-word removal against the module-level ``sw`` list, and optional
    Snowball stemming.

    Parameters
    ----------
    doc : str
        Raw text to process.
    stem : bool, default False
        When True, each surviving token is stemmed with the English
        Snowball stemmer.

    Returns
    -------
    list of str
        The processed tokens, in their original order.
    """
    # Alphabetic runs, optionally followed by a typographic apostrophe (’)
    # and lowercase letters. NOTE(review): a straight apostrophe (') is not
    # part of the pattern, so "isn't" is split into "isn" and "t" -- confirm
    # that this is intended.
    regex_token = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")
    # Snapshot the stop words as a set for O(1) membership tests. It is
    # rebuilt on every call so later additions to ``sw`` are still honoured.
    stop_words = set(sw)
    tokens = [
        word
        for word in (token.lower() for token in regex_token.tokenize(doc))
        if word not in stop_words
    ]
    # Only build the stemmer when it is actually needed.
    if stem:
        stemmer = SnowballStemmer("english")
        tokens = [stemmer.stem(word) for word in tokens]
    return tokens

# Data: Loading, Cleaning, and Munging

In [26]:
# Load the raw tweet dataset; the path is relative to the working directory.
data_df = pd.read_csv('../../data/judge_1377884607_tweet_product_company.csv')

In [27]:
# Display the frame to inspect its columns and a sample of rows.
data_df

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
8716,Ipad everywhere. #SXSW {link},iPad,Positive emotion
8717,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
8718,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
8719,Some Verizon iPhone customers complained their...,,No emotion toward brand or product


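Let's rename the columns for ease of use. Note that `product` is not a reserved word in Python, but `DataFrame.product` is an existing pandas method, so a column named `product` could not be reached with attribute access (`data_df.product`); for the second column, initially titled `emotion_in_tweet_is_directed_at`, I use `product_name` instead.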

In [28]:
# Swap the verbose survey-style headers for short names, modifying the frame in place.
data_df.rename(
    columns=dict(
        tweet_text='text',
        emotion_in_tweet_is_directed_at='product_name',
        is_there_an_emotion_directed_at_a_brand_or_product='emotion',
    ),
    inplace=True,
)

In [29]:
# Class balance of the sentiment labels.
data_df.emotion.value_counts()

No emotion toward brand or product    5156
Positive emotion                      2869
Negative emotion                       545
I can't tell                           151
Name: emotion, dtype: int64

In [30]:
# Column dtypes and non-null counts, to spot missing values.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8721 entries, 0 to 8720
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   text          8720 non-null   object
 1   product_name  3169 non-null   object
 2   emotion       8721 non-null   object
dtypes: object(3)
memory usage: 204.5+ KB


One null value in the `text` column. Let's check that out real quick.

In [31]:
# Show the rows whose tweet text is missing.
data_df.loc[data_df['text'].isnull()]

Unnamed: 0,text,product_name,emotion
6,,,No emotion toward brand or product


In [32]:
# Remove rows with no tweet text (the single row shown above); modifies data_df in place.
data_df.dropna(subset=['text'],
               inplace=True)

In [33]:
# Product mentions (absolute counts) and sentiment labels (proportions),
# each followed by a blank line.
print(data_df.product_name.value_counts(), end="\n\n")
print(data_df.emotion.value_counts(normalize=True), end="\n\n")

iPad                               910
Apple                              640
iPad or iPhone App                 451
Google                             412
iPhone                             288
Other Google product or service    282
Android App                         78
Android                             74
Other Apple product or service      34
Name: product_name, dtype: int64

No emotion toward brand or product    0.591170
Positive emotion                      0.329014
Negative emotion                      0.062500
I can't tell                          0.017317
Name: emotion, dtype: float64



`I can't tell` makes up less than 2% of our dataset, and doesn't offer much more information in the way of word significance than the tweets labeled `No emotion toward brand or product`. It might be worth dropping records where `emotion` has been recorded as `I can't tell`.

In [34]:
# Remove the ambiguous "I can't tell" rows in place, then preview the result.
data_df.drop(
    index=data_df.loc[data_df['emotion'].eq("I can't tell")].index,
    inplace=True,
)

data_df.head()

Unnamed: 0,text,product_name,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [35]:
# Tweet text at position 10 (positional, not label-based), used as a test document.
sample1 = data_df['text'].iloc[10]

In [36]:
# Try the preprocessing helper, with stemming, on a single tweet.
prepare_doc(sample1, stem=True)

['find',
 'amp',
 'start',
 'impromptu',
 'parti',
 'sxsw',
 'hurricaneparti',
 'http',
 'bit',
 'ly',
 'gvlrin',
 'wait',
 'til',
 'android',
 'app',
 'come']

In [37]:
# Tokenize, clean and stem every tweet; Series.apply forwards `stem=True` to prepare_doc.
full_text_data = data_df['text'].apply(prepare_doc, stem=True)

full_text_data

0       [wesley, g, iphon, hrs, tweet, rise, austin, d...
1       [jessede, know, fludapp, awesom, ipad, iphon, ...
2              [swonderlin, wait, ipad, also, sale, sxsw]
3       [sxsw, hope, year, festiv, crashi, year, iphon...
4       [sxtxstate, great, stuff, fri, sxsw, marissa, ...
                              ...                        
8716                        [ipad, everywher, sxsw, link]
8717    [wave, buzz, rt, mention, interrupt, regular, ...
8718    [googl, zeiger, physician, never, report, pote...
8719    [verizon, iphon, custom, complain, time, fell,...
8720    [rt, mention, googl, test, check, offer, sxsw,...
Name: text, Length: 8569, dtype: object

In [38]:
# Attach the token lists as a new column. full_text_data was derived from
# data_df['text'], so it carries the same index as the frame.
data_df['preprocessed_text'] = full_text_data

data_df.head()

Unnamed: 0,text,product_name,emotion,preprocessed_text
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,"[wesley, g, iphon, hrs, tweet, rise, austin, d..."
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,"[jessede, know, fludapp, awesom, ipad, iphon, ..."
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,"[swonderlin, wait, ipad, also, sale, sxsw]"
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,"[sxsw, hope, year, festiv, crashi, year, iphon..."
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,"[sxtxstate, great, stuff, fri, sxsw, marissa, ..."


In [39]:
# First 20 raw tweets, for comparison with their preprocessed form.
data_df['text'].head(20)

0     .@wesley83 I have a 3G iPhone. After 3 hrs twe...
1     @jessedee Know about @fludapp ? Awesome iPad/i...
2     @swonderlin Can not wait for #iPad 2 also. The...
3     @sxsw I hope this year's festival isn't as cra...
4     @sxtxstate great stuff on Fri #SXSW: Marissa M...
5     @teachntech00 New iPad Apps For #SpeechTherapy...
7     #SXSW is just starting, #CTIA is around the co...
8     Beautifully smart and simple idea RT @madebyma...
9     Counting down the days to #sxsw plus strong Ca...
10    Excited to meet the @samsungmobileus at #sxsw ...
11    Find &amp; Start Impromptu Parties at #SXSW Wi...
12    Foursquare ups the game, just in time for #SXS...
13    Gotta love this #SXSW Google Calendar featurin...
14    Great #sxsw ipad app from @madebymany: http://...
15    haha, awesomely rad iPad app by @madebymany ht...
16    Holler Gram for iPad on the iTunes App Store -...
17    I just noticed DST is coming this weekend. How...
18    Just added my #SXSW flights to @planely. M

In [42]:
# Token lists for the same first 20 tweets.
data_df['preprocessed_text'].head(20)

0     [wesley, g, iphon, hrs, tweet, rise, austin, d...
1     [jessede, know, fludapp, awesom, ipad, iphon, ...
2            [swonderlin, wait, ipad, also, sale, sxsw]
3     [sxsw, hope, year, festiv, crashi, year, iphon...
4     [sxtxstate, great, stuff, fri, sxsw, marissa, ...
5     [teachntech, new, ipad, app, speechtherapi, co...
7     [sxsw, start, ctia, around, corner, googleio, ...
8     [beauti, smart, simpl, idea, rt, madebymani, t...
9     [count, day, sxsw, plus, strong, canadian, dol...
10    [excit, meet, samsungmobileus, sxsw, show, spr...
11    [find, amp, start, impromptu, parti, sxsw, hur...
12    [foursquar, up, game, time, sxsw, http, j, mp,...
13    [gotta, love, sxsw, googl, calendar, featur, t...
14    [great, sxsw, ipad, app, madebymani, http, tin...
15    [haha, awesom, rad, ipad, app, madebymani, htt...
16    [holler, gram, ipad, itun, app, store, http, c...
17    [notic, dst, come, weekend, mani, iphon, user,...
18    [ad, sxsw, flight, plane, match, peopl, pl

In [40]:
# Flatten the token lists to one token per row and list the 50 most frequent tokens.
(
    data_df['preprocessed_text']
    .explode()
    .value_counts()
    .head(50)
)

sxsw         9116
mention      6861
link         4090
ipad         2935
rt           2925
googl        2508
appl         2187
quot         1588
iphon        1505
store        1437
new          1057
app          1002
austin        922
amp           803
launch        802
circl         654
social        637
today         566
android       565
pop           558
get           514
open          498
network       468
line          440
go            416
via           400
call          389
parti         387
free          378
mobil         345
sxswi         333
come          326
use           309
like          309
major         306
win           305
time          304
one           301
check         300
day           280
map           264
w             261
temporari     254
possibl       254
see           250
need          238
look          228
design        225
peopl         223
make          219
Name: preprocessed_text, dtype: int64

Looking at the total value counts, we can identify some other tokens/words that might be worth adding to our stopwords list:
- `sxsw` & `sxswi` / `austin` - tokens that refer to setting - all tweets in corpus seem to be pulled from a South by Southwest (SXSW) festival event.
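- `rt` / `mention` / `link` - tokens that refer to Twitter interactions (retweets, anonymized `@mention` handles, and `{link}` placeholders)
- `amp` / `quot` - leftovers of HTML entities (`&amp;`, and most likely `&quot;`) that were never unescaped in the raw text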

In [41]:
# Corpus-specific noise tokens to filter out in addition to NLTK's English
# stop words. Only words not already present are appended, so re-running this
# cell does not keep growing `sw` with duplicates.
# NOTE: `preprocessed_text` was computed before this cell, so it still
# contains these tokens until prepare_doc() is applied to the text again.
sw.extend([
    word
    for word in (
        'sxsw',
        'sxswi',
        'austin',
        'rt',
        'quot',
        'mention',
        'link',
        'amp',
    )
    if word not in sw
])

In [46]:
# Fit the encoder on the sentiment labels, then store the integer codes.
le = LabelEncoder().fit(data_df['emotion'])
data_df['target'] = le.transform(data_df['emotion'])

# Position in this array == encoded integer for that label.
le.classes_

array(['Negative emotion', 'No emotion toward brand or product',
       'Positive emotion'], dtype=object)

In [47]:
# Display the frame, now including the `preprocessed_text` and `target` columns.
data_df

Unnamed: 0,text,product_name,emotion,preprocessed_text,target
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,"[wesley, g, iphon, hrs, tweet, rise, austin, d...",0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,"[jessede, know, fludapp, awesom, ipad, iphon, ...",2
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,"[swonderlin, wait, ipad, also, sale, sxsw]",2
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,"[sxsw, hope, year, festiv, crashi, year, iphon...",0
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,"[sxtxstate, great, stuff, fri, sxsw, marissa, ...",2
...,...,...,...,...,...
8716,Ipad everywhere. #SXSW {link},iPad,Positive emotion,"[ipad, everywher, sxsw, link]",2
8717,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product,"[wave, buzz, rt, mention, interrupt, regular, ...",1
8718,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product,"[googl, zeiger, physician, never, report, pote...",1
8719,Some Verizon iPhone customers complained their...,,No emotion toward brand or product,"[verizon, iphon, custom, complain, time, fell,...",1
