# Sentiment analysis

Predicting Tweet sentiment towards airlines.<br>
data from: https://www.kaggle.com/welkin10/airline-sentiment

In [1]:
import numpy as np
import pandas as pd
import nltk
import string
import re

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/raphael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

### Data

In [4]:
data = pd.read_csv('Tweets.csv')
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [5]:
data['airline_sentiment'].value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

### Preprocessing

In [6]:
# separate based on review
pos = data[data['airline_sentiment']=='positive']
neg = data[data['airline_sentiment']=='negative']
neut = data[data['airline_sentiment']=='neutral']

In [7]:
# balance dataset
neg = neg.iloc[np.random.choice(np.arange(len(neg)),2363,False)]
neut = neut.iloc[np.random.choice(np.arange(len(neut)),2363,False)]
pos = pos

In [8]:
# join
clean = pd.concat([neg,pos,neut])
# keep relevant columns
clean = clean[['text','airline_sentiment']]
# shuffle
clean = clean.sample(frac=1)
# rename labels
clean['sentiment'] = clean['airline_sentiment'].map({'positive':1,'negative':-1,'neutral':0})
clean.drop('airline_sentiment',inplace=True,axis=1)

In [10]:
# sanity check
clean['sentiment'].value_counts()

-1    2363
 1    2363
 0    2363
Name: sentiment, dtype: int64

In [11]:
# check stopwords
' '.join(stopwords.words('english'))

"i me my myself we our ours ourselves you you're you've you'll you'd your yours yourself yourselves he him his himself she she's her hers herself it it's its itself they them their theirs themselves what which who whom this that that'll these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don don't should should've now d ll m o re ve y ain aren aren't couldn couldn't didn didn't doesn doesn't hadn hadn't hasn hasn't haven haven't isn isn't ma mightn mightn't mustn mustn't needn needn't shan shan't shouldn shouldn't wasn wasn't weren weren't won won't wouldn wouldn't"

In [12]:
# keeping negated words is probably important for sentiment analysis
' '.join(stopwords.words('english')[:116])

"i me my myself we our ours ourselves you you're you've you'll you'd your yours yourself yourselves he him his himself she she's her hers herself it it's its itself they them their theirs themselves what which who whom this that that'll these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such"

In [13]:
stop = stopwords.words('english')[:116]

In [14]:
# It seems to me that ' and - are important for better text interpretation
# @ are for the hashtags
punc = re.sub("'|-|@",'',string.punctuation)

In [15]:
stem = PorterStemmer()

In [16]:
# process text line helper method
def process(string):
    # lower case
    s = string.lower()
    # remove punctuation
    s = s.translate(str.maketrans('','',punc))
    # make list of words
    s = s.split()
    # remove stop words and stem
    s = [stem.stem(w) for w in s if w not in stop]
    return s

In [17]:
# process all tweets
clean['text'] = clean['text'].map(process)

In [18]:
# extract all unique words
words = {}
for text in clean['text']:
    for w in text:
        words[w] = words.get(w,0) + 1
        
# remove words that only appear once, they are probably not very predictive
words = {k:v for (k,v) in words.items() if v>1}

In [19]:
# generate index map (to allow hash search below)
idx = {i:j for (j,i) in enumerate(words)}

In [20]:
# build sparse matrix for counts
X = np.zeros((len(clean),len(words)),dtype=int)

In [21]:
# populare array
i = 0
for text in clean['text']:
    for w in text:
        if (w in words):
            X[i,idx[w]]+=1
    i+=1

In [24]:
y = clean['sentiment'].values

In [26]:
# sanity check
X.shape,y.shape

((7089, 3497), (7089,))

In [27]:
# since X and y are already shuffled, this split will work fine
X_train = X[:5000]
y_train = y[:5000]
X_test = X[5000:]
y_test = y[5000:]

### Fitting and predictions

For faster execution, I am using sklearn here

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [29]:
forest = RandomForestClassifier(n_estimators=100)

In [30]:
forest.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [31]:
forest.score(X_test,y_test)

0.6936333173767353

In [32]:
pred = forest.predict(X_test)
confusion_matrix(pred,y_test)

array([[462, 104,  63],
       [166, 486, 149],
       [ 64,  94, 501]])

Good enough for such a simple analysis