In [134]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
import re
import numpy as np

In [135]:
tweets_with_originals = pd.read_csv('tweets_with_originals.csv', parse_dates = ['tweet_created'])

In [136]:
tweets_with_originals.columns

Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text_proc', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone', 'text_orig', 'std_user_timezone',
       'text_clean'],
      dtype='object')

In [137]:
tweets_with_originals.columns
tweets_with_originals['text_clean'] = tweets_with_originals.text_clean.str.replace(r'@(\w+)([\s.,:;!])?', r'\1 ') # remove @s in @mentions

In [138]:
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
import string
#stopwords = stopwords.union(set([i for i in string.punctuation])) # remove punctuation
#stopwords = stopwords.union(set(['AmericanAir', 'United', 'USAirways', 'JetBlue', 'SouthwestAir', 'Delta', 'VirginAmerica']))

In [139]:
tweets_with_originals.loc[55].text_clean

"VirginAmerica hi! i'm so excited about your $99 LGA->DAL deal- but i've been trying 2 book since last week & the page never loads. thanks!"

# TF-IDF to identify additional stopwords 

Using TF-IDF, we'll look for additional stopwords 

In [140]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words = stopwords, strip_accents = 'unicode', min_df = 10)
tweet_input = tweets_with_originals.text_clean
# maybe try to take out numbers, prices 
tweet_input=tweet_input.str.replace(r" (\d|\W)+","") # remove digits, nonword things
T = tfidf_vectorizer.fit_transform(tweet_input) # these are our features

In [141]:
idf_df = pd.DataFrame(index = tfidf_vectorizer.get_feature_names())
idf_df['idf_vals']= tfidf_vectorizer.idf_
idf_df.describe()

Unnamed: 0,idf_vals
count,1621.0
mean,7.121038
std,0.961845
min,2.420606
25%,6.609403
50%,7.360819
75%,7.871644
max,8.181799


The minimum inverse document frequency is not so low. For our first pass I won't add any stopwords based on IDF values. 

We can use this array of tfidf vectors as input features... for testing purposes we'll also allow 2-grams and 3-grams to see if this improves the result. 

In [142]:
tf2_vectorizer = TfidfVectorizer(stop_words = stopwords, strip_accents = 'unicode', min_df = 10, ngram_range = (1,2))
tf3_vectorizer = TfidfVectorizer(stop_words = stopwords, strip_accents = 'unicode', min_df = 10, ngram_range = (1,3))
T_2 = tf2_vectorizer.fit_transform(tweet_input) 
T_3 = tf3_vectorizer.fit_transform(tweet_input)

# Word2Vec Model 

First we'll train a word2vec model on our tweets and see what sorts of words are grouped together. 

In [176]:
from nltk.tokenize.casual import TweetTokenizer
from gensim.models.word2vec import Word2Vec
from gensim.test.utils import common_texts, get_tmpfile
path = get_tmpfile('airlinetweet_word2vec.model')
tokenizer = TweetTokenizer(preserve_case=False)
tokenized_input = [tokenizer.tokenize(tweet) for tweet in tweet_input.values]
tokenized_input = [[word for word in tweet if word not in string.punctuation] for tweet in tokenized_input]
model = Word2Vec(tokenized_input, window=20, min_count=10, workers=2) # build vocabulary
model.save('airlinetweet_word2vec.model')
model.train(tokenized_input, total_examples=len(tokenized_input), epochs=10)

(1443050, 2335470)

The results look ok, airlines are associated with other airlines...

In [177]:
model.wv.most_similar('united')

[('southwestair', 0.5815246105194092),
 ('jetblue', 0.5388919711112976),
 ('americanair', 0.4240928888320923),
 ('claim', 0.3698948919773102),
 ('that', 0.3682762086391449),
 ('virginamerica', 0.35809794068336487),
 ('located', 0.3502815365791321),
 ('unitedi', 0.3233298361301422),
 ('bag', 0.31977206468582153),
 ('fixed', 0.31066328287124634)]

In [178]:
model.wv.most_similar('usairways')

[('americanair', 0.557052493095398),
 ('jetblue', 0.43893134593963623),
 ('usairwayshours', 0.38478702306747437),
 ('fordays', 0.3600230813026428),
 ('usairwaysi', 0.35849136114120483),
 ('reservations', 0.35133421421051025),
 ('inhours', 0.35006681084632874),
 ('andhours', 0.34855103492736816),
 ('and', 0.3438708186149597),
 ('long', 0.3432450592517853)]

And TSA with security check type things such as passes (probably "boarding pass"), ids, etc. 

In [179]:
model.wv.most_similar('tsa')

[('pre', 0.8186845183372498),
 ('correct', 0.779934823513031),
 ('laptop', 0.7408335208892822),
 ('file', 0.7391449213027954),
 ('id', 0.7343534231185913),
 ('tracking', 0.7276106476783752),
 ('name', 0.7256952524185181),
 ('enter', 0.7245402932167053),
 ('mobile', 0.7213053107261658),
 ('clearly', 0.7211697101593018)]

# Sentiment Analysis

In [181]:
# test split with oversampling ... try SMOTE 
#from imblearn.over_sampling import SMOTE

## Method 1: TF-IDF output

## Method 2: Doc2Vec
(we'll use the gensim implementation)

Doc2Vec learns "document" vectors instead of word vectors. 

In [169]:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
sentiment_tweets_docs = [TaggedDocument(tokenized_input[i], str(tweets_with_originals.tweet_id.iloc[i])) for i in range(tweet_input.shape[0])]

In [170]:
model_doc2vec = Doc2Vec(size=300, min_count=0)
model_doc2vec.build_vocab(sentiment_tweets_docs)
model_doc2vec.save('airplane')

