In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import re
import nltk
from nltk.stem.porter import *
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn_pandas import DataFrameMapper, cross_val_score

ModuleNotFoundError: No module named 'wordcloud'

## Reading and exploring tweets

In [None]:
# Load the tweets from "Tweets.csv" (path is relative to the notebook's working directory).
tweets_df = pd.read_csv("Tweets.csv")

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
tweets_df.info()

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric ones.
tweets_df.describe(include='all')

In [None]:
# Peek at the last rows of the frame.
tweets_df.tail()

### Exploring categorical data

In [None]:
# Tally the tweets per sentiment class, then draw the same counts as a bar chart.
sentiment_counts = tweets_df['airline_sentiment'].value_counts()
print(sentiment_counts)

# NOTE: despite its name, `ax` holds whatever sns.catplot returns; its `.fig` is used below.
ax = sns.catplot(data=tweets_df, kind='count', x='airline_sentiment')
ax.fig.autofmt_xdate()

In [None]:
# Ten most frequent tweet locations.
tweets_df['tweet_location'].value_counts().head(10)

In [40]:
# Ten most frequent user time zones.
tweets_df['user_timezone'].value_counts().head(10)

None                           4740
Eastern Time (US & Canada)     3700
Central Time (US & Canada)     1881
Pacific Time (US & Canada)     1184
Quito                           712
Atlantic Time (Canada)          491
Mountain Time (US & Canada)     351
Arizona                         225
London                          187
Sydney                          107
Name: user_timezone, dtype: int64

In [41]:
# Ten accounts with the most tweets in the data set.
tweets_df['name'].value_counts().head(10)

JetBlueNews        63
kbosspotter        32
_mhertz            29
otisday            28
throthra           27
rossj987           23
GREATNESSEOA       22
scoobydoo9749      21
weezerandburnie    21
MeeestarCoke       20
Name: name, dtype: int64

## Tweets pre-processing

### Remove duplicates

In [42]:
# (rows, columns) before the de-duplication step that follows.
tweets_df.shape

(14366, 9)

In [43]:
# Sort by tweet text, then drop rows that are identical in every column while
# keeping one copy of each. The previous keep=False discarded *every* copy of a
# duplicated row, so a repeated tweet vanished from the data set entirely.
# The chained reassignment replaces the two inplace=True calls; the original
# index labels are preserved, as before.
tweets_df = tweets_df.sort_values("text").drop_duplicates(keep="first")

In [44]:
# (rows, columns) after the de-duplication step above.
tweets_df.shape

(14366, 9)

### Handling null values

In [45]:
# Print the shape of the sub-frame of rows whose value is missing, one column at a time
for column in ('name', 'text', 'tweet_location', 'user_timezone'):
    print(tweets_df[tweets_df[column].isnull()].shape)

(0, 9)
(0, 9)
(0, 9)
(0, 9)


In [46]:
# (rows, columns) before missing values are filled in.
tweets_df.shape

(14366, 9)

In [47]:
# Replace missing location / time-zone entries with the literal string 'None'
for column in ('tweet_location', 'user_timezone'):
    tweets_df[column] = tweets_df[column].fillna('None')

#columns = ['text', 'name', 'tweet_location', 'user_timezone']
#tweets_df = tweets_df.dropna(subset=columns)

In [48]:
# After filling, print the shape of the rows still missing a value, then the overall shape
for column in ('tweet_location', 'user_timezone'):
    print(tweets_df[tweets_df[column].isnull()].shape)
print(tweets_df.shape)

(0, 9)
(0, 9)
(14366, 9)


In [49]:
# Show one row whose location equals the 'None' placeholder string.
tweets_df.loc[tweets_df['tweet_location'] == 'None'].head(1)

Unnamed: 0,tweet_id,airline_sentiment,name,text,tweet_coord,tweet_created,tweet_location,user_timezone,tidy_tweet
4978,5.69587e+17,negative,danihampton,&lt;3 &lt;3 RT @SouthwestAir! @danihampton Sor...,,2/22/2015 11:58,,Arizona,sorri hear about wifi connect dani pleas your ...


### Removing Twitter Handles (@user) 

In [50]:
# '@' followed by any run of word characters, i.e. a Twitter handle (also matches a lone '@').
handle_pattern = r'@[\w]*'

# Start the cleaned column from the raw text with every handle removed.
tweets_df['tidy_tweet'] = tweets_df['text'].replace(to_replace=handle_pattern, value='', regex=True)
tweets_df.head()

Unnamed: 0,tweet_id,airline_sentiment,name,text,tweet_coord,tweet_created,tweet_location,user_timezone,tidy_tweet
7064,5.69943e+17,positive,SeanUppercut,"""LOL you guys are so on it"" - me, had this bee...",,2/23/2015 11:31,"Detroit, MI",Quito,"""LOL you guys are so on it"" - me, had this bee..."
7008,5.69959e+17,neutral,UAKShine,#Real RT @JetBlue: Our fleet's on fleek. http:...,,2/23/2015 12:34,Bushmansted,Quito,#Real RT : Our fleet's on fleek. http://t.co/E...
14272,5.69643e+17,negative,sarrraright,#nothelpful MT @AmericanAir: Our call volume i...,,2/22/2015 15:42,Cathedral Heights,Eastern Time (US & Canada),#nothelpful MT : Our call volume is extremely ...
4978,5.69587e+17,negative,danihampton,&lt;3 &lt;3 RT @SouthwestAir! @danihampton Sor...,,2/22/2015 11:58,,Arizona,&lt;3 &lt;3 RT ! Sorry to hear about the WiFi...
7092,5.69938e+17,neutral,MelechT,*On the brink of bankruptcy. “@JetBlue: Our fl...,,2/23/2015 11:14,DMV,Central Time (US & Canada),*On the brink of bankruptcy. “: Our fleet's on...


### Removing Punctuation, Numbers, and Special Characters

In [51]:
# Replace every character that is not an ASCII letter or '#' with a space.
# regex=True must be explicit: the pattern is a character class, and pandas >= 2.0
# treats the pattern as a literal string by default, which would silently leave
# every tweet unchanged (older versions emitted a FutureWarning instead).
tweets_df['tidy_tweet'] = tweets_df['tidy_tweet'].str.replace("[^a-zA-Z#]", " ", regex=True)

In [52]:
# Inspect tidy_tweet after the character clean-up.
tweets_df.head()

Unnamed: 0,tweet_id,airline_sentiment,name,text,tweet_coord,tweet_created,tweet_location,user_timezone,tidy_tweet
7064,5.69943e+17,positive,SeanUppercut,"""LOL you guys are so on it"" - me, had this bee...",,2/23/2015 11:31,"Detroit, MI",Quito,LOL you guys are so on it me had this bee...
7008,5.69959e+17,neutral,UAKShine,#Real RT @JetBlue: Our fleet's on fleek. http:...,,2/23/2015 12:34,Bushmansted,Quito,#Real RT Our fleet s on fleek http t co E...
14272,5.69643e+17,negative,sarrraright,#nothelpful MT @AmericanAir: Our call volume i...,,2/22/2015 15:42,Cathedral Heights,Eastern Time (US & Canada),#nothelpful MT Our call volume is extremely ...
4978,5.69587e+17,negative,danihampton,&lt;3 &lt;3 RT @SouthwestAir! @danihampton Sor...,,2/22/2015 11:58,,Arizona,lt lt RT Sorry to hear about the WiFi...
7092,5.69938e+17,neutral,MelechT,*On the brink of bankruptcy. “@JetBlue: Our fl...,,2/23/2015 11:14,DMV,Central Time (US & Canada),On the brink of bankruptcy Our fleet s on...


### Removing Short Words

In [53]:
# Keep only the words longer than three characters and re-join them with single spaces.
tweets_df['tidy_tweet'] = tweets_df['tidy_tweet'].map(
    lambda tweet: ' '.join(word for word in tweet.split() if len(word) > 3)
)

In [54]:
# Inspect tidy_tweet after the short words were dropped.
tweets_df.head()

Unnamed: 0,tweet_id,airline_sentiment,name,text,tweet_coord,tweet_created,tweet_location,user_timezone,tidy_tweet
7064,5.69943e+17,positive,SeanUppercut,"""LOL you guys are so on it"" - me, had this bee...",,2/23/2015 11:31,"Detroit, MI",Quito,guys this been months fleet fleek http LYcARlTFHl
7008,5.69959e+17,neutral,UAKShine,#Real RT @JetBlue: Our fleet's on fleek. http:...,,2/23/2015 12:34,Bushmansted,Quito,#Real fleet fleek http ERzht
14272,5.69643e+17,negative,sarrraright,#nothelpful MT @AmericanAir: Our call volume i...,,2/22/2015 15:42,Cathedral Heights,Eastern Time (US & Canada),#nothelpful call volume extremely high today a...
4978,5.69587e+17,negative,danihampton,&lt;3 &lt;3 RT @SouthwestAir! @danihampton Sor...,,2/22/2015 11:58,,Arizona,Sorry hear about WiFi connection Dani Please y...
7092,5.69938e+17,neutral,MelechT,*On the brink of bankruptcy. “@JetBlue: Our fl...,,2/23/2015 11:14,DMV,Central Time (US & Canada),brink bankruptcy fleet fleek http ldxn


### Tokenization

In [55]:
# Split each cleaned tweet on whitespace into a list of tokens.
tokenized_tweets = tweets_df['tidy_tweet'].map(str.split)

In [56]:
# Preview the token lists.
tokenized_tweets.head()

7064     [guys, this, been, months, fleet, fleek, http,...
7008                    [#Real, fleet, fleek, http, ERzht]
14272    [#nothelpful, call, volume, extremely, high, t...
4978     [Sorry, hear, about, WiFi, connection, Dani, P...
7092         [brink, bankruptcy, fleet, fleek, http, ldxn]
Name: tidy_tweet, dtype: object

### Stemming 

In [57]:
stemmer = PorterStemmer()

# Run every token of every tweet through the Porter stemmer.
tokenized_tweets = tokenized_tweets.map(
    lambda tokens: list(map(stemmer.stem, tokens))
)
tokenized_tweets.head()

7064     [guy, thi, been, month, fleet, fleek, http, ly...
7008                    [#real, fleet, fleek, http, erzht]
14272    [#nothelp, call, volum, extrem, high, today, a...
4978     [sorri, hear, about, wifi, connect, dani, plea...
7092         [brink, bankruptci, fleet, fleek, http, ldxn]
Name: tidy_tweet, dtype: object

In [58]:
# Glue each tweet's stemmed tokens back into one space-separated string.
# The result is a plain Python list, in the same order as the series it came from.
tokenized_tweets = list(map(' '.join, tokenized_tweets))

In [59]:
# Overwrite tidy_tweet with the re-joined stemmed tokens.
# tokenized_tweets is a plain list at this point, so values are assigned by position.
tweets_df['tidy_tweet'] = tokenized_tweets

In [60]:
# Inspect tidy_tweet after stemming.
tweets_df.head()

Unnamed: 0,tweet_id,airline_sentiment,name,text,tweet_coord,tweet_created,tweet_location,user_timezone,tidy_tweet
7064,5.69943e+17,positive,SeanUppercut,"""LOL you guys are so on it"" - me, had this bee...",,2/23/2015 11:31,"Detroit, MI",Quito,guy thi been month fleet fleek http lycarltfhl
7008,5.69959e+17,neutral,UAKShine,#Real RT @JetBlue: Our fleet's on fleek. http:...,,2/23/2015 12:34,Bushmansted,Quito,#real fleet fleek http erzht
14272,5.69643e+17,negative,sarrraright,#nothelpful MT @AmericanAir: Our call volume i...,,2/22/2015 15:42,Cathedral Heights,Eastern Time (US & Canada),#nothelp call volum extrem high today apolog p...
4978,5.69587e+17,negative,danihampton,&lt;3 &lt;3 RT @SouthwestAir! @danihampton Sor...,,2/22/2015 11:58,,Arizona,sorri hear about wifi connect dani pleas your ...
7092,5.69938e+17,neutral,MelechT,*On the brink of bankruptcy. “@JetBlue: Our fl...,,2/23/2015 11:14,DMV,Central Time (US & Canada),brink bankruptci fleet fleek http ldxn


## Exploring Tweets

## What are the common words?

In [61]:
# Word cloud built from every cleaned tweet
all_words = ' '.join(tweets_df['tidy_tweet'])
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(all_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

NameError: name 'WordCloud' is not defined

In [None]:
# Word cloud built from the cleaned tweets labelled 'positive'
is_positive = tweets_df['airline_sentiment'] == 'positive'
all_words = ' '.join(tweets_df.loc[is_positive, 'tidy_tweet'])

wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(all_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Word cloud built from the cleaned tweets labelled 'negative'
is_negative = tweets_df['airline_sentiment'] == 'negative'
all_words = ' '.join(tweets_df.loc[is_negative, 'tidy_tweet'])

wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(all_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

## Create test set

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
train_set, test_set = train_test_split(
    tweets_df,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Number of rows in the training split.
len(train_set)

In [None]:
# Number of rows in the held-out test split.
len(test_set)

## Feature engineering

In [None]:
# TF-IDF over 1- to 3-grams of the cleaned tweets, ignoring terms seen in fewer than 5 tweets.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 3), stop_words='english')

# Fit on the training split only: fitting on the whole of tweets_df (as this cell
# did before) lets the held-out test tweets shape the vocabulary and IDF weights.
# The result is left sparse, because densifying an n-gram matrix with .toarray()
# is costly and only the shape is inspected here.
features = tfidf.fit_transform(train_set['tidy_tweet'])
labels = train_set['airline_sentiment']
features.shape

## Building the model

### Vectorization

In [None]:
# One TF-IDF vectorizer per text column; only the tweet text gets English stop-word removal.
column_vectorizers = [
    ('tidy_tweet', TfidfVectorizer(stop_words='english', ngram_range=(1, 1), norm='l2')),
    ('name', TfidfVectorizer()),
    ('tweet_location', TfidfVectorizer()),
    ('user_timezone', TfidfVectorizer()),
]
mapper = DataFrameMapper(column_vectorizers)

# Fit the vectorizers on the training split and build its feature matrix.
features = mapper.fit_transform(train_set)

### Create Evaluation set

In [None]:
# Target labels for the training rows
categories = train_set['airline_sentiment']

# Carve an 80/20 train/evaluation split out of the training features (fixed seed).
x, x_eval, y, y_eval = train_test_split(
    features,
    categories,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Check Accuracy
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


def print_accuracy(model=None, features=None, targets=None):
    """Plot a confusion matrix and print accuracy plus a per-class report.

    Parameters
    ----------
    model : fitted classifier, optional
        Must provide ``predict`` and ``classes_``. Defaults to the
        notebook-level ``clf``.
    features : array-like, optional
        Inputs to predict on. Defaults to the notebook-level ``x_eval``.
    targets : array-like, optional
        True labels for ``features``. Defaults to the notebook-level ``y_eval``.

    Returns
    -------
    None
        The heatmap is shown and the scores are printed.
    """
    # Fall back to the notebook globals so existing `print_accuracy()` calls keep working.
    model = clf if model is None else model
    features = x_eval if features is None else features
    targets = y_eval if targets is None else targets

    y_pred = model.predict(features)

    # Use the classifier's own class order for the matrix, the tick labels and the report.
    # The labels used to come from tweets_df.airline_sentiment.unique(), which is
    # in order of first appearance, while confusion_matrix / classification_report
    # order classes by sorted label -- so rows, columns and report lines could be
    # attributed to the wrong sentiment.
    class_names = list(model.classes_)
    conf_mat = confusion_matrix(targets, y_pred, labels=class_names)

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(conf_mat, annot=True, fmt='d',
                xticklabels=class_names,
                yticklabels=class_names,
                ax=ax)
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    plt.show()

    print('Accuracy Score :', accuracy_score(targets, y_pred))
    print('')
    print(classification_report(targets, y_pred, labels=class_names,
                                target_names=[str(name) for name in class_names]))

### RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded random forest on the training part, then report via print_accuracy().
clf = RandomForestClassifier(random_state=0).fit(x, y)
print_accuracy()

### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier, then report via print_accuracy().
clf = MultinomialNB()
clf.fit(x, y)
print_accuracy()

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a seeded logistic regression, then report via print_accuracy().
clf = LogisticRegression(random_state=0)
clf.fit(x, y)
print_accuracy()

### SVM

In [None]:
from sklearn.svm import LinearSVC

# Fit a linear support-vector classifier with default settings, then report via print_accuracy().
clf = LinearSVC()
clf.fit(x, y)
print_accuracy()