# Data:  airline sentiment kaggle dataset 
## Purpose: train model solely using the text to classify a tweet as positive neutral negative sentiment. Report findings with classification report.

In [1]:
import pandas as pd
import numpy as np
import sqlite3

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer    

In [2]:

# Read sqlite query results into a pandas DataFrame
con = sqlite3.connect("./database.sqlite")
df = pd.read_sql_query("SELECT * from Tweets", con)

# verify that result of SQL query is stored in the dataframe
con.close()

In [3]:
df.replace('',np.nan, inplace = True)
df.head(2)


Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567588278875213824,neutral,1.0,,,Delta,,JetBlueNews,,0,@JetBlue's new CEO seeks the right balance to ...,,2015-02-16 23:36:05 -0800,USA,Sydney
1,567590027375702016,negative,1.0,Can't Tell,0.6503,Delta,,nesi_1992,,0,@JetBlue is REALLY getting on my nerves !! 😡�...,,2015-02-16 23:43:02 -0800,undecided,Pacific Time (US & Canada)


In [4]:
# clean text of punctuation, url, stopwords    

#stops = set(stopwords.words("english")) 


def clean_text(text):

    text = text.encode('utf-8')
    text = re.sub('https(.)*', '', text, flags=re.MULTILINE)
    text = re.sub('http','', text)
    text = re.sub("[^a-zA-Z]", " ",text)
    text = re.sub(r'[^\x00-\x7F]+',' ', text)
    return text
df.text = df.text.apply(lambda val: clean_text(val))

In [5]:
df.text[0:10]

0     JetBlue s new CEO seeks the right balance to ...
1     JetBlue is REALLY getting on my nerves       ...
2     united yes  We waited in line for almost an h...
3     united the we got into the gate at IAH on tim...
4     SouthwestAir its cool that my bags take a bit...
5     united and don t hope for me having a nicer f...
6     united I like delays less than you because I ...
7     united  link to current status of flights air...
8     SouthwestAir you guys there  Are we on hour  ...
9     united I tried   DM it would not go thru    n...
Name: text, dtype: object

In [6]:
from sklearn.feature_extraction.text import CountVectorizer        
from nltk.stem.porter import PorterStemmer

#######
# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remove non letters
    # tokenize
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems
######## 

In [7]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/Stav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
print "Creating the bag of words...\n"
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   
                             tokenizer = tokenize,  
                             preprocessor = None,
                             lowercase = True,
                             stop_words = 'english',   
                             max_features = 1000) 

data_features = vectorizer.fit_transform(df.text)
data_features = data_features.toarray()

vocab = vectorizer.get_feature_names()
print vocab[0:5]

Creating the bag of words...

[u'aa', u'abl', u'abov', u'absolut', u'accept']


In [9]:
import numpy as np

# Sum up the counts of each vocabulary word
dist = np.sum(data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set

word_count_tuple = []
for tag, count in zip(vocab, dist):
    word_count_tuple.append((tag,count))

In [10]:
bag_words_df = pd.DataFrame(word_count_tuple)
bag_words_df.head()

Unnamed: 0,0,1
0,aa,279
1,abl,118
2,abov,17
3,absolut,52
4,accept,54


In [11]:
len(data_features)

14485

In [68]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_features, df.airline_sentiment, test_size = .33)

print X_train.shape, y_train.shape, X_test.shape, y_test.shape

(9704, 1000) (9704,) (4781, 1000) (4781,)


In [32]:
#pd.DataFrame({'importance':forest.feature_importances_, 'vocab':vocab}).sort_values('importance', ascending = False ).set_index('vocab')[0:20].plot(kind = 'bar')

In [36]:
print "gradient boosting..."
from sklearn.ensemble import GradientBoostingClassifier

# Initialize a Random Forest classifier with 100 trees
GBC = GradientBoostingClassifier(loss = 'deviance', learning_rate=.1) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
GBC = GBC.fit(X_train, y_train) 


y_pred = GBC.predict(X_test)

print y_train.value_counts()
print y_test.value_counts()

from sklearn.metrics import classification_report
cls_rep = classification_report(y_test, y_pred)
print cls_rep

gradient boosting...
negative    6083
neutral     2025
positive    1596
Name: airline_sentiment, dtype: int64
negative    2999
neutral     1044
positive     738
Name: airline_sentiment, dtype: int64
             precision    recall  f1-score   support

   negative       0.75      0.94      0.83      2999
    neutral       0.66      0.27      0.38      1044
   positive       0.71      0.58      0.64       738

avg / total       0.72      0.74      0.70      4781



In [37]:
print "logistic regression..."
from sklearn.linear_model import LogisticRegression
# Initialize a Random Forest classifier with 100 trees
logreg = LogisticRegression(penalty='l2')

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
logreg = logreg.fit(X_train, y_train) 


y_pred = logreg.predict(X_test)

print y_train.value_counts()
print y_test.value_counts()

from sklearn.metrics import classification_report
cls_rep = classification_report(y_test, y_pred)
print cls_rep

logistic regression...
negative    6083
neutral     2025
positive    1596
Name: airline_sentiment, dtype: int64
negative    2999
neutral     1044
positive     738
Name: airline_sentiment, dtype: int64
             precision    recall  f1-score   support

   negative       0.83      0.90      0.86      2999
    neutral       0.63      0.52      0.57      1044
   positive       0.72      0.65      0.68       738

avg / total       0.77      0.78      0.77      4781



In [39]:
print "gaussian nb..."
from sklearn.naive_bayes import GaussianNB

# Initialize a Random Forest classifier with 100 trees
GNB = GaussianNB() 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
GNB = GNB.fit(X_train, y_train) 


y_pred = GBC.predict(X_test)

print y_train.value_counts()
print y_test.value_counts()

from sklearn.metrics import classification_report
cls_rep = classification_report(y_test, y_pred)
print cls_rep

gaussian nb...
negative    6083
neutral     2025
positive    1596
Name: airline_sentiment, dtype: int64
negative    2999
neutral     1044
positive     738
Name: airline_sentiment, dtype: int64
             precision    recall  f1-score   support

   negative       0.75      0.94      0.83      2999
    neutral       0.66      0.27      0.38      1044
   positive       0.71      0.58      0.64       738

avg / total       0.72      0.74      0.70      4781



In [50]:
GBC.predict_proba(X_test)[:,2]

array([ 0.03069438,  0.0590613 ,  0.72789674, ...,  0.56672755,
        0.08941444,  0.06930564])

In [30]:
print "random forest..."
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import time 
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [ 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"],
             "n_jobs":[-1]}

# run grid search
# run randomized search

n_iter_search = 20

clf = RandomForestClassifier()

forest = GridSearchCV(clf, param_grid=param_grid)

from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
#forest = RandomForestClassifier(n_estimators = 500, criterion='gini', n_jobs=-1) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit(X_train, y_train) 


y_pred = forest.best_estimator_.predict(X_test)

print y_train.value_counts()
print y_test.value_counts()

from sklearn.metrics import classification_report
cls_rep = classification_report(y_test, y_pred)
print cls_rep



random forest...
negative    6083
neutral     2025
positive    1596
Name: airline_sentiment, dtype: int64
negative    2999
neutral     1044
positive     738
Name: airline_sentiment, dtype: int64
             precision    recall  f1-score   support

   negative       0.79      0.93      0.85      2999
    neutral       0.64      0.43      0.52      1044
   positive       0.74      0.53      0.62       738

avg / total       0.75      0.76      0.74      4781

