## US Airlines Twitter Sentiment Analysis

We need to categorize the reviews into three:
* Positive
* Negative
* Neutral

We'll be using NLP to preprocess the reviews provided and build a feature set that is compatible with Sklearn using CountVectorizer or TFIDF Vecotrizer, followed by application of different classification algorithms on the dataset, and then pick the best one.

In [96]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import string
from nltk import NaiveBayesClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB

In [5]:
training_documents = pd.read_csv('training.csv')
testing_documents = pd.read_csv('testing.csv')

In [8]:
training_documents.shape

(10980, 12)

In [9]:
testing_documents.shape

(3660, 11)

In [10]:
training_documents.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [16]:
train_reviews = training_documents.iloc[:, 7]
train_reviews.head()

0    @SouthwestAir I am scheduled for the morning, ...
1    @SouthwestAir seeing your workers time in and ...
2    @united Flew ORD to Miami and back and  had gr...
3       @SouthwestAir @dultch97 that's horse radish 😤🐴
4    @united so our flight into ORD was delayed bec...
Name: text, dtype: object

In [18]:
train_reviews.shape

(10980,)

In [19]:
train_labels = training_documents.iloc[:, 1]
train_labels.head()

0    negative
1    positive
2    positive
3    negative
4    negative
Name: airline_sentiment, dtype: object

In [20]:
train_labels.shape

(10980,)

In [24]:
test_reviews = testing_documents.iloc[:, 6]
test_reviews.head()

0    @AmericanAir In car gng to DFW. Pulled over 1h...
1    @AmericanAir after all, the plane didn’t land ...
2    @SouthwestAir can't believe how many paying cu...
3    @USAirways I can legitimately say that I would...
4    @AmericanAir still no response from AA. great ...
Name: text, dtype: object

In [25]:
test_reviews.shape

(3660,)

In [32]:
stops = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stops.update(punctuation)
stops

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [33]:
# Map pos tag list to simple pos for lemmatizer
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [35]:
lemmatizer = WordNetLemmatizer()

In [40]:
def clean_review(review):
    words = word_tokenize(review)
    output_words = []
    for w in words:
        # Remove stop words
        if w.lower() not in stops:
            # Lemmatize words
            # Get pos of the word
            pos = pos_tag([w])
            # Get root word on basis of pos
            clean_word = lemmatizer.lemmatize(w, pos=get_simple_pos(pos[0][1]))
            # Add clean word to clean review
            output_words.append(clean_word)
    return " ".join(output_words)

In [None]:
# Update train reviews with clean version
train_reviews = [clean_review(review) for review in train_reviews]

In [46]:
train_reviews[0]

'SouthwestAir schedule morning 2 day fact yes..not sure even flight one Cancelled Flightled'

In [47]:
# Update test reviews with clean version
test_reviews = [clean_review(review) for review in test_reviews]

In [48]:
test_reviews[0]

"AmericanAir car gng DFW Pulled 1hr ago icy road On-hold AA since 1hr Ca n't reach arpt AA2450 Wat 2"

In [76]:
count_vec = CountVectorizer(max_features=2000, max_df=0.8, ngram_range=(1,2))

In [77]:
train_reviews_ = count_vec.fit_transform(train_reviews)

In [78]:
train_reviews_.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [79]:
count_vec.get_feature_names()

['00',
 '000',
 '10',
 '10 hour',
 '10 min',
 '10 minute',
 '100',
 '11',
 '12',
 '12 hour',
 '13',
 '14',
 '15',
 '15 min',
 '15 minute',
 '150',
 '16',
 '17',
 '18',
 '19',
 '1hr',
 '1k',
 '1st',
 '1st class',
 '20',
 '20 min',
 '20 minute',
 '200',
 '2015',
 '21',
 '22',
 '23',
 '24',
 '24 hour',
 '24hrs',
 '25',
 '25 min',
 '26',
 '27',
 '2hrs',
 '2nd',
 '2nd time',
 '2x',
 '30',
 '30 min',
 '30 minute',
 '300',
 '32',
 '35',
 '36',
 '3rd',
 '40',
 '40 min',
 '40 minute',
 '400',
 '45',
 '45 min',
 '45 minute',
 '50',
 '50 min',
 '500',
 '60',
 '70',
 '700',
 '728',
 '75',
 '7am',
 '80',
 '800',
 '800 number',
 '90',
 '90 min',
 'a320',
 'aa',
 'able',
 'able get',
 'absolute',
 'absolutely',
 'absurd',
 'accept',
 'acceptable',
 'access',
 'accommodate',
 'account',
 'acct',
 'act',
 'actual',
 'actually',
 'add',
 'additional',
 'address',
 'admiral',
 'advantage',
 'advise',
 'advisory',
 'afford',
 'afternoon',
 'age',
 'agent',
 'agent help',
 'agent say',
 'ago',
 'ah',
 'ahe

In [80]:
test_reviews_ = count_vec.transform(test_reviews)

In [81]:
test_reviews_.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [83]:
rfc = RandomForestClassifier()
rfc

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [89]:
rfc.fit(train_reviews_, train_labels)
y_pred1 = rfc.predict(test_reviews_)
y_pred1

array(['negative', 'negative', 'negative', ..., 'neutral', 'positive',
       'negative'], dtype=object)

In [91]:
np.savetxt('rf.csv', y_pred1, delimiter=",", fmt="%s")

In [106]:
lr = LogisticRegression(C=0.5)
lr

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [107]:
lr.fit(train_reviews_, train_labels)
y_pred2 = lr.predict(test_reviews_)
y_pred2

array(['negative', 'negative', 'negative', ..., 'neutral', 'positive',
       'negative'], dtype=object)

In [108]:
np.savetxt('lr.csv', y_pred2, delimiter=',', fmt="%s")

In [97]:
svc = SVC()
svc

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [98]:
svc.fit(train_reviews_, train_labels)
y_pred3 = svc.predict(test_reviews_)
y_pred3



array(['negative', 'negative', 'negative', ..., 'negative', 'negative',
       'negative'], dtype=object)

In [99]:
np.savetxt('svm.csv', y_pred3, delimiter=',', fmt="%s")

In [100]:
nb = MultinomialNB()
nb

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [101]:
nb.fit(train_reviews_, train_labels)
y_pred4 = nb.predict(test_reviews_)
y_pred4

array(['negative', 'negative', 'negative', ..., 'neutral', 'positive',
       'neutral'], dtype='<U8')

In [102]:
np.savetxt('nb.csv', y_pred4, delimiter=',', fmt="%s")