# Twitter US Airline Sentiment Analysis

Given Twitter US Airline Sentiment Dataset, which contains data for over 14000 tweets,

your task:

  is to predict the sentiment of the tweet i.e. positive, negative or neutral.

You are given:
1. A Training dataset csv file with X train and Y train data
2. A X test File and you have to predict and submit predictions for this file.

In [1]:
# importing dataset
import pandas as pd
import numpy as np
import time

In [2]:
starting = time.time()

In [3]:
training_df = pd.read_csv("training_twitter_x_y_train.csv")
testing_df = pd.read_csv("test_twitter_x_test.csv")

In [4]:
training_df.head(3)

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)


In [5]:
testing_df.head(3)

Unnamed: 0,tweet_id,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,569682010270101504,American,,zsalim03,,0,@AmericanAir In car gng to DFW. Pulled over 1h...,,2015-02-22 18:15:50 -0800,Texas,Central Time (US & Canada)
1,569608307184242688,American,,sa_craig,,0,"@AmericanAir after all, the plane didn’t land ...",,2015-02-22 13:22:57 -0800,"College Station, TX",Central Time (US & Canada)
2,567879304593408001,Southwest,,DanaChristos,,1,@SouthwestAir can't believe how many paying cu...,,2015-02-17 18:52:31 -0800,CT,Eastern Time (US & Canada)


In [6]:
from nltk.tokenize import word_tokenize

In [7]:
x_train_data = training_df['text']
y_train_data = training_df['airline_sentiment']
x_test_data = testing_df['text']

In [8]:
x_y_train_documents = []
for i in range(len(x_train_data)):
    document = word_tokenize(x_train_data[i])
    category = y_train_data[i]
    x_y_train_documents.append((document, category))

In [9]:
x_test_documents = []
for i in range(len(x_test_data)):
    document = word_tokenize(x_test_data[i])
    x_test_documents.append((document))

In [10]:
print(x_y_train_documents[0])
print('----------------------------------------------------------------------')
print(x_test_documents[0])

(['@', 'SouthwestAir', 'I', 'am', 'scheduled', 'for', 'the', 'morning', ',', '2', 'days', 'after', 'the', 'fact', ',', 'yes', '..', 'not', 'sure', 'why', 'my', 'evening', 'flight', 'was', 'the', 'only', 'one', 'Cancelled', 'Flightled'], 'negative')
----------------------------------------------------------------------
['@', 'AmericanAir', 'In', 'car', 'gng', 'to', 'DFW', '.', 'Pulled', 'over', '1hr', 'ago', '-', 'very', 'icy', 'roads', '.', 'On-hold', 'with', 'AA', 'since', '1hr', '.', 'Ca', "n't", 'reach', 'arpt', 'for', 'AA2450', '.', 'Wat', '2', 'do', '?']


In [11]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
import string
from nltk import pos_tag
lemmatizer = WordNetLemmatizer()

In [12]:
def get_simple_pos(tag):
    if tag.startswith("J"):
        return wordnet.ADJ
    elif tag.startswith("v"):
        return wordnet.VERB
    elif tag.startswith("N"):
        return wordnet.NOUN
    elif tag.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [13]:
print(pos_tag(["better"]))
print(get_simple_pos("R"))

[('better', 'RBR')]
r


In [14]:
stop = stopwords.words("english")
punctuations = list(string.punctuation)
stop = stop + punctuations
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [15]:
def clean_review(words):     # array -> words
    output_words = []
    for w in words:
        if w.lower() not in stop:
            # lemmatize word
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, get_simple_pos(pos[0][1]))
            output_words.append(clean_word)
    return output_words

In [16]:
start = time.time()
training_documents = [(clean_review(document),category) for document,category in x_y_train_documents]
testing_documents = [(clean_review(document)) for document in x_test_documents]
end = time.time()
print(end - start)

249.95573449134827


In [17]:
print(training_documents[0:2])

[(['SouthwestAir', 'scheduled', 'morning', '2', 'day', 'fact', 'yes', '..', 'sure', 'evening', 'flight', 'one', 'Cancelled', 'Flightled'], 'negative'), (['SouthwestAir', 'seeing', 'worker', 'time', 'time', 'going', 'beyond', 'love', 'flying', 'guy', 'Thank'], 'positive')]


In [18]:
print(testing_documents[0:2])

[['AmericanAir', 'car', 'gng', 'DFW', 'Pulled', '1hr', 'ago', 'icy', 'road', 'On-hold', 'AA', 'since', '1hr', 'Ca', "n't", 'reach', 'arpt', 'AA2450', 'Wat', '2'], ['AmericanAir', 'plane', '’', 'land', 'identical', 'bad', 'condition', 'GRK', 'according', 'METARs']]


In [19]:
# let's create feature
all_words = []
for doc in training_documents:
    all_words += doc[0]

In [20]:
len(all_words)

120030

In [21]:
freq = nltk.FreqDist(all_words)
common = freq.most_common(5000)
common[:5]

[('united', 2928),
 ('flight', 2806),
 ('AmericanAir', 2208),
 ('USAirways', 2173),
 ('SouthwestAir', 1801)]

In [22]:
features = [i[0] for i in common]
len(features)

5000

In [23]:
# now we need cover list of word into one str
x_train_data = [" ".join(document) for document, category in training_documents]
y_train_data = [category for document, category in training_documents]
y_train_data = np.array(y_train_data)
x_test_data = [" ".join(document) for document in testing_documents]

In [24]:
print(x_train_data[0])

SouthwestAir scheduled morning 2 day fact yes .. sure evening flight one Cancelled Flightled


In [25]:
print(x_test_data[0])

AmericanAir car gng DFW Pulled 1hr ago icy road On-hold AA since 1hr Ca n't reach arpt AA2450 Wat 2


In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
# feature extraction
count_vec = CountVectorizer(max_features = 10000, ngram_range = (1,2), max_df = 0.7)
x_train_features = count_vec.fit_transform(x_train_data)
x_test_features = count_vec.transform(x_test_data)

In [39]:
x_train_features.shape, y_train_data.shape, x_test_features.shape

((10980, 10000), (10980,), (3660, 10000))

In [40]:
x_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [41]:
y_train_data

array(['negative', 'positive', 'positive', ..., 'negative', 'negative',
       'negative'], dtype='<U8')

In [42]:
count_vec.get_feature_names()

['00',
 '000',
 '000 foot',
 '000 mile',
 '0016',
 '00pm',
 '02',
 '03',
 '03 03',
 '05',
 '05pm',
 '10',
 '10 000',
 '10 15',
 '10 24',
 '10 30',
 '10 day',
 '10 hour',
 '10 hr',
 '10 min',
 '10 minute',
 '10 year',
 '100',
 '100 people',
 '1000',
 '10a',
 '10am',
 '10hrs',
 '10pm',
 '11',
 '11 30',
 '11 30pm',
 '11 hr',
 '11am',
 '11th',
 '12',
 '12 hour',
 '12 hr',
 '1230',
 '13',
 '13 hour',
 '130',
 '1359',
 '136',
 '13th',
 '14',
 '14 hour',
 '140',
 '140 character',
 '15',
 '15 hour',
 '15 min',
 '15 minute',
 '15 year',
 '150',
 '152',
 '1583',
 '15th',
 '16',
 '17',
 '18',
 '18 hour',
 '180',
 '1800',
 '1898',
 '19',
 '1hr',
 '1hr delay',
 '1k',
 '1k status',
 '1pm',
 '1st',
 '1st bag',
 '1st class',
 '1st flight',
 '1st time',
 '20',
 '20 flight',
 '20 min',
 '20 minute',
 '20 pbi',
 '20 people',
 '200',
 '200 change',
 '200 dollar',
 '200 fee',
 '200 take',
 '2000',
 '2012',
 '2014',
 '2015',
 '20min',
 '21',
 '21 feb',
 '21st',
 '22',
 '22 keep',
 '23',
 '24',
 '24 hour',
 

## Train Support Vector Machine model

In [43]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train_features, y_train_data)
svc.score(x_train_features, y_train_data)

0.916120218579235

## Train Random Forest model

In [44]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(x_train_features, y_train_data)
rfc.score(x_train_features, y_train_data)

0.9941712204007286

### Prediction

In [45]:
y_pred = rfc.predict(x_test_features)
y_pred

array(['negative', 'negative', 'negative', ..., 'neutral', 'positive',
       'negative'], dtype='<U8')

In [46]:
y_pred_df = pd.DataFrame(y_pred)
y_pred_df.head()

Unnamed: 0,0
0,negative
1,negative
2,negative
3,negative
4,negative


2. Submit a csv file with only predictions for X test data. File should not have any headers and should only have one column i.e. predictions. 

In [47]:
y_pred_df.to_csv("Y_Predicted.csv", header = False, index = False)

In [48]:
ending = time.time()
print( ending - starting)

433.11227536201477
