# Importing the Required Libraries

In [186]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from nltk import pos_tag
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Preparing Training Data

# Importing Training Data

In [187]:
df = pd.read_csv('twitter train.csv')

In [188]:
df = df[['text', 'airline_sentiment']]

In [189]:
df

Unnamed: 0,text,airline_sentiment
0,"@SouthwestAir I am scheduled for the morning, ...",negative
1,@SouthwestAir seeing your workers time in and ...,positive
2,@united Flew ORD to Miami and back and had gr...,positive
3,@SouthwestAir @dultch97 that's horse radish 😤🐴,negative
4,@united so our flight into ORD was delayed bec...,negative
...,...,...
10975,@AmericanAir followback,neutral
10976,@united thanks for the help. Wish the phone re...,positive
10977,@usairways the. Worst. Ever. #dca #customerser...,negative
10978,@nrhodes85: look! Another apology. DO NOT FLY ...,negative


In [190]:
features = df.iloc[:, :-1]
labels = df.iloc[:, -1]  

In [191]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.30, random_state=45)

In [192]:
print(train_features.describe())
print(train_labels.describe())
print(test_features.describe())
print(test_labels.describe())

                  text
count             7686
unique            7617
top     @united thanks
freq                 5
count         7686
unique           3
top       negative
freq          4800
Name: airline_sentiment, dtype: object
                       text
count                  3294
unique                 3281
top     @JetBlue thank you!
freq                      3
count         3294
unique           3
top       negative
freq          2051
Name: airline_sentiment, dtype: object


In [193]:
training_data = pd.concat([train_features, train_labels], axis=1)
training_data = training_data.values
training_data

array([["@JetBlue  I shouldn't have to find them, they should tell us. I've flown Jet Blue since your first month. The experience isn't what it was.",
        'negative'],
       ['@SouthwestAir is there a resource to check delays/Cancelled Flightlations out of Love Field? Flying out tomorrow am and stressed about weather! ❄️',
        'neutral'],
       ["@SouthwestAir I figured the streaming wouldn't work per the TOS but just the @NASCAR site is taking longer than 5 minutes to load",
        'negative'],
       ...,
       ['@SouthwestAir has some of the best airfare prices! Gotta LUV them :)',
        'positive'],
       ['@SouthwestAir even with the 50$ voucher for picking up my bag',
        'negative'],
       ['@united sat at airport for 5 hrs still sitting at gate..  Sigh',
        'negative']], dtype=object)

# Splitting the Tweet text into words using NLTK

In [194]:
tweets_train = []
for i in range(len(training_data)):
    tweets_train.append([word_tokenize(training_data[i][0]), training_data[i][1]]) 
tweets_train

[[['@',
   'JetBlue',
   'I',
   'should',
   "n't",
   'have',
   'to',
   'find',
   'them',
   ',',
   'they',
   'should',
   'tell',
   'us',
   '.',
   'I',
   "'ve",
   'flown',
   'Jet',
   'Blue',
   'since',
   'your',
   'first',
   'month',
   '.',
   'The',
   'experience',
   'is',
   "n't",
   'what',
   'it',
   'was',
   '.'],
  'negative'],
 [['@',
   'SouthwestAir',
   'is',
   'there',
   'a',
   'resource',
   'to',
   'check',
   'delays/Cancelled',
   'Flightlations',
   'out',
   'of',
   'Love',
   'Field',
   '?',
   'Flying',
   'out',
   'tomorrow',
   'am',
   'and',
   'stressed',
   'about',
   'weather',
   '!',
   '❄️'],
  'neutral'],
 [['@',
   'SouthwestAir',
   'I',
   'figured',
   'the',
   'streaming',
   'would',
   "n't",
   'work',
   'per',
   'the',
   'TOS',
   'but',
   'just',
   'the',
   '@',
   'NASCAR',
   'site',
   'is',
   'taking',
   'longer',
   'than',
   '5',
   'minutes',
   'to',
   'load'],
  'negative'],
 [['@',
   'America

Cleaning the Words using WordNetLemmatizer available in NLTK

In [195]:
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

In [196]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [197]:
lemmatizer = WordNetLemmatizer()
def clean_tweets(words):
    output_words = []
    for w in words:
        if w.isalpha():
            if w.lower() not in stops:
                pos = pos_tag([w])
                clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
                output_words.append(clean_word.lower())
    return output_words

In [198]:
for i in range(len(tweets_train)):
    tweets_train[i] = (clean_tweets(tweets_train[i][0]), tweets_train[i][1])

In [199]:
y_train = []
tweets = []
for tweet, sentiment in tweets_train:
    tweets.append(" ".join(tweet))
    y_train.append(sentiment)

# Using Count Vectorizer to get the X Train

In [200]:
count_vec = CountVectorizer(max_features=2000) # Tried using n grams but the accuracy was decreasing
x_train_features = count_vec.fit_transform(tweets)

# Preparing Testing Data

In [201]:
test_features

Unnamed: 0,text
7540,@SouthwestAir thank you so much completely mad...
3175,@VirginAmerica Is flight 0769 out of LGA to DF...
6320,@AmericanAir well Done all of you xx
6745,@JetBlue @weepysweetmonty i heard youre planni...
581,"@united, if you mean beyond opaque ""maintenanc..."
...,...
9897,@USAirways Thank you!!! On our way to get her ...
3588,@SouthwestAir from groupA to group C. We have ...
7218,@AmericanAir Rebooked for tomorrow morning. Ne...
2724,@united my son is a passenger on flight 3710 f...


In [202]:
testing_data = np.array(test_features['text'])
testing_data

array(['@SouthwestAir thank you so much completely made things right!',
       '@VirginAmerica Is flight 0769 out of LGA to DFW on time?',
       '@AmericanAir well Done all of you xx', ...,
       '@AmericanAir Rebooked for tomorrow morning. Never been here - not sure what I can see before tomorrow morning!',
       '@united my son is a passenger on flight 3710 from Chicago to Toronto. The plane came within feet of colliding  another plane and is stopped.',
       '@united hey I left my favorite blanket on the plane bring it back home to me:-('],
      dtype=object)

In [203]:
tweets_test = []
for t in testing_data:
    t = clean_tweets(word_tokenize(t))
    tweets_test.append(" ".join(t))

In [204]:
tweets_test

['southwestair thank much completely make thing right',
 'virginamerica flight lga dfw time',
 'americanair well done xx',
 'jetblue weepysweetmonty heard youre planning let people fly overhead compartment would',
 'united mean beyond opaque maintenance issue yes would basic customer service ewr staff ca manage',
 'jetblue want fly storm right give u choice kid want chance nixchangefees',
 'southwestair hold hour minute cust service help',
 'jetblue sure email screenshot link http http',
 'southwestair min away',
 'usairways right end sight',
 'united miss amaze u bank work conference',
 'southwestair landed hour late flight ind den min late flightr bag cool',
 'jetblue pretty nice flight credit automatically give flight also wish lounge could sleep',
 'united appreciate sentiment able get ground still miss connection',
 'usairways told return call human',
 'southwestair thanks good air side caution',
 'jetblue oh yeah great flight mexico wonderful crew thank',
 'united say would need 

In [205]:
x_test_features = count_vec.transform(tweets_test)

# Performing Classification

# Support Vector Machine

In [206]:
svc = SVC()
svc.fit(x_train_features, y_train)

In [207]:
y_pred_svm = svc.predict(x_test_features)

In [208]:
accuracy_svm = accuracy_score(test_labels, y_pred_svm)

In [209]:
accuracy_svm

0.7738312082574378

In [210]:
df = pd.DataFrame(y_pred_svm)
df.to_csv('predictions_svm.csv', index = False, header = False)

# Random Forest

In [211]:
rf = RandomForestClassifier()
rf.fit(x_train_features, y_train)

In [212]:
y_pred_rf = rf.predict(x_test_features)

In [213]:
accuracy_rf = accuracy_score(test_labels, y_pred_rf)
accuracy_rf

0.7544019429265331

In [214]:
df = pd.DataFrame(y_pred_rf)
df.to_csv('predictions_rf.csv', index = False, header = False)

# Multinomial Naive Bayes

In [215]:
mnv = MultinomialNB(alpha = 1)
mnv.fit(x_train_features, y_train)

In [216]:
y_pred_mnv = mnv.predict(x_test_features)

In [217]:
accuracy_mnv = accuracy_score(test_labels, y_pred_mnv)
accuracy_mnv

0.7577413479052824

In [218]:
df = pd.DataFrame(y_pred_mnv)
df.to_csv('predictions_mnv.csv', index = False, header = False)

# Descision Tree

In [219]:
dt = tree.DecisionTreeClassifier()
dt.fit(x_train_features, y_train)

In [220]:
y_pred_dt = dt.predict(x_test_features)

In [221]:
accuracy_dt = accuracy_score(test_labels, y_pred_dt)
accuracy_dt

0.678809957498482

In [222]:
df = pd.DataFrame(y_pred_dt)
df.to_csv('predictions_dt.csv', index = False, header = False)