#**Twitter Sentiment Analysis : US Airline**

## **Importing requires libraries**

In [1]:
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk

from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords,wordnet
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')
nltk.download('stopwords') 
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shivanshsharma/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shivanshsharma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shivanshsharma/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shivanshsharma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## **Reading the training and testing csv files**

In [3]:
training = pd.read_csv("training_twitter_x_train.csv")
testing = pd.read_csv("twitter_x_test.csv")
training.shape, testing.shape

((10980, 12), (3660, 11))

## **Cleaning up the training data**

In [4]:
df_train = training.copy()
df_test = testing.copy()
Negtive = len(df_train.airline_sentiment[df_train.airline_sentiment == 'negative'])
Positive = len(df_train.airline_sentiment[df_train.airline_sentiment == 'positive'])
Neutral = len(df_train.airline_sentiment[df_train.airline_sentiment == 'neutral'])
Positive,Negtive,Neutral

(1802, 6851, 2327)

### **Exracting the tweets and sentiments of training data**

In [5]:
training_tweets=df_train['text'].values
sentiments=df_train['airline_sentiment'].values

In [6]:
training_tweets.shape,sentiments.shape

((10980,), (10980,))

### **Tokenizing the each tweet and extracting the sentiments**


In [7]:
train_docs=[]
for i in range(len(training_tweets)):
    train_docs.append((word_tokenize(training_tweets[i]),sentiments[i]))

In [8]:
# data is in tuple form with 1 list of tokenize words and other with sentiments
train_docs[0]

(['@',
  'SouthwestAir',
  'I',
  'am',
  'scheduled',
  'for',
  'the',
  'morning',
  ',',
  '2',
  'days',
  'after',
  'the',
  'fact',
  ',',
  'yes',
  '..',
  'not',
  'sure',
  'why',
  'my',
  'evening',
  'flight',
  'was',
  'the',
  'only',
  'one',
  'Cancelled',
  'Flightled'],
 'negative')

In [9]:
# Funtion for converting pos_tag into the form feasible for lemmatizer
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

### **Creating a list of stopwords and punctuations**

In [10]:
# Creating list which  
stops=set(stopwords.words('english'))
punctuations=list(string.punctuation)
stops.update(punctuations)

In [11]:
# object for lemmatizer
lemmatizer = WordNetLemmatizer()

In [12]:
# function to clean each tweets
def clean_tweet(words):

    # Created empty a list to store the clean tweet words
    output_words=[]

    # traversing through each word 
    # Ignoring the stopwords and punctuations
    # Checking whether the word contains only alphabets or not

    for w in words:
        if w.lower() not in stops and w.lower().isalpha():
          
            # calculating pos tag and applying lemmatizer 
            pos=pos_tag([w])
            clean_word=lemmatizer.lemmatize(w,pos=get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words


In [13]:
# Applying the clean_tweet function on training data
train_docs = [(clean_tweet(tweet),sentiment) for tweet,sentiment in train_docs ]

In [14]:
# Joining the words of clean tweets
training_tweet_docs =[" ".join(tweet) for tweet,sentiment in train_docs]
# First 20 cleaned tweets of training data
training_tweet_docs[0:20]

['southwestair schedule morning day fact yes sure even flight one cancelled flightled',
 'southwestair see worker time time go beyond love fly guy thank',
 'united flew ord miami back great crew service leg thanks',
 'southwestair horse radish',
 'united flight ord delayed air force one last flight sbn min land',
 'united load u fly sardine knew pilot hour late flight incompetent beyond belief',
 'jetblue stock response delays frustrate poor cust serv amp told ppl wait amp come back',
 'jetblue nice hoping rack enough mile take trip seattle enjoy perfect latte city coffee',
 'united frankly bad customer service ever problems happen deal defines company never united',
 'southwestair yeah haha never one expensive much fun destinationdragons',
 'southwestair gt dca flight almost full people screw cancelled flightation united usairways cancelled flight',
 'jetblue easy way get ticket receipt get one check get one online thanks',
 'usairways love change lounge cheese veggie olive addition c

### **Cleaning up the testing data**

In [15]:
testing_tweets=df_test['text'].values

### **Tokenizing the each tweet**



In [16]:
testing_docs=[]
for i in range(len(testing_tweets)):
    testing_docs.append((word_tokenize(testing_tweets[i])))
testing_docs[0]

['@',
 'AmericanAir',
 'In',
 'car',
 'gng',
 'to',
 'DFW',
 '.',
 'Pulled',
 'over',
 '1hr',
 'ago',
 '-',
 'very',
 'icy',
 'roads',
 '.',
 'On-hold',
 'with',
 'AA',
 'since',
 '1hr',
 '.',
 'Ca',
 "n't",
 'reach',
 'arpt',
 'for',
 'AA2450',
 '.',
 'Wat',
 '2',
 'do',
 '?']

### **Cleaning up the tweets using clean_tweet()**

In [17]:
testing_docs =[clean_tweet(tweet) for tweet in testing_docs]
testing_docs[0],len(testing_docs)

(['americanair',
  'car',
  'gng',
  'dfw',
  'pulled',
  'ago',
  'icy',
  'road',
  'aa',
  'since',
  'ca',
  'reach',
  'arpt',
  'wat'],
 3660)

In [18]:
# Joining the words of clean tweets
testing_tweet_docs=[" ".join(tweet) for tweet in testing_docs]
# First 20 cleaned tweets of testing data
testing_tweet_docs[0:20]

['americanair car gng dfw pulled ago icy road aa since ca reach arpt wat',
 'americanair plane land identical bad condition grk accord metars',
 'southwestair ca believe many pay customer left high dry reason flight cancelled flightlations monday bdl wow',
 'usairways legitimately say would rather driven cross country flown us airways',
 'americanair still response aa great job guy',
 'united developer fly tmrw morn min layover earlier flight layover move',
 'usairways hello anyone',
 'usairways husainhaqqani husain u shld protest well one ur party member rehman malik delayed pia flight hour',
 'usairways likely flightaware say plane still durango depart',
 'americanair even give option hold say line busy plz try late flightr',
 'united announcement pre boarding address mobility disability require travel lot stuff preboard',
 'usairways really embarrass ask complimentary detailed http amp argue',
 'southwestair passport time trip could still fly photo id thingsishouldknow ifeeldumb',
 

## **Count Vectorizer, Tfid Vectorizer method**

In [19]:
# tfid_vec = TfidfVectorizer(max_features=1100)
# x_train = tfid_vec.fit_transform(training_tweet_docs)
# y_train = sentiments
# x_test = tfid_vec.transform(testing_tweet_docs)

count_vec = TfidfVectorizer(max_features=1100)
x_train = count_vec.fit_transform(training_tweet_docs)
y_train = sentiments
x_test = count_vec.transform(testing_tweet_docs)

In [20]:
# Applying Random Classifier and fitting with training data
rf = RandomForestClassifier(n_estimators=1500, n_jobs=-1)
rf.fit(x_train, y_train)

RandomForestClassifier(n_estimators=1500, n_jobs=-1)

In [21]:
rf.score(x_train,y_train)

0.9908014571948999

In [23]:
from sklearn.model_selection import cross_val_score
cross_val_score(rf,x_train , y_train).mean()

0.7585610200364299

In [24]:
# predicting on testing data and here y_pred = y_test (we want to predict directly the target so we cant calculate the score on it) 
y_test = rf.predict(x_test)

In [25]:
# Converting predictions into the csv file
np.savetxt("twitter_predictions.csv", y_test, fmt='%s')

In [26]:
from sklearn.svm import SVC

In [59]:
clf = SVC(kernel = 'linear',C=1000)
clf.fit(x_train,y_train)

SVC(C=1000, kernel='linear')

In [60]:
clf.score(x_train,y_train)

0.8773224043715847

In [None]:
cross_val_score(clf,x_train , y_train).mean()

In [43]:
y_test = clf.predict(x_test)

In [32]:
from sklearn.model_selection import GridSearchCV

In [33]:
grid = {'C':[1,10,100,500,1000,5000],'gamma':[1e-3,5e-4,1e-4,5e-3]}
abc = GridSearchCV(clf,grid)
abc.fit(x_train,y_train)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 10, 100, 500, 1000, 5000],
                         'gamma': [0.001, 0.0005, 0.0001, 0.005]})

In [34]:
abc.best_estimator_

SVC(C=5000, gamma=0.0001)

In [44]:
np.savetxt("twitter_predictions.csv", y_test, fmt='%s')

In [56]:
from sklearn.naive_bayes import MultinomialNB
clf3 = MultinomialNB()
clf3.fit(x_train,y_train)

MultinomialNB()

In [57]:
clf3.score(x_train,y_train)

0.768943533697632

In [58]:
cross_val_score(clf3,x_train,y_train).mean()

0.7436247723132968

In [51]:
y_test = clf3.predict(x_test)

In [52]:
np.savetxt("twitter_predictions.csv", y_test, fmt='%s')