# Project - Airline Sentiment Analysis
---------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------

# Importing Libraries

In [5]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords,wordnet
import string
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier

# Dataset: Airline sentiment

A dataset for US airlines comments analysis,Tweets analysis on Kaggle (<a href="https://www.kaggle.com/datasets/welkin10/airline-sentiment">See dataset page</a>)

### About Dataset
#### Context:
    - This is US airlines data which contain comments of passengers on basis of service provided by airlines.
--------------------------------------------------------------------------------
#### Inspiration:
    -you can use it for sentiment analysis .
---------------------------------------------------------------------------------

# Importing Dataset

In [6]:
dataset=pd.read_csv('Dataset/airline data.csv')
dataset.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [7]:
print(len(dataset.columns))
dataset.columns

15


Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')

In [8]:
text=dataset['text'].values
text=[word_tokenize(doc) for doc in text]

In [9]:
text[0]

['@', 'VirginAmerica', 'What', '@', 'dhepburn', 'said', '.']

In [10]:
for i in ['airline_sentiment']:
    print(i,dataset[i].unique())

airline_sentiment ['neutral' 'positive' 'negative']


In [11]:
sentiment=dataset['airline_sentiment'].values #'neutral' 'positive' 'negative'

In [12]:
dataset.shape

(14640, 15)

In [13]:
documents=[(text[i],sentiment[i]) for i in range(dataset.shape[0])]

In [14]:
documents[0]

(['@', 'VirginAmerica', 'What', '@', 'dhepburn', 'said', '.'], 'neutral')

# Dataset Preprocessing
1. TOKENIZING
2. LEMMATIZING
3. REMOVING STOPWORDS
4. PUNCTUATIONS

In [15]:
lemmatizer=WordNetLemmatizer()

In [16]:
def get_simple_pos(tag):   
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [17]:
stops=set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)
stops

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [18]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [19]:
document = [(clean_review(doc), category) for doc, category in documents]

In [20]:
document[0]

(['virginamerica', 'dhepburn', 'say'], 'neutral')

In [21]:
print("The clean dataset is \n")
for i in document:
    print(i)

The clean dataset is 

(['virginamerica', 'dhepburn', 'say'], 'neutral')
(['virginamerica', 'plus', "'ve", 'add', 'commercial', 'experience', '...', 'tacky'], 'positive')
(['virginamerica', "n't", 'today', '...', 'must', 'mean', 'need', 'take', 'another', 'trip'], 'neutral')
(['virginamerica', "'s", 'really', 'aggressive', 'blast', 'obnoxious', '``', 'entertainment', "''", 'guest', 'face', 'amp', 'little', 'recourse'], 'negative')
(['virginamerica', "'s", 'really', 'big', 'bad', 'thing'], 'negative')
(['virginamerica', 'seriously', 'would', 'pay', '30', 'flight', 'seat', "n't", 'play', "'s", 'really', 'bad', 'thing', 'fly', 'va'], 'negative')
(['virginamerica', 'yes', 'nearly', 'every', 'time', 'fly', 'vx', '“', 'ear', 'worm', '”', '’', 'go', 'away'], 'positive')
(['virginamerica', 'really', 'miss', 'prime', 'opportunity', 'men', 'without', 'hats', 'parody', 'http', '//t.co/mwpg7grezp'], 'neutral')
(['virginamerica', 'well', "didn't…but", '-d'], 'positive')
(['virginamerica', 'amaze', 

(['united', 'wifi', "n't", 'work', 'onboard.alerted', 'attendant', 'socket.you', 'sent', 'hotel', '24', 'hour', '7', 'vouchers', 'wifi', 'hotel'], 'negative')
(['united', 'flight', '1', 'luck', 'standby'], 'negative')
(['united', 'plan', 'ipad', 'app', 'iphone', 'app', 'great', 'ipad', 'flip', 'ipad', 'beta', 'tks'], 'neutral')
(['united', 'try', 'hung', 'twice', 'speak', 'someone', 'put', 'hold', '45', 'minute', 'resolution'], 'neutral')
(['united', "'m", 'sure', '``', 'next', 'time', "''", '...'], 'negative')
(['united', 'weight', 'restriction', "'we", "'ll", 'try', 'get', 'many', 'plane'], 'negative')
(['united', 'thank', 'person', 'houston', 'could', 'get', 'flight', 'rout', 'newark', 'nj', "'m", 'go', 'sfo'], 'positive')
(['united', 'bish'], 'negative')
(['united', 'thanks', 'good', 'know'], 'neutral')
(['united', 'okay', 'thanks'], 'neutral')
(['united', 'conference', 'begin', '3', 'hour', 'night', 'due', 'delay', 'still', 'wait', 'talk', 'someone', 'lose', 'luggage'], 'negative'

(['united', 'one', 'interview', 'last', 'person', 'mean'], 'negative')
(['united', 'flown', '10', 'time', 'last', '13', 'day', '8', '10', 'flight', 'delayed', 'huge', 'mistake', 'part', "n't", 'fly', 'united'], 'negative')
(['united', 'receive', 'notification', 'in-flight', 'wi-fi', 'ua863', 'flysfo', 'sydneyairport', 'amazing'], 'positive')
(['united', 'yes', 'take', 'two', 'day', 'get', 'bag', 'deliver', 'missed', 'full', 'day', 'skiing'], 'negative')
(['united', 'give', 'recognition', 'deserves'], 'positive')
(['united', "'s", 'type', 'person', 'make', 'customer', 'day', 'fly', '100+', 'time', 'year', 'amp', "'s", 'one', 'top', 'flight', 'attendant', "'ve"], 'positive')
(['united', 'make', 'remove', '1', 'pound', 'checked', 'bag', 'otherwise', 'would', 'charge', '200', 'loyal', 'flier', '10', 'year', 'time', 'change'], 'negative')
(['united', 'sit', 'plane', '3:15', 'unreal', '2+', 'day', 'nightmare', 'end'], 'negative')
(['united', 'suck', 'southwestair', "'re", 'best'], 'positive'

(['southwestair', 'prove'], 'negative')
(['southwestair', 'travel', 'agent', 'darrel', 'love', 'field', 'host', 'paper', 'airplane', 'contest', 'entertain', 'child', 'awesome', 'see'], 'positive')
(['southwestair', 'big', 'kudos', 'staff', 'today', 'dallas', 'love', 'field', 'lift', 'everyone', "'s", 'spirit', 'today', 'delay', 'cancelled', 'flightlations'], 'positive')
(['southwestair', "'s", 'fun', 'delay', 'nashville', 'las', 'vegas', 'crew', 'gate', 'c9', 'desk', 'awesome', 'patience', 'luvswa'], 'positive')
(['southwestair', 'think', '100', 'voucher', 'make', 'spending', '4', 'hr', 'plane', 'land', 'airport', 'take', 'really'], 'negative')
(['never', 'get', 'strip', 'fast', 'stoked', 'special', 'imaginedragons', 'show', 'tonight', 'thx', 'southwestair', 'http', '//t.co/topqmvqnjp'], 'positive')
(['southwestair', 'problem', 'apology', "n't", 'help', 'results', 'matter', "'s", 'prove', 'impossible', 'find', 'anyone', 'interested'], 'negative')
(['southwestair', 'fare', 'give', 'clue

(['southwestair', 'want', 'southwest', 'know', "n't", 'think', "'re", 'great', 'use', 'anymore', 'nothing', 'look'], 'negative')
(['southwestair', 'love_dragonss', 'lauren', 'im', 'screaming'], 'positive')
(['southwestair', 'love_dragonss', 'lauren', 'omg', 'im', 'dead', 'im', 'happy', 'yes', 'yes'], 'positive')
(['southwestair', "'re", 'best'], 'positive')
(['southwestair', 'customer', 'relation', 'line', 'busy', 'signal', 'call', 'normal', 'line', 'service', 'rep', 'thought', 'dumb', 'know'], 'negative')
(['southwestair', "'ve", 'never', 'met', 'favorite', 'band', 'would', 'sooo', 'amaze', 'win', 'destination', 'dragon'], 'neutral')
(['southwestair', 'love_dragonss', 'lauren', 'omg', 'best', 'airline', 'ever'], 'positive')
(['southwestair', "n't", 'see', 'travel', 'compete', 'unused', 'fund', 'expiration', 'date', 'hidden', 'fine', 'print', 'never', 'saw'], 'negative')
(['southwestair', 'love_dragonss', 'ahhhh', 'yes', 'lauren'], 'neutral')
(['southwestair', 'love_dragonss', 'holy', 

(['well', 'let', 'see', 'could', 'pay', 'negroni', 'bitcoin', 'jetblue'], 'negative')
(['jetblue', 'know', 'schedule', 'dec', '2015', 'release'], 'neutral')
(['jetblue', 'direct', 'flight', 'bos', 'st', 'lucia', 'antigua'], 'neutral')
(['jetblue', 'okay', 'thanks', 'hope', 'fly', 'guy', 'soon'], 'positive')
(['jetblue', 'afternoon', 'flight', 'go', 'bqn', 'jfk', 'seem', 'find', 'early', 'morning', 'flight'], 'neutral')
(['jetblue', "'m", 'fly', 'airline', 'lga', '😷'], 'neutral')
(['jetblue', 'love', 'natural', 'beefjerky', 'snacks😉'], 'positive')
(['jetblue', 'thanks', 'reply', 'flight', '1572'], 'positive')
(['jetblue', 'must', 'send', 'note', 'good', 'word'], 'positive')
(['jetblue', 'captain', '``', 'take', 'lot', 'muscle', 'frown', 'smile', "y'all", 'ready', 'go', 'flyin', "''", 'mean', '...'], 'neutral')
(['jetblue', 'ca', "n't", 'seem', 'dm', 'guy', '..', 'jdhadp'], 'negative')
(['jetblue', 'love', 'capt', 'joe', 'flight', 'bos', 'sfo', '633', 'fun'], 'positive')
(['jetblue', 'tw

(['usairways', 'hi', 'lose', 'father', "'s", 'wheelchair', 'every', 'time', 'call', 'get', 'voicemail', 'dad', 'need', 'go', 'home', 'please', 'dm'], 'negative')
(['usairways', '4', 'segment', '4/4', 'delayed', 'gnv', 'gt', 'ctl', 'ctl', 'gt', 'jan', 'jan', 'gt', 'ctl', 'ctl', 'gt', 'gnv', 'year', 'fly', 'guy', 'way', 'go'], 'negative')
(['usairways', 'charge', '200', 'take', 'flight', 'lol', 'last', 'time', 'fly'], 'negative')
(['usairways', 'yes', 'say', 'bag', 'deliver', 'local', 'number', 'say', 'open', 'asked', 'main', 'cust', 'service', 'get', 'hung'], 'negative')
(['usairways', 'row', '16', 'flight', '634', 'today', "'re", 'look', 'specific', 'run', 'home', 'depot', 'wd40', 'need'], 'negative')
(['usairways', 'seriously', 'buy', 'wd40', 'a319', 'operating', 'flight', '634', 'geg', 'phoenix', 'every', 'seat', 'squeak', 'w', 'every', 'shift', 'still', 'ground'], 'negative')
(['usairways', 'would', 'happily', 'wait', 'terminal', 'near', 'food', 'restroom', 'non', 'irritated', 'pass

(['usairways', "'ve", 'call', 'resolve', 'dividend', 'mile', 'issue', 'week', 'avail', 'please', 'advise', 'asap'], 'negative')
(['usairways', 'finally', 'answer', 'rude', "n't", 'help'], 'negative')
(['usairways', 'americanair', "n't", 'leave'], 'neutral')
(['usairways', 'success', 'make', 'flight', 'please', 'thank', 'crew', '556', 'great', 'time', 'recovery'], 'positive')
(['usairways', '2d', '3d', 'emboss', 'badge', 'patch', 'superior', 'one', 'currently', 'use', 'http', '//t.co/3fq3xelbon'], 'neutral')
(['usairways', 'far', 'good', 'week', 'sav', 'clt', 'board', 'departure', 'time', 'first', 'time', '4', 'week', 'perhaps', 'time', 'departure'], 'positive')
(['usairways', 'accurate', 'safety', 'guide', 'show', 'mp3', 'player', 'usage', 'takeoff', 'land', 'http', '//t.co/nvk3irg4kp'], 'neutral')
(['usairways', "'ve", 'hold', '36', 'min', 'counting', 'cause', 'charge', 'credit', 'card', 'bag', 'provide', 'receipt', 'say', 'error', 'awful'], 'negative')
(['usairways', 'party', '4', 'b

(['americanair', 'hi', 'flight', 'dallas', 'cancelled', 'flightled', 'go', 'la', 'u', 'pls', 'help', 'rebook'], 'negative')
(['americanair', '2nd', 'time', '4', 'day', 'flight', 'delayed', 'gate', 'agent', "n't", 'say', 'anything', 'thanks', 'memory', 'neveragain'], 'negative')
(['americanair', 'thanks', "'s", 'tell', 'go', 'airport', 'check', 'agent-what', "'s", 'http', '//t.co/pfsenjk5pw'], 'neutral')
(['americanair', "'s", 'best', 'number', 'use'], 'neutral')
(['americanair', 'call', 'cue', '11', 'hour', 'call', 'yet', 'wife', 'cue', '9', 'hour', 'got', 'call', 'hung'], 'negative')
(['americanair', 'phone', '47', 'minute', '...', 'say', 'would', '10', '...', 'advice'], 'negative')
(['americanair', 'chicago', 'see', 'seat', '6a', 'aa', '1620', 'far', 'great', 'ride', 'pdx', 'http', '//t.co/x4rsvagijn'], 'positive')
(['americanair', 'thanks', 'delivery', 'status'], 'positive')
(['americanair', "n't", 'believe', "'s", 'acceptable', 'ticket', 'change', 'check', 'time', 'amp', 'notify', 

(['americanair', 'talk', 'agent', 'keep', 'get', 'hung', 'crazy'], 'negative')
(['americanair', 'cancelled', 'flights', 'flight', "n't", 'send', 'email', 'text', 'call', "'m", 'strand', 'louisville'], 'negative')
(['americanair', 'hi', 'cancelled', 'flightled', 'flight', 'back', 'us', 'wo', "n't", 'take', 'call', 'suppose'], 'negative')
(['americanair', 'get', 'rebooked', 'u', 'airway', 'guy', 'flight', 'get', 'wo', "n't", 'work', 'help', 'would', 'appreciate'], 'negative')
(['americanair', 'hold', '20', 'min', 'use', 'call', 'back', 'service', 'agent', 'call', 'put', 'hold', 'forever-anyone'], 'negative')
(['americanair', 'upgrade', 'cash', 'main', 'cabin', 'extra', 'buying', 'aadvantage', 'mileage', 'saver', 'use', 'mile', 'ca', "n't", 'seem', '...'], 'neutral')
(['americanair', 'americanair', 'expect'], 'negative')
(['americanair', 'flight', 'tomorrow', 'cancelled', 'flighted', 'ca', "n't", 'anything', 'online', 'ca', "n't", 'get', 'phone', 'help'], 'negative')
(['americanair', 'tel

In [22]:
categories = [category for document, category in document]

In [23]:
categories[:10]

['neutral',
 'positive',
 'neutral',
 'negative',
 'negative',
 'negative',
 'positive',
 'neutral',
 'positive',
 'positive']

In [24]:
text_documents = [" ".join(document) for document, category in document]

In [25]:
text_documents[0]

'virginamerica dhepburn say'

# SPLITTING THE DATASET

In [71]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(text_documents, categories,random_state=0)

# Vectorization to get the maximum number of frequency words

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [72]:
count_vect=TfidfVectorizer(max_features=5000, max_df=0.8, min_df=0.001)
X_train_features=count_vect.fit_transform(X_train)
X_test_features=count_vect.transform(X_test)

# Helper funcation

In [75]:
def eval_model(model,X_train,y_train,X_test,y_test):
    model.fit(X_train,y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    acc_train = accuracy_score(y_train, y_pred_train)
    acc_test = accuracy_score(y_val, y_pred_test)
    print(model.__class__.__name__)
    print('acc train:', acc_train)
    print('acc val:', acc_test)
    print('----------------------------')
    
    return(acc_train,acc_test)

# USING Svm CLASSIFIER

In [76]:
clf = SVC() #c=1.0
acc_tarin1,acc_test1=eval_model(clf,X_train_features,y_train,X_test_features,y_test)

SVC
acc train: 0.9285063752276868
acc val: 0.7836065573770492
----------------------------


# USING RANDOM FOREST CLASSIFIER 

In [79]:
clf1=RandomForestClassifier(n_estimators=2000, n_jobs=-1)
acc_tarin2,acc_test2=eval_model(clf1,X_train_features,y_train,X_test_features,y_test)

RandomForestClassifier
acc train: 0.9911657559198542
acc val: 0.7508196721311475
----------------------------
