In [None]:
import nltk                                
import matplotlib.pyplot as plt            
import pandas as pd
import re                                  
from nltk.corpus import stopwords          
nltk.download('stopwords')
#from nltk.tokenize import word_tokenize
#nltk.download('punkt')
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split

from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [3]:
#Data Importation
dataset = pd.read_csv('Tweets_US_airline.csv')
dataset.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,5.70306e+17,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,24-02-2015 11:35,,Eastern Time (US & Canada)
1,5.70301e+17,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,24-02-2015 11:15,,Pacific Time (US & Canada)
2,5.70301e+17,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,24-02-2015 11:15,Lets Play,Central Time (US & Canada)
3,5.70301e+17,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,24-02-2015 11:15,,Pacific Time (US & Canada)
4,5.70301e+17,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,24-02-2015 11:14,,Pacific Time (US & Canada)


In [5]:
#Data shape
dim=dataset.shape
print(dim)

(14640, 15)


In [127]:
#Data Type
types=dataset.dtypes
print(types)

tweet_id                        float64
airline_sentiment                object
airline_sentiment_confidence    float64
negativereason                   object
negativereason_confidence       float64
airline                          object
airline_sentiment_gold           object
name                             object
negativereason_gold              object
retweet_count                     int64
text                             object
tweet_coord                      object
tweet_created                    object
tweet_location                   object
user_timezone                    object
dtype: object


In [128]:
#No of null values
null=dataset.isna().sum()
print(null)

tweet_id                            0
airline_sentiment                   0
airline_sentiment_confidence        0
negativereason                   5462
negativereason_confidence        4118
airline                             0
airline_sentiment_gold          14600
name                                0
negativereason_gold             14608
retweet_count                       0
text                                0
tweet_coord                     13621
tweet_created                       0
tweet_location                   4733
user_timezone                    4820
dtype: int64


In [5]:
#Type and Count of Sentiments
dataset["airline_sentiment"].value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [5]:
data = dataset[(dataset.airline_sentiment == "positive") | (dataset.airline_sentiment == "negative")]
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
1,5.70301e+17,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,24-02-2015 11:15,,Pacific Time (US & Canada)
3,5.70301e+17,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,24-02-2015 11:15,,Pacific Time (US & Canada)
4,5.70301e+17,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,24-02-2015 11:14,,Pacific Time (US & Canada)
5,5.70301e+17,negative,1.0,Can't Tell,0.6842,Virgin America,,jnardino,,0,@VirginAmerica seriously would pay $30 a fligh...,,24-02-2015 11:14,,Pacific Time (US & Canada)
6,5.70301e+17,positive,0.6745,,0.0,Virgin America,,cjmcginnis,,0,"@VirginAmerica yes, nearly every time I fly VX...",,24-02-2015 11:13,San Francisco CA,Pacific Time (US & Canada)


In [6]:
#Sentiment Class conversion
data['airline_sentiment'].replace(('positive', 'negative'), (1, 0), inplace=True)
data['airline_sentiment'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


0    9178
1    2363
Name: airline_sentiment, dtype: int64

In [7]:
#Down Sampling
positive_tweet = data[data.airline_sentiment == 1] 
negative_tweet = data[data.airline_sentiment == 0]

negative_downsampled = resample(negative_tweet, replace=True, n_samples=2363, random_state=123)

In [8]:
#Data Balancing
balanced_data = pd.concat([negative_downsampled, positive_tweet])
balanced_data.airline_sentiment.value_counts()

1    2363
0    2363
Name: airline_sentiment, dtype: int64

In [9]:
#Text Pre-Processing

stemming = PorterStemmer()

def pre_process(tweet):
    new_tweet = re.sub("[^a-zA-Z]", " ",tweet) # Remove all the special characters
    words = new_tweet.lower().split() #Convert all letters to lower case      
    words = [stemming.stem(word) for word in words] # Porter stemming
    return (" ".join(words)) #Join all words back to text

balanced_data['T1']=balanced_data['text'].apply(pre_process)
balanced_data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,T1
5813,5.68578e+17,0,1.0,Cancelled Flight,0.6547,Southwest,,looselydraped,,0,@SouthwestAir is seriously THE WORST. I don't ...,,19-02-2015 17:09,,Central Time (US & Canada),southwestair is serious the worst i don t reme...
2119,5.6936e+17,0,1.0,Lost Luggage,1.0,United,,mjfredricks259,,0,@united my luggage is set to go to DCA however...,,21-02-2015 20:55,,,unit my luggag is set to go to dca howev i am ...
9329,5.69984e+17,0,1.0,Customer Service Issue,1.0,US Airways,,acnewsguy,,0,@USAirways She was put on hold on that number ...,,23-02-2015 14:14,"New Haven, CT",Mountain Time (US & Canada),usairway she wa put on hold on that number for...
12764,5.70033e+17,0,1.0,Late Flight,1.0,American,,LunaStarwind,,0,@AmericanAir has no idea what they're doing. D...,,23-02-2015 17:31,"Menomonee Falls, WI, USA",Central Time (US & Canada),americanair ha no idea what they re do delay f...
12155,5.70275e+17,0,1.0,Lost Luggage,1.0,American,,paintbranch1398,,0,@AmericanAir this delayed bag was for my frien...,,24-02-2015 09:32,,,americanair thi delay bag wa for my friend lis...


In [10]:
#Removing stop words

stop = stopwords.words("english")
balanced_data['TWS'] = balanced_data['T1'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

print(stop)
balanced_data.head()

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,T1,TWS
5813,5.68578e+17,0,1.0,Cancelled Flight,0.6547,Southwest,,looselydraped,,0,@SouthwestAir is seriously THE WORST. I don't ...,,19-02-2015 17:09,,Central Time (US & Canada),southwestair is serious the worst i don t reme...,southwestair serious worst rememb last time so...
2119,5.6936e+17,0,1.0,Lost Luggage,1.0,United,,mjfredricks259,,0,@united my luggage is set to go to DCA however...,,21-02-2015 20:55,,,unit my luggag is set to go to dca howev i am ...,unit luggag set go dca howev rerout dull curre...
9329,5.69984e+17,0,1.0,Customer Service Issue,1.0,US Airways,,acnewsguy,,0,@USAirways She was put on hold on that number ...,,23-02-2015 14:14,"New Haven, CT",Mountain Time (US & Canada),usairway she wa put on hold on that number for...,usairway wa put hold number loooooong time hel...
12764,5.70033e+17,0,1.0,Late Flight,1.0,American,,LunaStarwind,,0,@AmericanAir has no idea what they're doing. D...,,23-02-2015 17:31,"Menomonee Falls, WI, USA",Central Time (US & Canada),americanair ha no idea what they re do delay f...,americanair ha idea delay flight pm pm delay a...
12155,5.70275e+17,0,1.0,Lost Luggage,1.0,American,,paintbranch1398,,0,@AmericanAir this delayed bag was for my frien...,,24-02-2015 09:32,,,americanair thi delay bag wa for my friend lis...,americanair thi delay bag wa friend lisa pafe ...


In [11]:
#forming the feature & label variables
tweets = balanced_data['TWS'].values.tolist()
sentiments = balanced_data['airline_sentiment'].values.tolist()

In [12]:
#splitting the data into 80 and 20 split
x_train, x_test, y_train, y_test = train_test_split(tweets, sentiments, test_size=0.3, 
                                                    random_state=100, shuffle=True)

print(f'Number of training examples: {len(x_train)}')
print(f'Number of testing examples: {len(x_test)}')

Number of training examples: 3308
Number of testing examples: 1418


In [182]:
#def tokenization(text):
 #   text = text.lower()
  #  return word_tokenize(text)

# Tokenize training text into tokens
#tokenized_tweet = []
#for i in range(0, len(x_train)):
 #   tokenized_tweet.append(tokenization(x_train[i]))

#train_x = tokenized_tweet

# Tokenize testing text into tokens
#tokenized_tweet = []
#for i in range(0, len(x_test)):
#    tokenized_tweet.append(tokenization(x_test[i]))

#test_x = tokenized_tweet
#tokenized train & test data
#print(train_x[0], train_x[1])
#print(test_x[0])

In [13]:
# TF-IDF 
vectorizer = TfidfVectorizer()
train_tfidf_model = vectorizer.fit_transform(x_train)
test_tfidf_model = vectorizer.transform(x_test)

train_tfidf = pd.DataFrame(train_tfidf_model.toarray(), columns=vectorizer.get_feature_names()) 
train_tfidf

Unnamed: 0,aa,aadv,aafail,aaron,aarp,aateam,aavvoreph,aay,ab,abc,...,zgoqoxjbqi,zipper,zkatcher,zkoe,zouowgv,zpjr,zq,zsvzurlw,zut,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
print(vectorizer.get_feature_names())

['aa', 'aadv', 'aafail', 'aaron', 'aarp', 'aateam', 'aavvoreph', 'aay', 'ab', 'abc', 'abcnetwork', 'abi', 'abil', 'abl', 'aboard', 'abov', 'abq', 'absolut', 'absurd', 'abt', 'abus', 'abysm', 'ac', 'accept', 'access', 'accid', 'accomid', 'accommod', 'accompani', 'accomplish', 'accordingli', 'account', 'accru', 'acct', 'accur', 'acfqcdq', 'aci', 'acknowledg', 'acosta', 'across', 'act', 'action', 'activ', 'actual', 'ad', 'add', 'addit', 'address', 'addtl', 'adjust', 'admin', 'admir', 'admit', 'ador', 'advanc', 'advertis', 'advic', 'advis', 'advisori', 'aerojobmarket', 'aesthet', 'affect', 'afford', 'afternoon', 'age', 'agenc', 'agent', 'agfd', 'aggrav', 'ago', 'agoodlif', 'agre', 'agt', 'ah', 'aha', 'ahah', 'ahead', 'ahem', 'ahhhh', 'ahlxhhkiyn', 'ahold', 'ai', 'air', 'airbu', 'aircanada', 'aircraft', 'airfar', 'airlin', 'airlinegav', 'airnzusa', 'airplan', 'airplanemodewason', 'airport', 'airtahitinui', 'airway', 'aiyc', 'aka', 'akron', 'al', 'alabama', 'alaska', 'alavera', 'albani', 'al

In [56]:
cls = [SVC(), GradientBoostingClassifier(), MultinomialNB(), DecisionTreeClassifier(), RandomForestClassifier(n_estimators=200), LogisticRegression()]

cls_name = []

In [66]:
#Accuracy and Classification report

lbl_actual = y_test
i = 0
accuracy = []
for cl in cls:
    model = cl.fit(train_tfidf_model, y_train)
    lbl_pred = model.predict(test_tfidf_model)
    a = (100*accuracy_score(lbl_pred, lbl_actual))
    a = round(a,2)
    accuracy.append(a)
    cls_name.append(cl.__class__.__name__)
    print ("{}  Accuracy Score : {}%".format(cls_name[i],a))
    print ( classification_report(lbl_pred, lbl_actual))
    i +=1

SVC  Accuracy Score : 89.28%
              precision    recall  f1-score   support

           0       0.91      0.88      0.90       744
           1       0.87      0.91      0.89       674

    accuracy                           0.89      1418
   macro avg       0.89      0.89      0.89      1418
weighted avg       0.89      0.89      0.89      1418

GradientBoostingClassifier  Accuracy Score : 84.06%
              precision    recall  f1-score   support

           0       0.91      0.80      0.85       818
           1       0.77      0.90      0.83       600

    accuracy                           0.84      1418
   macro avg       0.84      0.85      0.84      1418
weighted avg       0.85      0.84      0.84      1418

MultinomialNB  Accuracy Score : 86.95%
              precision    recall  f1-score   support

           0       0.89      0.86      0.87       747
           1       0.85      0.89      0.87       671

    accuracy                           0.87      1418
   macro

In [58]:
#Classification Error

lbl_actual = y_test
i = 0
error = []
for cl in cls:
    model = cl.fit(train_tfidf_model, y_train)
    lbl_pred = model.predict(test_tfidf_model)
    a = (100*accuracy_score(lbl_pred, lbl_actual))
    a = round(a,2)
    b = 100 - a
    error.append(b)
    cls_name.append(cl.__class__.__name__)
    print ("{}  Classification Error : {}%".format(cls_name[i],b))
    i +=1

SVC  Classification Error : 10.719999999999999%
GradientBoostingClassifier  Classification Error : 15.510000000000005%
MultinomialNB  Classification Error : 13.049999999999997%
DecisionTreeClassifier  Classification Error : 17.980000000000004%
RandomForestClassifier  Classification Error : 13.329999999999998%
LogisticRegression  Classification Error : 11.920000000000002%


In [59]:
#AUC Score

lbl_actual = y_test
i = 0
model_auc = []
for cl in cls:
    model = cl.fit(train_tfidf_model, y_train)
    lbl_pred = model.predict(test_tfidf_model)
    a = (100*roc_auc_score(lbl_pred, lbl_actual))
    a = round(a,2)
    model_auc.append(a)
    cls_name.append(cl.__class__.__name__)
    print ("{}  AUC Score : {}%".format(cls_name[i],a))
    i +=1

SVC  AUC Score : 89.35%
GradientBoostingClassifier  AUC Score : 84.81%
MultinomialNB  AUC Score : 87.03%
DecisionTreeClassifier  AUC Score : 81.72%
RandomForestClassifier  AUC Score : 85.69%
LogisticRegression  AUC Score : 88.16%


In [60]:
*963. #Specificity from confusion matrix 

lbl_actual = y_test
i = 0
cm = []
for cl in cls:
    model = cl.fit(train_tfidf_model, y_train)
    lbl_pred = model.predict(test_tfidf_model)
    a = confusion_matrix(lbl_pred, lbl_actual)
    spe = (100*a[1,1]/(a[1,0]+a[1qzxcfffffffffffffffv++++++vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv+63663.0tt,1]))
    cm.append(spe)
    cls_name.append(cl.__class__.__name__)
    print ("{}  Specificity : {}%".format(cls_name[i],spe))
    i +=1

SVC  Specificity : 90.80118694362018%
GradientBoostingClassifier  Specificity : 89.45634266886326%
MultinomialNB  Specificity : 88.52459016393442%
DecisionTreeClassifier  Specificity : 81.12947658402204%
RandomForestClassifier  Specificity : 86.38968481375358%
LogisticRegression  Specificity : 89.71684053651266%
