# Importing Libraries

In [8]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
import nltk
from wordcloud import WordCloud, STOPWORDS

In [11]:
data = pd.read_csv("Tweets.csv")

# Describing data

In [12]:
data.shape

(14640, 15)

In [13]:
data.describe()

Unnamed: 0,tweet_id,airline_sentiment_confidence,negativereason_confidence,retweet_count
count,14640.0,14640.0,10522.0,14640.0
mean,5.692184e+17,0.900169,0.638298,0.08265
std,779111200000000.0,0.16283,0.33044,0.745778
min,5.675883e+17,0.335,0.0,0.0
25%,5.685592e+17,0.6923,0.3606,0.0
50%,5.694779e+17,1.0,0.6706,0.0
75%,5.698905e+17,1.0,1.0,0.0
max,5.703106e+17,1.0,1.0,44.0


# Removing Everything except Text and Sentiment

In [14]:
data = data.loc[:,["text","airline_sentiment"]]

In [15]:
for k,tx in enumerate(data["text"]):
    li = " ".join([word for word in tx.split() if 'http' not in word and not word.startswith('@')and word != 'RT'])
    data.loc[k,"text"] = li

In [16]:
Y = data["airline_sentiment"].apply(lambda x: 0 if x == "negative"  else (1 if (x == "nuetral") else 2))

In [17]:
data["airline_sentiment"].value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

# Text mining
1. Removing  Puntuation
2. Removing Symbols and Digits
2. Removing Stopword, 
3. Stemming/lemmatizing

In [18]:
punc = '''!()-[]{};:'"\, <>./?@#$%^&*_~'''

for k,tx in enumerate(data["text"]):
    res = re.sub(r'[^\w\s]', '', tx)
    data.loc[k,"text"] = ''.join([i.lower() for i in res if not i.isdigit()])

In [19]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [20]:
from nltk.corpus import stopwords
stops = stopwords.words('english')

In [21]:
for k,tx in enumerate(data["text"]):
    li = " ".join([word for word in tx.split() if word not in stops])
    data.loc[k,"text"] = li

In [22]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
ltz = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [23]:
for k,tx in enumerate(data["text"]):
    li = " ".join([ltz.lemmatize(word) for word in tx.split()])
    data.loc[k,"text"] = li

In [24]:
X = data["text"]

# 

In [25]:
cv = CountVectorizer(lowercase=True, stop_words='english', min_df=2)
cvs = cv.fit_transform(X.values.astype('U'))
X_cv_df = pd.DataFrame(cvs.toarray(), columns=cv.get_feature_names())

In [26]:
X_cv_df.shape

(14640, 4974)

In [78]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_cv_df,Y,test_size = 0.1, random_state= 10)

In [28]:
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train,Y_train)
print("Train Accuracy : " + str(model.score(X_train,Y_train)), "Test Accuracy : " + str(model.score(X_test,Y_test)))

Train Accuracy : 0.9958257437765634 Test Accuracy : 0.7971311475409836


In [29]:
model = XGBClassifier(n_estimators = 150)
model.fit(X_train,Y_train)
print("Train Accuracy : " + str(model.score(X_train,Y_train)), "Test Accuracy : " + str(model.score(X_test,Y_test)))

Train Accuracy : 0.73816029143898 Test Accuracy : 0.7199453551912568


In [30]:
model = DecisionTreeClassifier()
model.fit(X_train,Y_train)
print("Train Accuracy : " + str(model.score(X_train,Y_train)), "Test Accuracy : " + str(model.score(X_test,Y_test)))

Train Accuracy : 0.9959016393442623 Test Accuracy : 0.7342896174863388


In [31]:
from keras.models import Sequential
from keras.layers import Dense

In [79]:
def onehot(arr):
    li = np.zeros((3,len(arr)))
    for i,k in enumerate(arr):
        hot = [0,0,0]
        # print(k)
        hot[k] = 1
        li[:,i] = hot
    return li
Y_train = onehot(Y_train)
Y_test = onehot(Y_test)

In [80]:
Y_test = Y_test.transpose()
Y_train = Y_train.transpose()

In [81]:
Y_train.shape

(13176, 3)

In [82]:
X_train = np.array(X_train)
X_train.shape

(13176, 4974)

In [83]:
import tensorflow as tf
from keras.layers import Dropout
from tensorflow.keras.callbacks import ModelCheckpoint

In [84]:
model = Sequential()
model.add(Dense(1000, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.9))
# model.add(Dense(300, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.7))
model.add(Dense(10,activation="relu"))
model.add(Dense(3,activation= "sigmoid"))
checkpoint = ModelCheckpoint('model_cv_nn.h5', verbose=1, monitor='val_accuracy',save_best_only=True, mode='auto')

In [85]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [86]:
history = model.fit(X_train, Y_train,validation_data = (X_test,Y_test),callbacks = [checkpoint],epochs=10, batch_size=64)

Epoch 1/10
Epoch 00001: val_accuracy improved from -inf to 0.79577, saving model to model_cv_nn.h5
Epoch 2/10
Epoch 00002: val_accuracy improved from 0.79577 to 0.81762, saving model to model_cv_nn.h5
Epoch 3/10
Epoch 00003: val_accuracy improved from 0.81762 to 0.82377, saving model to model_cv_nn.h5
Epoch 4/10
Epoch 00004: val_accuracy improved from 0.82377 to 0.82650, saving model to model_cv_nn.h5
Epoch 5/10
Epoch 00005: val_accuracy improved from 0.82650 to 0.82787, saving model to model_cv_nn.h5
Epoch 6/10
Epoch 00006: val_accuracy improved from 0.82787 to 0.82923, saving model to model_cv_nn.h5
Epoch 7/10
Epoch 00007: val_accuracy did not improve from 0.82923
Epoch 8/10
Epoch 00008: val_accuracy did not improve from 0.82923
Epoch 9/10
Epoch 00009: val_accuracy did not improve from 0.82923
Epoch 10/10
Epoch 00010: val_accuracy did not improve from 0.82923


# Highest Test accuarcy 82.8% percent.

In [40]:
tfv = TfidfVectorizer(lowercase=True, stop_words='english', min_df=2)
tfvs = tfv.fit_transform(X.values.astype('U'))
X_tf_df = pd.DataFrame(tfvs.toarray(), columns=tfv.get_feature_names())

In [41]:
X_tf_df.describe()

Unnamed: 0,aa,aaba,aadfw,aadv,aadvantage,aafail,aal,aano,abandoned,abc,ability,able,aboard,abq,abroad,absolute,absolutely,absurd,abt,abused,abysmal,ac,accept,acceptable,accepted,accepting,access,accident,accidentally,accommodate,accommodated,accommodating,accommodation,accomplished,according,accordingly,account,accountability,acct,accts,...,yard,yay,yea,yeah,year,yearly,yell,yelled,yelling,yep,yes,yesso,yest,yesterday,yikes,yo,york,youd,youi,youll,young,younger,youre,yousuck,youth,youve,youyou,yow,yr,yuck,yuma,yummy,yup,yvonne,yvr,yyz,zero,zone,zoom,zurich
count,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,...,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0,14640.0
mean,0.00529,7.1e-05,5.1e-05,4.9e-05,0.000279,6.6e-05,7.4e-05,4.9e-05,6.1e-05,0.000232,0.000216,0.002821,0.000124,0.000271,7.3e-05,0.000669,0.000952,0.000427,0.000192,6.9e-05,0.000177,0.000242,0.000397,0.000664,0.000235,0.000304,0.000941,0.000197,0.000106,0.000454,6.2e-05,0.000234,0.000277,0.00011,0.000321,0.000137,0.001796,0.000193,0.000285,5.1e-05,...,5.8e-05,0.000393,0.000398,0.001189,0.003128,6.6e-05,8.5e-05,0.000237,0.00012,0.000515,0.006069,7.2e-05,0.000122,0.002524,0.0001,0.000497,0.000357,0.000485,8.2e-05,0.000694,0.000245,6e-05,0.003921,0.000134,8.8e-05,0.001441,0.000125,6.6e-05,0.001015,6.7e-05,0.000161,7.1e-05,0.000304,5.4e-05,0.000196,0.000352,0.000834,0.000262,5.4e-05,0.000104
std,0.042057,0.006084,0.004352,0.004223,0.010715,0.005687,0.006477,0.004223,0.005351,0.010827,0.010144,0.032248,0.007822,0.011102,0.006214,0.017728,0.022798,0.014624,0.008896,0.005986,0.009917,0.009842,0.012651,0.017067,0.010612,0.012545,0.018756,0.009941,0.006453,0.013371,0.005483,0.012012,0.011284,0.0078,0.011947,0.008598,0.025276,0.009599,0.010974,0.004346,...,0.004987,0.014824,0.014229,0.022452,0.033299,0.005693,0.005931,0.010254,0.007504,0.016134,0.05272,0.006194,0.007431,0.030511,0.009133,0.015062,0.012058,0.013637,0.007034,0.018078,0.009965,0.005111,0.038188,0.008112,0.006216,0.023532,0.01068,0.005638,0.018886,0.005717,0.009902,0.006098,0.012791,0.004618,0.008996,0.012512,0.017238,0.010058,0.004621,0.006399
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.811421,0.530481,0.372337,0.361347,0.493033,0.516293,0.658825,0.361347,0.55782,0.613946,0.645185,0.735145,0.683375,0.561279,0.560837,0.712762,1.0,0.696412,0.524695,0.588968,0.744023,0.489461,0.543682,0.693523,0.795147,0.683032,0.572444,0.700831,0.428936,0.482266,0.565355,0.840904,0.559511,0.649497,0.580662,0.740448,0.560789,0.542699,0.482168,0.385708,...,0.455103,0.748017,0.738265,0.824041,0.830116,0.503881,0.438937,0.526999,0.622738,0.846417,1.0,0.529946,0.511915,0.795425,1.0,0.838607,0.519154,0.546563,0.611474,0.703829,0.463303,0.456391,0.722792,0.556448,0.512377,0.592361,0.91378,0.524889,0.536196,0.510378,0.760025,0.534505,0.762366,0.40854,0.450055,0.600728,0.478956,0.454655,0.399599,0.462286


In [92]:
X_train, X_test, Y_train, Y_test = train_test_split(X_tf_df,Y,test_size = 0.1, random_state= 10)

In [43]:
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train,Y_train)
print("Train Accuracy : " + str(model.score(X_train,Y_train)), "Test Accuracy : " + str(model.score(X_test,Y_test)))

Train Accuracy : 0.9959016393442623 Test Accuracy : 0.8073770491803278


In [44]:
model = XGBClassifier(n_estimators = 150)
model.fit(X_train,Y_train)
print("Train Accuracy : " + str(model.score(X_train,Y_train)), "Test Accuracy : " + str(model.score(X_test,Y_test)))

Train Accuracy : 0.7491651487553127 Test Accuracy : 0.7288251366120219


In [45]:
model = DecisionTreeClassifier()
model.fit(X_train,Y_train)
print("Train Accuracy : " + str(model.score(X_train,Y_train)), "Test Accuracy : " + str(model.score(X_test,Y_test)))

Train Accuracy : 0.9959016393442623 Test Accuracy : 0.7527322404371585


In [93]:
def onehot(arr):
    li = np.zeros((3,len(arr)))
    for i,k in enumerate(arr):
        hot = [0,0,0]
        # print(k)
        hot[k] = 1
        li[:,i] = hot
    return li
Y_train = onehot(Y_train)
Y_test = onehot(Y_test)

In [94]:
Y_test = Y_test.transpose()
Y_train = Y_train.transpose()

In [95]:
Y_test.shape

(1464, 3)

In [96]:
X_train = np.array(X_train)
X_train.shape

(13176, 4974)

In [97]:
model = Sequential()
model.add(Dense(1000, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.9))
# model.add(Dense(300, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.7))
model.add(Dense(10,activation="relu"))
model.add(Dense(3,activation= "sigmoid"))
checkpoint = ModelCheckpoint('model_tf_nn.h5', verbose=1, monitor='val_accuracy',save_best_only=True, mode='auto')

In [98]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [99]:
history = model.fit(X_train, Y_train,validation_data = (X_test,Y_test),callbacks=[checkpoint],epochs=20, batch_size=64)

Epoch 1/20
Epoch 00001: val_accuracy improved from -inf to 0.61612, saving model to model_tf_nn.h5
Epoch 2/20
Epoch 00002: val_accuracy improved from 0.61612 to 0.82377, saving model to model_tf_nn.h5
Epoch 3/20
Epoch 00003: val_accuracy improved from 0.82377 to 0.82445, saving model to model_tf_nn.h5
Epoch 4/20
Epoch 00004: val_accuracy did not improve from 0.82445
Epoch 5/20
Epoch 00005: val_accuracy improved from 0.82445 to 0.83060, saving model to model_tf_nn.h5
Epoch 6/20
Epoch 00006: val_accuracy improved from 0.83060 to 0.83743, saving model to model_tf_nn.h5
Epoch 7/20
Epoch 00007: val_accuracy did not improve from 0.83743
Epoch 8/20
Epoch 00008: val_accuracy did not improve from 0.83743
Epoch 9/20
Epoch 00009: val_accuracy did not improve from 0.83743
Epoch 10/20
Epoch 00010: val_accuracy did not improve from 0.83743
Epoch 11/20
Epoch 00011: val_accuracy did not improve from 0.83743
Epoch 12/20
Epoch 00012: val_accuracy did not improve from 0.83743
Epoch 13/20
Epoch 00013: val

# Highest Accuracy Acheived 83.743

Lets Try PCA for the above vectors

In [None]:
from sklearn.decomposition import PCA
decomp = PCA(n_components=400)
decomp.fit(X_tf_df)
X_train = decomp.transform(X_train)
X_test =  decomp.transform(X_test)

In [None]:
model = Sequential()
model.add(Dense(200, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
# model.add(Dense(300, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(10,activation="relu"))
model.add(Dense(3,activation= "sigmoid"))

In [55]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [56]:
history = model.fit(X_train, Y_train,validation_data = (X_test,Y_test), epochs=20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


This alos does not help much. Let's try our new (self trained)word2vec embeddings

#### For lstm and word embeddings based approaches please refer to the next .ipynb file



