In [1]:
from sklearn.model_selection import train_test_split

In [2]:
# -*- coding: utf-8 -*-
"""Hackathon.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/19Y6nCWGiR6U0cKHUCMEo1XbPKbVr2xRx

# Importing Libraries
"""

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from wordcloud import WordCloud, STOPWORDS

data = pd.read_csv("Tweets.csv")

data.shape

data.describe()

data = data.loc[:,["text","airline_sentiment"]]

for k,tx in enumerate(data["text"]):
    li = " ".join([word for word in tx.split() if 'http' not in word and not word.startswith('@')and word != 'RT'])
    data.loc[k,"text"] = li

Y = data["airline_sentiment"].apply(lambda x: 0 if x == "negative"  else (1 if (x == "nuetral") else 2))

data["airline_sentiment"].value_counts()

"""# Text mining
1. Removing Stopword, Stemming/lemmatizing, Removing all the symbols
"""

punc = '''!()-[]{};:'"\, <>./?@#$%^&*_~'''

for k,tx in enumerate(data["text"]):
    res = re.sub(r'[^\w\s]', '', tx)
    data.loc[k,"text"] = ''.join([i.lower() for i in res if not i.isdigit()])

nltk.download("stopwords")

from nltk.corpus import stopwords
stops = stopwords.words('english')

for k,tx in enumerate(data["text"]):
    li = " ".join([word for word in tx.split() if word not in stops])
    data.loc[k,"text"] = li

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
ltz = WordNetLemmatizer()

for k,tx in enumerate(data["text"]):
    li = " ".join([ltz.lemmatize(word) for word in tx.split()])
    data.loc[k,"text"] = li

X = data["text"]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile

In [4]:
def onehot(arr):
    li = np.zeros((3,len(arr)))
    for i,k in enumerate(arr):
        hot = [0,0,0]
        # print(k)
        hot[k] = 1
        li[:,i] = hot
    return li

In [5]:
from  tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint

In [6]:
max_len = max([len(s.split()) for s in data["text"]])

In [7]:
token = Tokenizer()
token.fit_on_texts(data["text"])

In [8]:
vocab_size = len(token.word_index) + 1

# LSTM based Network

In [9]:
X = token.texts_to_sequences(data["text"])
X_pad = pad_sequences(X,maxlen=max_len, padding="post")
Y = data[data.columns[1]]
Y = data["airline_sentiment"].apply(lambda x: 0 if x == "negative"  else (1 if (x == "nuetral") else 2))

In [10]:
X_train,X_test,Y_train,Y_test = train_test_split(X_pad,Y, test_size=0.1, random_state = 10)

In [11]:
Y_train = onehot(Y_train)
Y_test = onehot(Y_test)

Y_test = Y_test.transpose()
Y_train = Y_train.transpose()


In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Embedding,LSTM,Bidirectional
from tensorflow.keras.optimizers import Adam

In [13]:
model = Sequential()
model.add(Embedding(vocab_size,100 , input_length=max_len))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.3))
model.add(Dense(3,activation='softmax'))
checkpoint = ModelCheckpoint('model.h5', verbose=1, monitor='val_accuracy',save_best_only=True, mode='auto')
opt = Adam(learning_rate=0.001)
model.compile(loss = 'categorical_crossentropy', optimizer=opt,metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 21, 100)           1153100   
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                34048     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 3)                 195       
Total params: 1,187,343
Trainable params: 1,187,343
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
history = model.fit(X_train,Y_train, epochs=4, callbacks=[checkpoint],validation_data=(X_test,Y_test), batch_size = 64)

Epoch 1/4
Epoch 00001: val_accuracy improved from -inf to 0.82309, saving model to model.h5
Epoch 2/4
Epoch 00002: val_accuracy improved from 0.82309 to 0.82992, saving model to model.h5
Epoch 3/4
Epoch 00003: val_accuracy did not improve from 0.82992
Epoch 4/4
Epoch 00004: val_accuracy did not improve from 0.82992


# RNN based Network

In [15]:
from tensorflow.keras.layers import RNN,GRU,SimpleRNN
model = Sequential()
model.add(Embedding(vocab_size,100 , input_length=max_len))
model.add(Bidirectional(SimpleRNN(32)))
# model.add(Dropout(0.3))
model.add(Dense(10))
model.add(Dense(3,activation='softmax'))
checkpoint = ModelCheckpoint('model.h5', verbose=1, monitor='val_accuracy',save_best_only=True, mode='auto')
opt = Adam(learning_rate=0.001)
model.compile(loss = 'categorical_crossentropy', optimizer=opt,metrics = ['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 21, 100)           1153100   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                8512      
_________________________________________________________________
dense_1 (Dense)              (None, 10)                650       
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 33        
Total params: 1,162,295
Trainable params: 1,162,295
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
history = model.fit(X_train,Y_train, epochs=4, callbacks=[checkpoint],validation_data=(X_test,Y_test), batch_size = 64)

Epoch 1/4
Epoch 00001: val_accuracy improved from -inf to 0.81831, saving model to model.h5
Epoch 2/4
Epoch 00002: val_accuracy did not improve from 0.81831
Epoch 3/4
Epoch 00003: val_accuracy did not improve from 0.81831
Epoch 4/4
Epoch 00004: val_accuracy did not improve from 0.81831


# Word2Vec and LSTM based network

In [17]:
sentences = [[] for k in range(len(data["text"]))]
for k,tx in enumerate(data["text"]):
    sentences[k]  = tx.split()
model = Word2Vec(sentences, min_count=1)
# summarize the loaded model
print(model)
# summarize vocabulary
words = list(model.wv.vocab)
print(words)
# access vector for one word
print(model['avoid'])
# save model
model.save('model.bin')
file_name = "word2vec_embeddings.txt"
model.wv.save_word2vec_format(file_name, binary = False)
# load model
new_model = Word2Vec.load('model.bin')

Word2Vec(vocab=11535, size=100, alpha=0.025)
[-0.06572042  0.06964056  0.00193242  0.06450636 -0.08318073  0.1594165
  0.05801625  0.09684096 -0.03928953  0.02660044  0.24899054  0.02195137
  0.06058518  0.02939128  0.01899717 -0.03975121  0.12485717 -0.08042764
  0.09309425  0.02920168  0.08335529 -0.01625478  0.03025378  0.02974424
 -0.004498    0.10669902 -0.08014639  0.07856758 -0.02721353 -0.04945983
 -0.02877497 -0.03273527 -0.02452275 -0.03127172 -0.02639201  0.01452216
 -0.07027247 -0.05149049  0.02980359 -0.07765371  0.0396671   0.04006227
  0.05489695 -0.04230032  0.04569951  0.03470952  0.05098514  0.02410112
  0.04043869  0.04747094 -0.03307584  0.03556455  0.06926901  0.09420846
  0.04385651 -0.04308546  0.11398849 -0.00037347 -0.02027055 -0.00527063
  0.07613422 -0.0207675  -0.02567606 -0.07055567 -0.01108329 -0.09073705
 -0.06125797  0.03010793  0.0735274  -0.02220723 -0.08273169  0.00157015
  0.11247395 -0.00030338  0.13046905  0.02234729  0.08691587 -0.13179897
  0.008

  # This is added back by InteractiveShellApp.init_path()
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [18]:
dic = {}
for tx in data["text"]:
    for k in tx.split():
        if k not in dic:
            dic[k] = len(dic)
embedding_dict = {}
for k in dic.keys():
    embedding_dict[k] = model[k]

  


In [19]:
token = Tokenizer()
token.fit_on_texts(data["text"])
max_len = max([len(s.split()) for s in data["text"]])
vocab_size = len(token.word_index) + 1
X = token.texts_to_sequences(data["text"])
X_pad = pad_sequences(X,maxlen=max_len, padding="post")

In [20]:
word_index = len(token.word_index)
num_words = word_index + 1
embeding_mat = np.zeros((num_words,100))
for word,i in token.word_index.items():
    if(word in embedding_dict):
        embeding_mat[i] = embedding_dict[word]

In [21]:
from tensorflow.keras.initializers import Constant

In [22]:
model = Sequential()
embed = Embedding(num_words,100,embeddings_initializer=Constant(embeding_mat),input_length=max_len,trainable = True)
model.add(embed)
model.add(Bidirectional(LSTM(32)))
# model.add(Dense(10))
model.add(Dense(3,activation='softmax'))
checkpoint = ModelCheckpoint('model_word2vec.h5', verbose=1, monitor='val_accuracy',save_best_only=True, mode='auto')
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 21, 100)           1153100   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 64)                34048     
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 195       
Total params: 1,187,343
Trainable params: 1,187,343
Non-trainable params: 0
_________________________________________________________________
None


In [23]:
history = model.fit(X_train,Y_train, epochs=4, callbacks=[checkpoint],validation_data=(X_test,Y_test), batch_size = 32)

Epoch 1/4
Epoch 00001: val_accuracy improved from -inf to 0.81284, saving model to model_word2vec.h5
Epoch 2/4
Epoch 00002: val_accuracy improved from 0.81284 to 0.82855, saving model to model_word2vec.h5
Epoch 3/4
Epoch 00003: val_accuracy did not improve from 0.82855
Epoch 4/4
Epoch 00004: val_accuracy did not improve from 0.82855


# Word2Vec + RNN based network    

In [24]:
model = Sequential()
embed = Embedding(num_words,100,embeddings_initializer=Constant(embeding_mat),input_length=max_len,trainable = True)
model.add(embed)
model.add(Bidirectional(SimpleRNN(10)))
# model.add(Dense(10))
model.add(Dense(3,activation='softmax'))
checkpoint = ModelCheckpoint('model_word2vec.h5', verbose=1, monitor='val_accuracy',save_best_only=True, mode='auto')
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())
history = model.fit(X_train,Y_train, epochs=4, callbacks=[checkpoint],validation_data=(X_test,Y_test), batch_size = 32)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 21, 100)           1153100   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 20)                2220      
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 63        
Total params: 1,155,383
Trainable params: 1,155,383
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/4
Epoch 00001: val_accuracy improved from -inf to 0.78552, saving model to model_word2vec.h5
Epoch 2/4
Epoch 00002: val_accuracy improved from 0.78552 to 0.81421, saving model to model_word2vec.h5
Epoch 3/4
Epoch 00003: val_accuracy did not improve from 0.81421
Epoch 4/4
Epoch 00004: val_accuracy did not improve from 0.81421


# Unviersal Encoding Based Classification

In [25]:
from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" 
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  return model(input)

  import pandas.util.testing as tm


module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [26]:
arr_embed = embed(data["text"])

In [27]:
arr_embed.shape

TensorShape([14640, 512])

In [28]:
X_train, X_test, Y_train, Y_test = train_test_split(np.array(arr_embed),Y,test_size = 0.05, random_state= 10)

In [29]:
X_train.shape

(13908, 512)

In [30]:
Y_train = onehot(Y_train)
Y_test = onehot(Y_test)

Y_test = Y_test.transpose()
Y_train = Y_train.transpose()

In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout
model = Sequential()
model.add(Dense(300, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.9))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.7))
model.add(Dense(10,activation="relu"))
model.add(Dense(3,activation= "sigmoid"))
# opt =Adam(learning_rate=0.01)
checkpoint = ModelCheckpoint('use_model.h5', verbose=1, monitor='val_accuracy',save_best_only=True, mode='auto')
model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

history = model.fit(X_train, Y_train,validation_data = (X_test,Y_test), epochs=20,callbacks=[checkpoint], batch_size=32)


Epoch 1/20
Epoch 00001: val_accuracy improved from -inf to 0.81831, saving model to use_model.h5
Epoch 2/20
Epoch 00002: val_accuracy improved from 0.81831 to 0.82923, saving model to use_model.h5
Epoch 3/20
Epoch 00003: val_accuracy improved from 0.82923 to 0.83470, saving model to use_model.h5
Epoch 4/20
Epoch 00004: val_accuracy improved from 0.83470 to 0.83743, saving model to use_model.h5
Epoch 5/20
Epoch 00005: val_accuracy improved from 0.83743 to 0.83880, saving model to use_model.h5
Epoch 6/20
Epoch 00006: val_accuracy did not improve from 0.83880
Epoch 7/20
Epoch 00007: val_accuracy improved from 0.83880 to 0.84153, saving model to use_model.h5
Epoch 8/20
Epoch 00008: val_accuracy did not improve from 0.84153
Epoch 9/20
Epoch 00009: val_accuracy did not improve from 0.84153
Epoch 10/20
Epoch 00010: val_accuracy did not improve from 0.84153
Epoch 11/20
Epoch 00011: val_accuracy did not improve from 0.84153
Epoch 12/20
Epoch 00012: val_accuracy did not improve from 0.84153
Epoc

In [32]:
model.load_weights("use_model.h5")
model.evaluate(X_test,Y_test, verbose=True)



[0.36983728408813477, 0.8428961634635925]

#Highest Accuracies 
1. 85.65% USE based model
2. 84% word2vec LSTM 
3. 83.8 Only LSTM