In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import nltk
import torch
import torch.nn as nn
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader,TensorDataset
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
device=torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

In [None]:
# Load the tweets CSV from the Kaggle input directory and preview the first five rows
tweets=pd.read_csv('/kaggle/input/tweetsentimentanalysis/Tweets.csv')
tweets.head(5)

In [None]:
# Summarise the frame: column names, dtypes and non-null counts
tweets.info()

In [None]:
# Class balance of the target column.
# The column is passed as `x=` explicitly: since seaborn 0.12 the only positional
# argument of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x=tweets['airline_sentiment']);

In [None]:
# Features are the raw tweet texts; targets are the sentiment strings
data=tweets['text'].values
labels=tweets['airline_sentiment'].values
# Map the sentiment strings to integer class ids; `le` is kept so predictions
# can be mapped back to strings later with le.inverse_transform
le=LabelEncoder()
label=le.fit_transform(labels)
# Hold out 10% of the rows for testing; random_state fixes the shuffle for reproducibility
x_train,x_test,y_train,y_test=train_test_split(data,label,test_size=0.10,random_state=42)

TEXT PREPROCESSING

In [None]:
import re
def preprocessing(s):
    """Strip punctuation/symbols, whitespace and digits from a string.

    Three regex deletions are applied in order:
      1. every character that is neither a word character nor whitespace,
      2. every run of whitespace,
      3. every digit.

    Args:
        s: the text to clean (the tokenizer calls this once per token).

    Returns:
        The cleaned string; it may be empty if nothing survives.
    """
    for pattern in (r"[^\w\s]", r"\s+", r"\d"):
        s = re.sub(pattern, '', s)

    return s

In [None]:
from nltk.corpus import stopwords

def tokenizer(x_train,x_test):
    """Build a vocabulary from the training texts and integer-encode both splits.

    Every sentence is lower-cased and split on whitespace, and its first token
    is skipped (presumably the leading @airline mention -- confirm against the
    data). Each remaining token is cleaned with `preprocessing`.

    The vocabulary holds the 1500 most frequent cleaned training tokens that
    are non-empty and not English stop words. Ids start at 1, which leaves 0
    free for padding.

    Args:
        x_train: iterable of training sentences (strings).
        x_test: iterable of test sentences (strings).

    Returns:
        (train_list, test_list, one_hot_dict): the two splits as lists of
        token-id lists (out-of-vocabulary tokens are dropped) and the
        word -> id mapping.
    """
    english_stops = set(stopwords.words('english'))

    # Frequency table over the training split only, so the test split cannot
    # influence the vocabulary
    corpus = Counter()
    for sent in x_train:
        for raw in sent.lower().split()[1:]:
            word = preprocessing(raw)
            if word != '' and word not in english_stops:
                corpus[word] += 1

    ranked = sorted(corpus, key=corpus.get, reverse=True)[:1500]
    one_hot_dict = {}
    for rank, word in enumerate(ranked, start=1):
        one_hot_dict[word] = rank

    def encode(sent):
        # Same lower/split/skip-first/clean pipeline as above, keeping known words only
        cleaned = (preprocessing(raw) for raw in sent.lower().split()[1:])
        return [one_hot_dict[word] for word in cleaned if word in one_hot_dict]

    train_list = [encode(sent) for sent in x_train]
    test_list = [encode(sent) for sent in x_test]

    return train_list, test_list, one_hot_dict
        
# Encode both splits as token-id lists; `vocab` is the word -> id mapping built from the training split
x_tr,x_ts,vocab=tokenizer(x_train,x_test)

In [None]:
# Length of the longest encoded training sentence (used to choose the padding size)
a=list(map(len,x_tr))
max_len_index=a.index(max(a))
len(x_tr[max_len_index])

In [None]:
# Fixed sequence length: every encoded sentence is padded with zeros to this many
# token ids so that all samples can be stacked into same-size tensors
pad=25
def padding(data,padd):
    """Left-pad (or truncate) each token-id sequence to exactly `padd` ids.

    Sequences shorter than `padd` are right-aligned and filled with zeros on
    the left. Sequences longer than `padd` keep their first `padd` ids; before
    this change they raised a broadcasting error. Empty sequences become all
    zeros.

    The input list is not modified, so the cell that calls this can be re-run
    safely. The previous in-place version replaced the caller's lists with
    arrays and failed on a second call.

    Args:
        data: iterable of token-id sequences (lists or arrays of ints).
        padd: target sequence length.

    Returns:
        A new list with one integer array of shape (1, padd) per input sequence.
    """
    padded = []
    for sent in data:
        feature = np.zeros((1, padd), dtype=int)
        # Flatten so both plain lists and already-padded (1, n) arrays are accepted
        ids = np.asarray(sent, dtype=int).reshape(-1)[:padd]
        if ids.size:
            feature[0, -ids.size:] = ids
        padded.append(feature)
    return padded

# Pad both splits to `pad` ids per sentence.
# The longest encoded training sentence has 18 ids, so 25 leaves some headroom.
X_train=padding(x_tr,pad)
X_test=padding(x_ts,pad)

In [None]:
class sentimentLSTM(nn.Module):
    """Embedding -> LSTM -> dropout -> ReLU -> linear classifier with 3 output classes.

    Args:
        num_layers: number of stacked LSTM layers.
        batch_size: accepted for call compatibility; not stored or used.
        hidden_size: LSTM hidden state size.
        vocab_size: number of rows in the embedding table.
        embed_size: embedding dimension.
        p: dropout probability applied to the LSTM output.
        pad: optional padded sequence length. It is only stored on the module;
            `forward` takes the sequence length from its input, so the
            argument may be omitted.
    """
    def __init__(self,num_layers,batch_size,hidden_size,vocab_size,embed_size,p,pad=None):
        super(sentimentLSTM,self).__init__()

        self.hidden=hidden_size
        self.embed=nn.Embedding(vocab_size,embed_size)
        self.embed_size=embed_size
        self.num_layers=num_layers
        self.p=p
        self.pad=pad
        self.lstm=nn.LSTM(input_size=self.embed_size,
                         hidden_size=self.hidden,
                         num_layers=self.num_layers,
                         batch_first=True)

        self.linear=nn.Linear(self.hidden,3)
        self.drop=nn.Dropout(self.p)
        # dim is given explicitly; the implicit-dim form is deprecated
        self.soft=nn.Softmax(dim=1)
        self.relu=nn.ReLU()

    def forward(self,x,hidden):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: integer tensor whose first dimension is the batch; the rest is
                flattened to one sequence of token ids per sample.
            hidden: accepted for call compatibility but not fed to the LSTM.
                Every batch starts from the LSTM's default zero state, which
                also keeps a smaller final batch from clashing with a state
                sized for a full batch.

        Returns:
            (logits, hidden): logits of shape (batch, 3) computed from the last
            time step, and the LSTM's final (h_n, c_n) state.
        """
        batch=x.shape[0]
        x=x.view(batch,-1)
        x=self.embed(x)
        x,hidden=self.lstm(x)
        # Only the last time step is classified, so select it before the head
        # rather than running the head on every step and discarding the rest
        x=x[:,-1,:]
        x=self.drop(x)
        x=self.relu(x)
        out=self.linear(x)
        return out,hidden

    def init_hidden(self,b):
        """Return a zero (h0, c0) pair sized for a batch of `b` sequences.

        The tensors are created on the same device as the module's parameters.
        """
        dev=next(self.parameters()).device
        shape=(self.num_layers,b,self.hidden)
        h0=torch.zeros(shape,device=dev)
        c0=torch.zeros(shape,device=dev)
        return (h0,c0)

In [None]:
# --- Baseline run: hyperparameters, model, loss and optimiser ---
epochs=20
lr=0.001
hidden_size=512
vocab_size=len(vocab)+1  # +1 because id 0 is the padding value, not a word
embed_size=300
num_layers=1
batch_size=128
p=0.5
model=sentimentLSTM(num_layers,batch_size,hidden_size,vocab_size,embed_size,p,pad).to(device)
criterion=nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Build integer tensors once. Stacking through numpy first avoids the slow
# element-wise conversion of a list of arrays and the float round-trip.
train_data = TensorDataset(torch.as_tensor(np.array(X_train), dtype=torch.long),
                           torch.as_tensor(np.array(y_train), dtype=torch.long))
test_data = TensorDataset(torch.as_tensor(np.array(X_test), dtype=torch.long),
                          torch.as_tensor(np.array(y_test), dtype=torch.long))

train_loader=DataLoader(dataset=train_data,
                        shuffle=True,
                        batch_size=batch_size)

# Evaluation does not depend on sample order, so the test loader is not shuffled
test_loader=DataLoader(dataset=test_data,
                        shuffle=False,
                        batch_size=batch_size)

soft=nn.Softmax(dim=1) #using for normalization 0-1 

train_losses=[]
test_losses=[]

for epoch in range(epochs):
    # ---- training pass ----
    train_loss=0
    train_pred=[]
    h = model.init_hidden(batch_size)
    labels=[]
    model.train()
    for inputs,label in train_loader:

        inputs=inputs.to(device)
        label=label.to(device)

        # Detach the carried state so gradients never flow across batches
        h = tuple([each.data for each in h])
        pred,h=model(inputs,h)

        loss=criterion(pred,label)
        train_loss+=loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        labels.append(label.cpu().numpy())
        # argmax over logits equals argmax over softmax, so softmax is not needed here
        train_pred.append(pred.argmax(dim=1).cpu().numpy())

    # Mean loss per batch. Dividing by the batch count (not the last enumerate
    # index) avoids an off-by-one and a ZeroDivisionError with a single batch.
    avg_train_loss=train_loss/len(train_loader)
    train_losses.append(avg_train_loss)
    acc_train=accuracy_score(np.concatenate(labels),np.concatenate(train_pred))

    # ---- evaluation pass ----
    model.eval()
    test_loss=0
    test_preds=[]
    test_labels=[]

    # No autograd graph is needed for evaluation
    with torch.no_grad():
        for input_test,label_test in test_loader:

            input_test=input_test.to(device)
            label_test=label_test.to(device)

            h = tuple([each.data for each in h])
            pred_test,h=model(input_test,h)

            loss_test=criterion(pred_test,label_test)
            test_loss+=loss_test.item()

            test_labels.append(label_test.cpu().numpy())
            test_preds.append(pred_test.argmax(dim=1).cpu().numpy())

    avg_test_loss=test_loss/len(test_loader)
    test_losses.append(avg_test_loss)
    acc_test=accuracy_score(np.concatenate(test_labels),np.concatenate(test_preds))
    print('epochs {} Train_accurary {:.3f},train error {:.3f} , Test_accuracy {:.3f} Test_error {:.3f}'.format(epoch,acc_train,avg_train_loss,acc_test,avg_test_loss))
        
    

NOTE: the model overfits the training data; this still needs to be fixed :)

In [None]:
## Plot the graph here
import matplotlib.pyplot as plt
# Mean per-batch loss for each epoch on the training and test splits.
# Title and axis labels are set so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(10,5))
ax.plot(train_losses ,label='Train Loss')
ax.plot(test_losses,label='Test Loss')
ax.set(title='Loss per epoch', xlabel='Epoch', ylabel='Cross-entropy loss')
ax.legend(frameon=True);

In [None]:

#we enter the model with an expression and observe the result
text='everything was good'

#text preprocessing
# Encode the text with the training vocabulary. Unknown words are dropped and
# at most `pad` ids are kept so the sequence always fits the model's input.
word_seq = np.array([vocab[preprocessing(word)] for word in text.lower().split()
                 if preprocessing(word)  in vocab.keys()], dtype=int)[:pad]
feature=np.zeros((1,pad),dtype=int)
# Guard the empty case: with no known words, `feature[:, -0:]` would select the
# whole row and the assignment would fail. The input then stays all padding.
if word_seq.size:
    feature[:,-word_seq.size:]=word_seq
inputs =  torch.from_numpy(feature).type(torch.LongTensor).to(device)

model.eval()
with torch.no_grad():
    h = model.init_hidden(inputs.shape[0])
    h = tuple([each.data for each in h])
    output, h = model(inputs, h)

# The largest logit is the predicted class; map the id back to its sentiment string
print(le.inverse_transform([output.argmax(dim=1).item()])[0])

In [None]:
# Because of overfitting, we need hyperparameter tuning

from itertools import product

# Search space for the tuning run: one row of `parameters` per combination.
# The candidate lists live in a dict instead of being bound to p / lr /
# embed_size / batch_size, so the scalar hyperparameters used by earlier
# cells are not replaced by lists.
param_grid = {
    'p':     [0.2, 0.3, 0.4, 0.6, 0.7, 0.8],   # dropout probability
    'wd':    [0.01, 0.001, 0.0001],            # Adam weight decay
    'lr':    [0.01, 0.001, 0.0001],            # learning rate
    'embed': [32, 64, 128, 256],               # embedding dimension
    'batch': [32, 64, 128, 256],               # batch size
}
# hidden_size is not searched; candidates once considered: [32,64,128,512,1024]
parameters=pd.DataFrame(
    list(product(*param_grid.values())),
    columns=list(param_grid)
)

parameters

In [None]:

# Hyperparameter search: train a fresh model for 20 epochs on every row of
# `parameters` and record the per-epoch train/test accuracy.
# `train` and `test` collect one {row_index: {'epoch<N>': accuracy}} dict per epoch.
train=[]   # this initialisation had been swallowed into the comment line, so train.append raised NameError
test=[]
# NOTE(review): epochs_tr / epochs_ts are never filled in this cell; they are
# kept in case a later cell reads them.
epochs_tr={}
epochs_ts={}

# The datasets do not depend on the hyperparameters, so build them once
train_data = TensorDataset(torch.as_tensor(np.array(X_train), dtype=torch.long),
                           torch.as_tensor(np.array(y_train), dtype=torch.long))
test_data = TensorDataset(torch.as_tensor(np.array(X_test), dtype=torch.long),
                          torch.as_tensor(np.array(y_test), dtype=torch.long))

for  index,i in enumerate(np.array(parameters)):

    p=  float(i[0])
    wd= float(i[1])
    lr= float(i[2])
    embed_size=int(i[3])
    batch_size=int(i[4])

    train_loader=DataLoader(dataset=train_data,
                        shuffle=True,
                        batch_size=batch_size)
    # Evaluation does not depend on sample order
    test_loader=DataLoader(dataset=test_data,
                        shuffle=False,
                        batch_size=batch_size)

    # NOTE(review): param_acc is reset per combination but never filled
    param_acc={}
    hidden_size=512
    vocab_size=len(vocab)+1
    num_layers=1
    # `pad` is passed explicitly; the constructor call previously omitted it
    model=sentimentLSTM(num_layers,batch_size,hidden_size,vocab_size,embed_size,p,pad).to(device)
    criterion=nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    soft=nn.Softmax(dim=1)

    # Loss histories of the current combination (overwritten on the next one)
    train_l=[]
    test_l=[]

    for epoch in range(20):
        # ---- training pass ----
        train_loss=0
        train_pred=[]
        h = model.init_hidden(batch_size)
        labels=[]
        model.train()
        for inputs,label in train_loader:

            inputs=inputs.to(device)
            label=label.to(device)

            # Detach the carried state so gradients never flow across batches
            h = tuple([each.data for each in h])
            pred,h=model(inputs,h)

            loss=criterion(pred,label)
            train_loss+=loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            labels.append(label.cpu().numpy())
            # argmax over logits equals argmax over softmax
            train_pred.append(pred.argmax(dim=1).cpu().numpy())

        # Mean over the number of batches, not the last enumerate index
        train_l.append(train_loss/len(train_loader))

        acc_train=accuracy_score(np.concatenate(labels),np.concatenate(train_pred))

        # ---- evaluation pass ----
        model.eval()
        test_loss=0
        test_preds=[]
        test_labels=[]

        # No autograd graph is needed for evaluation
        with torch.no_grad():
            for input_test,label_test in test_loader:

                input_test=input_test.to(device)
                label_test=label_test.to(device)

                h = tuple([each.data for each in h])
                pred_test,h=model(input_test,h)

                loss_test=criterion(pred_test,label_test)
                test_loss+=loss_test.item()

                test_labels.append(label_test.cpu().numpy())
                test_preds.append(pred_test.argmax(dim=1).cpu().numpy())

        test_l.append(test_loss/len(test_loader))
        acc_test=accuracy_score(np.concatenate(test_labels),np.concatenate(test_preds))
        train.append({index: {'epoch'+str(epoch):acc_train}})
        test.append({index:  {'epoch'+str(epoch):acc_test}})
        