In [1]:
import nltk
import pandas
import re
import numpy
import string

In [2]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import train_test_split

In [4]:
data = pandas.read_csv('FinalTweetList.csv')
data.reset_index(inplace=True)
data.drop(columns='Response', inplace=True)
data.rename(columns={'index':'Tweets', 'Tweet':'Responses'}, inplace=True)
data.Responses.replace(['displeasure', 'compliment', 'miscellaneous'], [0, 1, 2], inplace=True)
data

Unnamed: 0,Tweets,Responses
0,@ICICIBank_Care Bank employees of 21 c Faridab...,0
1,@AxisBankSupport 18..233.... is permanently bu...,0
2,"@TheOfficialSBI 3 hours in line,still waiting....",0
3,@HDFCBank_Cares Both the ATM machines here in ...,0
4,@ICICIBank Thanks icici bank I got case in you...,2
...,...,...
5689,@HDFCBank_Cares over 25 calls this week.I gues...,0
5690,@ICICIBank_Care e-Pay slip as system not worki...,0
5691,@TheOfficialSBI staff at Sbi Akaltara branch C...,2
5692,"@singhraviranja4 @AxisBankSupport Hi, sorry fo...",2


In [5]:
banklist = ['sbi', 'hdfc', 'axis', 'icici', 'kotak', 'citi', 'boi',
                 'bankofbaroda', 'allahabad', 'indus', 'oriental',
                 'idbi', 'vijaya', 'union']

In [6]:
def assign_banknames(banknames, banklist):
    final_list = []
    for idx,xyz in enumerate(banknames):
        for name in banklist:
            if name in xyz:
                final_list.append(name)
    return list(set(final_list))

In [7]:
banks = []
vocab = []

#pos.open('positive.txt', 'w')
#neg.open('negative.txt', 'w')

for idx,row in data.iterrows():
    tweet = row['Tweets']
    usernames = [temp[1:] for temp in re.findall(r'\B@\w+', tweet)] 
    
    tweet = [temp for temp in tweet.split() if not temp.startswith('https')]
    tweet = ' '.join([word for word in tweet if word[1:] not in usernames])
    tweet = re.sub('[!#?,.:";]', ' ', tweet)
    tweet = ' '.join([word.lower() for word in tweet.split() if word.lower() not in stop_words 
                      and len(word) > 2 and word.isalpha()])

    banknames = [username.lower() for username in usernames 
                 if ('Bank' in username) or ('SBI' in username)
                 or ('bank' in username) or ('RBI' in username)
                 or ('sbi' in username)]
    banknames = ' '.join(assign_banknames(banknames, banklist))
    
    data['Tweets'][idx] = tweet + ' ' + banknames
    vocab.extend(data['Tweets'][idx].split())
    
    if len(banknames) > 0 and row['Responses'] != 2:
        banks.append(banknames)
    else:
        data.drop(idx, inplace=True)

data.reset_index(drop=True, inplace=True)
data['Banks'] = pandas.Series(numpy.array(banks), index=data.index)

data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Tweets,Responses,Banks
0,bank employees faridabad making fools customer...,0,icici
1,permanently busy cannot beyond language select...,0,axis
2,hours line still waiting wish staff cld work e...,0,sbi
3,atm machines times india noida office unable d...,0,hdfc
4,assam banks exchanging exchanged yesterday tod...,0,theofficialsbi
...,...,...,...
3420,see image says found sbi,0,sbi
3421,thanks hdfc,1,hdfc
3422,calls week guess ull take rejection well stop ...,0,hdfc
3423,slip system working sec dwarka really khyaalaa...,0,icici


In [8]:
all_words = []
all_tweets = []
all_responses = []
for _,row in data.iterrows():
    tweet = row['Tweets']
    all_tweets.append(tweet)
    all_responses.append(row['Responses'])
    all_words.extend(tweet.split())

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
encoded_tweets = cv.fit_transform(all_tweets).toarray()

In [10]:
x_train, x_test, y_train, y_test = train_test_split(encoded_tweets, all_responses, test_size=0.1, shuffle=True)

In [11]:
temp = []
for arr in x_train:
    temp.append(numpy.array([arr]))
x_train = numpy.array(temp)

temp = []
for arr in x_test:
    temp.append(numpy.array([arr]))
x_test = numpy.array(temp)

In [12]:
print(x_test.shape)
print(x_train.shape)

(343, 1, 5511)
(3082, 1, 5511)


In [13]:
x_train = torch.from_numpy(x_train)
y_train = torch.from_numpy(numpy.array(y_train))
x_val = torch.from_numpy(x_test)
y_val = torch.from_numpy(numpy.array(y_test))

In [14]:
device = torch.device('cpu')
input_size = 5511
hidden_size = 128
num_layers = 3

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.func = nn.Sequential(
            nn.Linear(hidden_size, 32), nn.ReLU(),
            nn.Linear(32, 1), nn.Sigmoid())

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        lstm_out, _ = self.lstm(x, (h0, c0))
        out = self.func(lstm_out[:, -1, :])
        return out

In [15]:
model = LSTM(input_size, hidden_size, num_layers).to(device)
model = model.float()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [16]:
num_epochs = 50
for epoch in range(num_epochs):
    
    x_train = x_train.to(device).float()
    y_train = y_train.to(device).float()
    x_val = x_val.to(device).float()
    y_val = y_val.to(device).float()
    
    output = model(x_train.float())
    loss = criterion(output, y_train)
    
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    correct_prediction = 0
    total = 0
    
    
    val_output = model(x_val)
    val_loss = criterion(val_output, y_val)

    _, prediction = torch.max(val_output.data, 1)

    total += y_val.size(0)
    correct_prediction += (prediction == y_val).sum().item()

    val_accuracy = (correct_prediction / total) * 100
    print('Epoch [{}/{}], Loss: {:.4f}, Val Loss: {:.4f}'
           .format(epoch+1, num_epochs, loss.item(), val_loss.item()))


  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


Epoch [1/50], Loss: 0.6424, Val Loss: 0.6464
Epoch [2/50], Loss: 0.6400, Val Loss: 0.6444
Epoch [3/50], Loss: 0.6377, Val Loss: 0.6423
Epoch [4/50], Loss: 0.6352, Val Loss: 0.6401
Epoch [5/50], Loss: 0.6328, Val Loss: 0.6379
Epoch [6/50], Loss: 0.6302, Val Loss: 0.6357
Epoch [7/50], Loss: 0.6276, Val Loss: 0.6334
Epoch [8/50], Loss: 0.6250, Val Loss: 0.6312
Epoch [9/50], Loss: 0.6223, Val Loss: 0.6287
Epoch [10/50], Loss: 0.6194, Val Loss: 0.6261
Epoch [11/50], Loss: 0.6163, Val Loss: 0.6232
Epoch [12/50], Loss: 0.6129, Val Loss: 0.6200
Epoch [13/50], Loss: 0.6090, Val Loss: 0.6166
Epoch [14/50], Loss: 0.6049, Val Loss: 0.6128
Epoch [15/50], Loss: 0.6002, Val Loss: 0.6085
Epoch [16/50], Loss: 0.5949, Val Loss: 0.6037
Epoch [17/50], Loss: 0.5890, Val Loss: 0.5982
Epoch [18/50], Loss: 0.5822, Val Loss: 0.5920
Epoch [19/50], Loss: 0.5743, Val Loss: 0.5848
Epoch [20/50], Loss: 0.5651, Val Loss: 0.5766
Epoch [21/50], Loss: 0.5546, Val Loss: 0.5673
Epoch [22/50], Loss: 0.5425, Val Loss: 0.55