In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from torch.utils.data import Dataset
import torch
import torch.nn as nn
from torch.nn.modules import dropout
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
import torch.nn.functional as F
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', 1000)

In [2]:
train_df=pd.read_csv("/Users/srishtysuman/PycharmProjects/NaturalLanguageProcessing/sentiment_analysis_data/train.csv", encoding='latin1')
test_df=pd.read_csv("/Users/srishtysuman/PycharmProjects/NaturalLanguageProcessing/sentiment_analysis_data/test.csv", encoding='latin1')

# 1. get only relevant columns as train_df and test_df
train_df=train_df[['text', 'sentiment']].dropna()
test_df=test_df[['text', 'sentiment']].dropna()

# 2. for sentiment, give binary labels
def binary_labels(label):
    if label=="positive":
        return 2
    elif label=="neutral":
        return 1
    else:
        return 0
train_df["sentiment"]=train_df["sentiment"].apply(binary_labels)
test_df["sentiment"]=test_df["sentiment"].apply(binary_labels)
test_df

def clean_text(text):
    if type(text)!=str or pd.isnull(text) or text=='':
        return ''
    text=text.lower()   
    link_re_pattern = "https?:\/\/t.co/[\w]+"
    text=re.sub(link_re_pattern, '', text)
    text=re.sub("\`have", 'have', text)   
    text=re.sub("\`ve", ' have', text)   
    text = text.lower()
    text = re.sub("\'s", " ", text) 
    text = re.sub(" whats ", " what is ", text, flags=re.IGNORECASE)
    text = re.sub("can't", "can not", text)
    text = re.sub("n't", " not ", text)
    text = re.sub("i'm", "i am", text, flags=re.IGNORECASE)
    text = re.sub("\'re", " are ", text)
    text = re.sub("\'d", " would ", text)
    text = re.sub("\'ll", " will ", text)
    text = re.sub("e\.g\.", " eg ", text, flags=re.IGNORECASE)
    text = re.sub("b\.g\.", " bg ", text, flags=re.IGNORECASE)
    text = re.sub("(\d+)(kK)", " \g<1>000 ", text)
    text = re.sub("e-mail", " email ", text, flags=re.IGNORECASE)
    text = re.sub("(the[\s]+|The[\s]+)?U\.S\.A\.", " America ", text, flags=re.IGNORECASE)
    text = re.sub("(the[\s]+|The[\s]+)?United State(s)?", " America ", text, flags=re.IGNORECASE)
    text = re.sub("\(s\)", " ", text, flags=re.IGNORECASE)
    text = re.sub("[c-fC-F]\:\/", " disk ", text)
    text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)
    text = re.sub('\$', " dollar ", text)
    text = re.sub('\%', " percent ", text)
    text = re.sub('\&', " and ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r'\d+', '',text)
    text = re.sub('(\\b[A-Za-z] \\b|\\b [A-Za-z]\\b)', '', text)
    text = text.replace("?","")
    text = text.replace("(","")
    text = text.replace(")","")
    text = text.replace('"',"")
    text = text.replace(",","")
    text = text.replace("#","")   
    text = text.replace("-","")    
    text = text.replace("..","")
    text = text.replace("/","")
    text = text.replace("\\","")
    text = text.replace(":","")
    text = text.replace("the","") 
    text=re.sub(r'[^\w\s]','',text)
    text=re.sub("(.)\\1{2,}", "\\1", text)
    text=re.sub("ii", "i", text)
    text=re.sub("_", "", text)
    text=re.sub("^http", "", text) 
    return text    
    
train_df["text"]=train_df["text"].apply(clean_text)
test_df["text"]=test_df["text"].apply(clean_text)

train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

def tokenize(df, test_df):
    word_to_index=dict()
    index_to_word=['<unk>']
    stopword=stopwords.words('english')
    count=0
    tokenize_column=[]
    for index, row in df.iterrows():
        text=row["text"]
        words=[word for word in text.split()]
        token_list=[]
        for word in words:
            if word in stopword:
                continue
            if word not in word_to_index:
                count+=1
                word_to_index[word]=count
                index_to_word.append(count)
            token_list.append(word_to_index[word])
        tokenize_column.append(token_list)
    df["text_map"]=tokenize_column

    tokenize_column=[]
    for index, row in test_df.iterrows():
        text=row["text"]
        words=[word for word in text.split()]
        token_list=[]
        for word in words:
            if word in stopword:
                continue
            if word not in word_to_index:
                token_list.append(0)
            else:
                token_list.append(word_to_index[word])
        tokenize_column.append(token_list)
    test_df["text_map"]=tokenize_column    
    return df, test_df, word_to_index, index_to_word

train_df, test_df, word_to_index, index_to_word = tokenize(train_df, test_df)

test_df = test_df[test_df['text_map'].apply(lambda x: len(x) > 0)].reset_index(drop=True)
train_df = train_df[train_df['text_map'].apply(lambda x: len(x) > 0)].reset_index(drop=True)

class TextDataloader(Dataset):
    def __init__(self, text_list, text_map, labels):
        """
        Params:
        -------
        test : list
               list with tuples of all the texts
        
        word2index : dict
                     vocbulary of the dataset
        labels : list 
                 list of the corrsponding labels to the question pairs 
        
        """
        self.text_list = text_list
        self.text_map = text_map
        self.labels = labels
        
    def __len__(self):
        return len(self.text_list)
    
    def __getitem__(self, index):
        text_map = self.text_map[index]
        text = self.text_list[index]
        text="".join(word for word in text)
            
        # q1_indices and q2_indices are lists of indices against words used in the sentence 
        return {
            'text': text,
            'text_map': text_map, 
            'labels': self.labels[index], 
        }
    
def data_to_tuple(df):
    text=df["text"].tolist()
    text_map=df["text_map"].tolist()
    labels=df["sentiment"].tolist()
    return text, text_map, labels

train_text, train_text_map, train_labels=data_to_tuple(train_df)
test_text, test_text_map, test_labels=data_to_tuple(test_df)

train_dataset=TextDataloader(train_text, train_text_map, train_labels)
test_dataset=TextDataloader(test_text, test_text_map, test_labels)

print(len(test_dataset))
for sample in test_dataset:
    print(sample["text"])
    print(sample["text_map"])
    print(sample["labels"])   
    break

def collate_fn(batch):
    text = []
    text_map = []
    labels = []
    for item in batch:
        text.append(item['text'])
        text_map.append(item['text_map'])
        labels.append(item['labels'])
          
        
    text_lengths = [len(q) for q in text_map]

    sorted_indices=np.flipud(np.argsort(text_lengths))
    lengths=np.flipud(np.sort(text_lengths))
    lengths = lengths.copy()

    sorted_text = [text[i] for i in sorted_indices]
    sorted_texts_map = [torch.LongTensor(text_map[i]).to('cpu') for i in sorted_indices]
    sorted_labels = [labels[i] for i in sorted_indices]

    sorted_texts_map = pad_sequence(sorted_texts_map, batch_first=True)    

    return {
        'text': sorted_text,
        'text_map': sorted_texts_map, 
        'text_lengths': lengths,
        'labels': sorted_labels
    }

train_dataloader=torch.utils.data.DataLoader(train_dataset, batch_size=512, collate_fn=collate_fn)
test_dataloader=torch.utils.data.DataLoader(test_dataset, batch_size=512, collate_fn=collate_fn)

embedding_dim=300
embeddings=torch.randn(len(index_to_word), 300)
embeddings[0] = torch.zeros(embedding_dim)



3522
last session of  day  httptwitpiccomezh
[228, 3303, 229, 0]
1


In [155]:
class Attention(nn.Module):
    def __init__(self):
        super(Attention).__init__()
    def forward(self, lstm_output, final_state):
        weights=torch.bmn(lstm_output, final_state.squeeze(0).unsqueeze(2))
        weights=F.softmax(weights.squeeze(2), dim=1).unsqueeze(2)
        context_vector=torch.bmn(torch.transpose(weights, 1,2), lstm_output)

class EmbeddingLSTMNet(nn.Module):
    def __init__(self, embedding_dim, hidden_cells, num_layers, embedding_grad, embeddings, dropout, num_classes):
        super(EmbeddingLSTMNet, self).__init__()
        self.device='cpu'
        self.embedding=nn.Embedding.from_pretrained(embeddings)
        self.embedding.weight.requires_grad=embedding_grad
        self.dropout=nn.Dropout(dropout)

        self.encoder_lstm=nn.LSTM(input_size=embedding_dim, hidden_size=hidden_cells, num_layers=num_layers, batch_first=True)
        self.decoder_lstm=nn.LSTM(input_size=embedding_dim, hidden_size=hidden_cells, num_layers=num_layers, batch_first=True)

        self.fc1=nn.Linear(hidden_cells, hidden_cells)
        self.relu1=nn.ReLU()      
        self.batch_norm1=nn.BatchNorm1d(hidden_cells)

        self.fc2=nn.Linear(hidden_cells, num_classes)        
        self.final_softmax=nn.Softmax()
        self.final_layer=nn.Sigmoid()

        self.hidden_cells=hidden_cells
        # self.attention=Attention()

    def attention(self, lstm_output, final_state):
        merged_state = final_state.squeeze(0).unsqueeze(2)
        weights = torch.bmm(lstm_output, merged_state)
        weights = F.softmax(weights.squeeze(2), dim=1).unsqueeze(2)
        context_vector=torch.bmm(torch.transpose(lstm_output, 1, 2), weights).squeeze(2)
        return context_vector, weights
    
    def forward(self, texts, texts_map, lengths):
        embeddings=self.embedding(texts_map).to(self.device)
        embeddings=self.dropout(embeddings)
        out, (hn, cn) = self.encoder_lstm(embeddings)

        context_vector,attn_weights = self.attention(out, hn)
        context_vector=context_vector.unsqueeze(0)
        out = self.fc2(self.relu1(self.fc1(torch.cat([context_vector, hn]))))
        return out[0]

model = EmbeddingLSTMNet(embedding_dim=300, hidden_cells=100, num_layers=1, embedding_grad=True, embeddings=embeddings, dropout=0.0, num_classes=3)
print(model.parameters)

for i, batch in enumerate(train_dataloader):
    # print("epoch, i", epoch, i)
    text_map, text_lengths = batch['text_map'], batch['text_lengths']
    y = torch.tensor(batch['labels'])
    y_pred = model(batch["text"], text_map, text_lengths)
    loss=nn.CrossEntropyLoss()(y_pred, y)




<bound method Module.parameters of EmbeddingLSTMNet(
  (embedding): Embedding(30071, 300)
  (dropout): Dropout(p=0.0, inplace=False)
  (encoder_lstm): LSTM(300, 100, batch_first=True)
  (decoder_lstm): LSTM(300, 100, batch_first=True)
  (fc1): Linear(in_features=100, out_features=100, bias=True)
  (relu1): ReLU()
  (batch_norm1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=100, out_features=3, bias=True)
  (final_softmax): Softmax(dim=None)
  (final_layer): Sigmoid()
)>


In [116]:
final_state=torch.randn((3, 6, 1))
print(final_state.shape)
torch.transpose(final_state, 1, 2).shape

torch.Size([3, 6, 1])


torch.Size([3, 1, 6])

In [110]:
final_state=torch.randn((3, 6, 1))
print(final_state.shape)
print(final_state.squeeze(2).shape)
print(final_state.squeeze(2))
F.softmax(final_state.squeeze(2), dim=1).unsqueeze(2)

torch.Size([3, 6, 1])
torch.Size([3, 6])
tensor([[ 1.4574, -0.5846,  0.0139,  0.8708, -2.2888, -0.2166],
        [-1.7905,  0.0857, -1.0503, -0.6506,  1.1368,  0.9351],
        [-1.2959, -0.1056,  0.3226, -0.2864, -0.5587,  2.0600]])


tensor([[[0.4688],
         [0.0608],
         [0.1107],
         [0.2608],
         [0.0111],
         [0.0879]],

        [[0.0214],
         [0.1398],
         [0.0449],
         [0.0670],
         [0.4000],
         [0.3269]],

        [[0.0233],
         [0.0768],
         [0.1178],
         [0.0641],
         [0.0488],
         [0.6693]]])

In [56]:
final_state=torch.randn((3, 4, 6))
final_state

tensor([[[-0.1978,  0.3541,  0.2221, -1.1079, -1.6475, -0.1731],
         [ 0.4080,  0.8847,  1.3301, -1.0797,  0.3881, -2.5418],
         [ 1.1926,  0.2362, -0.4385, -0.5055, -0.9163,  0.6483],
         [ 0.3059, -0.5429,  0.1005,  0.9566,  1.0771, -0.1521]],

        [[ 0.4881,  2.5263, -0.9686, -0.2728,  1.8135, -0.0840],
         [-0.0694,  0.7219,  1.5815, -0.8289,  0.1787, -1.2577],
         [ 1.0936,  1.0060, -1.1942, -0.6852,  1.4193,  0.5340],
         [-0.2234, -0.4967, -2.1705,  0.0418, -1.1296, -2.2058]],

        [[ 1.2080,  0.6784, -0.5838,  0.0243,  0.9939, -0.5591],
         [-0.3657, -1.9982, -0.4795,  1.4495, -1.2987,  0.2616],
         [-1.1428,  0.3866, -0.9400,  1.4429, -0.7112, -0.5274],
         [-0.2490, -0.1509, -0.6339,  0.8252, -1.4773,  0.1485]]])

In [79]:
[s for s in final_state]

[tensor([[-0.1978,  0.3541,  0.2221, -1.1079, -1.6475, -0.1731],
         [ 0.4080,  0.8847,  1.3301, -1.0797,  0.3881, -2.5418],
         [ 1.1926,  0.2362, -0.4385, -0.5055, -0.9163,  0.6483],
         [ 0.3059, -0.5429,  0.1005,  0.9566,  1.0771, -0.1521]]),
 tensor([[ 0.4881,  2.5263, -0.9686, -0.2728,  1.8135, -0.0840],
         [-0.0694,  0.7219,  1.5815, -0.8289,  0.1787, -1.2577],
         [ 1.0936,  1.0060, -1.1942, -0.6852,  1.4193,  0.5340],
         [-0.2234, -0.4967, -2.1705,  0.0418, -1.1296, -2.2058]]),
 tensor([[ 1.2080,  0.6784, -0.5838,  0.0243,  0.9939, -0.5591],
         [-0.3657, -1.9982, -0.4795,  1.4495, -1.2987,  0.2616],
         [-1.1428,  0.3866, -0.9400,  1.4429, -0.7112, -0.5274],
         [-0.2490, -0.1509, -0.6339,  0.8252, -1.4773,  0.1485]])]

In [66]:
final_state.shape

torch.Size([3, 4, 6])

In [82]:
merged_state = torch.cat([s for s in final_state],1)
merged_state.shape
print("after cat", merged_state.shape)

torch.Size([4, 18])

In [92]:
merged_state.squeeze(0).unsqueeze(2).shape

torch.Size([4, 18, 1])

In [55]:

merged_state = merged_state.squeeze(0).unsqueeze(2)
print("squeeze unsqueeze", merged_state.shape)


tensor([[[-2.1435,  1.1681, -1.0501,  0.5218, -0.5127, -0.1476],
         [ 0.4469,  0.7750,  0.2806,  1.3379, -1.5390,  0.5023],
         [-1.1041, -1.3331, -0.2965, -0.8033,  1.3141,  0.3061],
         [-0.7461, -0.9973,  0.1945, -1.9226,  0.9130, -1.4198]],

        [[-0.4426,  1.0650,  0.3200, -1.0377,  0.5185,  0.8102],
         [ 0.0462,  0.7288,  0.3910,  0.9335, -0.1256, -1.6559],
         [ 0.7464,  0.8905, -0.3261, -0.7684,  0.7065, -0.4252],
         [ 0.6556, -0.5605,  0.3575,  2.0854, -1.0509,  1.3937]],

        [[ 0.0108, -0.8162,  1.8186, -2.1920, -0.0894,  0.7893],
         [-0.0477,  2.0034, -0.2787, -0.7739,  0.6469, -1.2293],
         [-0.0465, -0.2566,  0.6707,  0.9170, -1.4451,  0.0025],
         [-1.7753,  0.2129,  2.3884,  0.9750, -0.2691,  0.2106]]])

In [None]:
loss_history=[]
for epoch in range(15):
    print("epoch", epoch)
    optimizer=torch.optim.Adam(model.parameters(), lr=0.001)
    losses=[]
    for i, batch in enumerate(train_dataloader):
        # print("epoch, i", epoch, i)
        text_map, text_lengths = batch['text_map'], batch['text_lengths']
        y = torch.tensor(batch['labels'])
        y_pred = model(batch["text"], text_map, text_lengths)
        loss=nn.CrossEntropyLoss()(y_pred, y)
        # print(y_pred.shape, y.shape)
        # print(y_pred)
        y_pred_class=[]
        for list in y_pred:
            y_pred_class.append(torch.argmax(list).detach().numpy().item())   
        correct = (torch.tensor(y_pred_class) == torch.tensor(y)).sum().item()
        
        # print(loss)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    loss_history.append(sum(losses)/len(losses))
print(loss_history)
    # print(y_pred)
