In [None]:
import pandas as pd
import numpy as np
import os
from collections import Counter
import pickle
from tqdm import tqdm

# Data Analysis

In [2]:
data = pd.read_csv("train.csv",engine = 'python')

In [3]:
data = data.drop(["Host", "Link", "Date(ET)", "Time(ET)", "time(GMT)"], axis=1)

In [4]:
# repalcing FACEBOOK to Facebook 
data.replace(to_replace='FACEBOOK', value='Facebook',inplace=True)
# Now there are only 4 different values in "Source" column

In [5]:
replace_ = {}
for index, i in enumerate(list(set(data["Source"])),start=1):
    replace_[index] = i
    data.replace(to_replace=i, value=index,inplace=True)
data.fillna('UNK',inplace=True)

In [6]:
list(set(data["Source"]))

[1, 2, 3, 4]

In [7]:
data.fillna('unk',inplace=True)

# Vocab creation

In [8]:
import re

In [9]:
rep_with = ['.', '?', '/', '\n', '(', ')','[', ']', '{', '}', '-','"','!', '|' ]

In [10]:
def rep_(sent):
    for i in rep_with:
        sent = sent.replace(i,' ').replace('$', ' ').replace(',','').replace("'",'')
    return sent

In [11]:
import re
import num2words

def n2w(text):
    return re.sub(r"(\d+)", lambda x: num2words.num2words(int(x.group(0))), text)

In [12]:
def preprocess(data,pos):
    sent = []
    for i in range(len(data)):
        try:sent.append(n2w(rep_(data.iloc[i,pos])).replace('UNK', 'unk'))
        except:print(data.iloc[i,pos])
    return sent

In [13]:
sent = preprocess(data, 2)

In [14]:
words_t = [words for i in sent for words in i.split(' ') if words]

In [15]:
vocab = list(set(words_t))
vocab.insert(0,'pad') # adding padding token in vocab
# vocab.insert(1,'unk') # adding unk token in vocab
stoi={i:j for j,i in enumerate(vocab)}

In [16]:
len(vocab)

30418

In [17]:
x = [ [stoi[j] for j in i.split(' ') if j] for i in sent ] # senteces
y = [ [value] for value in data.loc[:,'Patient_Tag']  ] # emphases probability

In [18]:
max([len(i) for i in x]) # maximum length of the sentence

2765

In [19]:
train = [ [x[i], y[i]] for i in range(len(x))] # combined x and y for training in a list

In [20]:
train = sorted(train, key = lambda c: len(c[0])) # sorted train using the len of x {helps in minibatching}

In [21]:
train_len = []  # len of each example
for i in train:
    train_len.append(len(i[0]))

# Creating Glove embeddings

In [None]:
with open("glove.6B.300d.txt", "r") as f:
    embed = f.read()

In [None]:
embed = embed.split('\n')

In [None]:
dict_embed = {}
for i in embed:
    dict_embed[i.split(' ')[0]] = i.split(' ')[1:]

In [None]:
weights_matrix = np.random.normal(scale = 1e-6,size = (len(vocab), 300))
words_found = 0
for index, word in enumerate(vocab):
    try:
        weights_matrix[index] = dict_embed[word.lower()]
        words_found += 1
    except:pass
print("% of words found :-",words_found/len(vocab)*100)

In [None]:
weights_matrix.shape

In [None]:
with open('embed_train.pkl', "wb") as f:
    pickle.dump(weights_matrix,f)

In [22]:
with open('embed_train.pkl', "rb") as f:
    weights_matrix = pickle.load(f)

# Creating Embedding layer

In [23]:
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [74]:
import torch
import torch.nn as nn
device = torch.device("cuda")

In [75]:
def create_emb_layer(weights_matrix, trainable=False):
    num_embeddings, embedding_dim = weights_matrix.size()
    emb_layer = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if trainable:emb_layer.weight.requires_grad = True
    else:emb_layer.weight.requires_grad = False
    return emb_layer, num_embeddings, embedding_dim

In [104]:
class EncoderRNN(nn.Module):
    def __init__(self,embedding, hidden_size, num_layers,directions,bidirectonal,out):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.directions = directions
        self.embedding, num_embeddings, embedding_dim = embedding
        self.gru = nn.GRU(embedding_dim, hidden_size, num_layers,bidirectional=bidirectonal)
        self.linear1 = nn.Linear(hidden_size*directions, 32, bias=False)
        self.linear2 = nn.Linear(32, 1, bias=False)
        self.drop = nn.Dropout(p=0.8, inplace=False)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, inp, hidden):
        out = self.embedding(inp)
        out,_ = self.gru(out,hidden)
        out = self.drop(out)
        out = self.linear1(out.view(out.shape[1],out.shape[0],-1))
#         out = self.drop(out)
        out = self.linear2(out)
        out = self.drop(out)
        out = self.sigmoid(out)
        return out

    def init_hidden(self, batch_size):
        return torch.zeros(self.num_layers*self.directions, batch_size, self.hidden_size,dtype=torch.double)

In [105]:
embedding = create_emb_layer(torch.from_numpy(weights_matrix), trainable=True)
print("embed layer is trainable or no :----",embedding[0].weight.requires_grad)

embed layer is trainable or no :---- True


In [106]:
model = EncoderRNN(embedding,hidden_size=256,num_layers=1,directions=2,bidirectonal=True,out=1)

In [107]:
model.to(device).double()

EncoderRNN(
  (embedding): Embedding(30418, 300, padding_idx=0)
  (gru): GRU(300, 256, bidirectional=True)
  (linear1): Linear(in_features=512, out_features=32, bias=False)
  (linear2): Linear(in_features=32, out_features=1, bias=False)
  (drop): Dropout(p=0.8, inplace=False)
  (sigmoid): Sigmoid()
)

In [108]:
from torch.nn.utils.rnn import pad_sequence
import random
# x = pad_sequence([torch.tensor(i) for i in x])
# y = pad_sequence([torch.tensor(i) for i in y])

In [109]:
x_ = random.sample([i[0] for i in train],len(train))
y_ = random.sample([i[1] for i in train],len(train))

In [124]:
loss_function = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [111]:
x = x_[:1000]
y = y_[:1000]

x_test = x_[1000:]
y_test = y_[1000:]

In [123]:
batch = 32
for epoch in range(5):
    running_loss, loss = 0, 0
    # training
    for i in range(0,len(y),batch):
        target = torch.tensor([i for i in y[i:i+batch]],dtype=torch.double).to(device)
        inp = pad_sequence([torch.tensor(i,dtype=torch.long) for i in x[i:i+batch]]).to(device)
        
        h = model.init_hidden(len(target)).to(device)
        out = model(inp,h)
        
        loss = loss_function(out[:,-1,:],target)
        nn.utils.clip_grad_norm_(model.parameters(), 50)
        running_loss+=loss.item()
        
        loss.backward()
        optimizer.step()
    print("loss", running_loss/len(y))
    
    #dev set
    batch_ = 1
    with torch.no_grad():
        loss = 0
        for i in range(0,len(y_test),batch_):
            target = torch.tensor(y_test[i],dtype=torch.double).to(device)
            inp = torch.tensor(x_test[i],dtype=torch.long).view(-1,1).to(device)

            h = model.init_hidden(len(target)).to(device)
            out = model(inp,h)

            loss += loss_function(out[:,-1,:],target).item()
        print(loss/len(y_test))

loss 0.027296849604358495
0.6669255765436917
loss 0.028967292758656928
0.6617377239364969
loss 0.028176301968283562
0.6697776038402011
loss 0.025090629075071223
0.6721837370899266
loss 0.025526745046052582
0.6697707296297747


# Deep learning tehcniques does not work better
# two reasons being the size of data is small and the length of sente4nces are too big for SOTA NLP techniques
# Bert cuts the length to 512 tokens. therefore did not use it

# Testing

In [125]:
data = pd.read_csv("test.csv",engine='python')

data = data.drop(["Host", "Link", "Date(ET)", "Time(ET)", "time(GMT)", "Unnamed: 9", "Index"], axis=1)
data.replace(to_replace='FACEBOOK', value='Facebook',inplace=True)
replace_ = {}
for index, i in enumerate(list(set(data["Source"])),start=1):
    replace_[index] = i
    data.replace(to_replace=i, value=index,inplace=True)
data.fillna('UNK',inplace=True)

In [126]:
data.head()

Unnamed: 0,Source,Title,TRANS_CONV_TEXT
0,3,UNK,"Baby Slice, the son of the late Kimbo Slice, h..."
1,2,My Story --,"<p><font face=""sans-serif"" size=""3"">I have had..."
2,3,US FDA ?????canagliflozin?dapagliflozin?????????,"Previously, sodium-glucose cotransporter-2 (SG..."
3,2,UNK,Hello. I suffer from congestive heart failure ...
4,3,"Vitamin D improves heart function, study finds",A daily dose of vitamin D3 improves heart func...


In [127]:
sent = preprocess(data, 2)

In [128]:
inp = []
for i in sent:
    dummy = []
    for j in i.split(' '):
        try: 
            if j:
                dummy.append(stoi[j])
        except: 
            if j:
                dummy.append(stoi['unk'])
    inp.append(dummy)

In [129]:
out = []
m = []
with torch.no_grad():
    for i in tqdm(inp):
        input = torch.tensor(i).view(-1,1).to(device)
        h = model.init_hidden(1).to(device)
        try:
            out_ = model(input,h)
            out.append(out_[:,-1,:].item())
        except: out.append(0.01)

100%|██████████| 571/571 [00:10<00:00, 56.05it/s]


In [130]:
len(out)

571

In [131]:
predictions = []
for i in out:
    if i <= 0.25:
        predictions.append(1)
    else:
        predictions.append(0)

# Saving

In [132]:
format = 'Index,Patient_Tag\n'

In [133]:
for index, i in enumerate(predictions,start=1):
    format+=str(index)+','+str(i) + '\n'

In [134]:
with open("submission_7.csv", "w") as f:
    f.write(format.strip())