In [1]:
#ref - https://www.kaggle.com/marcovasquez/basic-nlp-with-tensorflow-and-wordcloud

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset, DataLoader
import re

In [3]:
# Natural Language Tool Kit 
import nltk  
nltk.download('stopwords') 
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rajku\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from IPython.display import clear_output

In [5]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
submission = pd.read_csv('./sample_submission.csv')

In [6]:
class FeedForwardModel(nn.Module):
    def __init__(self, embedding_matrix, embed_dim):
        super().__init__()
        self.embedding = nn.EmbeddingBag.from_pretrained(torch.FloatTensor(embedding_matrix))
        self.fc1 = nn.Linear(embed_dim, 10)
        self.fc2 = nn.Linear(10, 1)
        self.output = nn.Sigmoid()
        self.init_weights()
        
    def forward(self, input, offsets):
        #print(input)
        embedded = self.embedding(input, offsets)
        h1 = F.tanh(self.fc1(embedded))
        h2 = self.fc2(h1)
        return self.output(h2)
    
    def init_weights(self):
        initrange = 0.5
        self.fc1.weight.data.uniform_(-initrange, initrange)
        self.fc2.weight.data.uniform_(-initrange, initrange)
        self.fc1.bias.data.zero_()
        self.fc2.bias.data.zero_()

In [7]:
class LSTMAndFeedForward(nn.Module):
    def __init__(self, embed_weight, embed_dim, hidden_dim, num_layer):
        super().__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_layer = num_layer
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embed_weight))
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layer, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, 10)
        self.fc2 = nn.Linear(10, 1)
        self.output = nn.Sigmoid()
        self.init_weights()
        
    def forward(self, input, offsets):
        emb = self.embedding(input)
        out, (hn, cn) = self.lstm(emb)
        h1 = F.tanh(self.fc1(hn[-1]))
        h2 = self.fc2(h1)
        return self.output(h2)
    
    def init_weights(self):
        initrange = 0.5
        self.fc1.weight.data.uniform_(-initrange, initrange)
        self.fc2.weight.data.uniform_(-initrange, initrange)
        self.fc1.bias.data.zero_()
        self.fc2.bias.data.zero_()
        
        

In [8]:
pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

def remove_html(text):
    no_html= pattern.sub('',text)
    return no_html

In [9]:
# Remove all text that start with html
train['text']=train['text'].apply(lambda x : remove_html(x))

In [10]:
# Remove all text that start with html in test
test['text']=test['text'].apply(lambda x : remove_html(x))

In [11]:
def clean_text(dataset):
    corpus = []  
    for i in range(0, len(dataset)):  
        text = re.sub('[^a-zA-Z]', ' ', dataset['text'][i])  
        text = text.lower()  
        # split to array(default delimiter is " ") 
        text = text.split()  
        text = ' '.join(text)    
        corpus.append(text)  
        
    return corpus

In [12]:
train['text'] = clean_text(train)
test['text'] = clean_text(test)

In [13]:
# this is base in 80% of the data, an only text and targert at this moment
training_size = 6090
training_sentences = train.text.values
training_labels = train.target.values
testing_sentences = test.text.values

In [14]:
embeddings_dict = {}
embedding_matrix = []
embedding_idx_word = {}
embedding_dim = 50

embeddings_dict['unk'] = 0
embedding_matrix.append(np.zeros(embedding_dim))
embedding_idx_word[0] = 'unk'

file = './glove.twitter.27B/glove.twitter.27B.50d.txt'
with open(file, 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        if (len(vector) != embedding_dim):
            continue
        embedding_idx_word[len(embedding_idx_word)] = word
        embeddings_dict[word] = len(embeddings_dict)
        embedding_matrix.append(vector)

In [15]:
class TwitterDataSet(Dataset):    
    def __init__(self, input, labels, vocab, isFixed, size):
        self.vocab = vocab
        self.data = []
        print(labels[0:22])
        for i in range(len(input)):
            tokens = input[i].split()
            #print(tokens)
            if isFixed:
                ttokens = np.zeros(size, dtype = int)
                for j in range(min(size, len(tokens))):
                    #print(tokens[i])
                    if tokens[j] in self.vocab:
                        ttokens[j] = self.vocab[tokens[j]]
                        #print(ttokens[i])
                    else:
                        ttokens[j] = self.vocab['unk']
                    #print(ttokens)
                tokens = ttokens
            else:
                tokens = [self.vocab[token] if token in self.vocab else self.vocab['unk'] for token in tokens]
            self.data.append((labels[i], tokens))
            #print(tokens)
            #break
        self.labels = labels
        #print(self.data)
        
    def __getitem__(self, index):
        return self.data[index]
    
    def __getlabel__(self, index):
        return this.labels[index]
    
                
    
    def __len__(self):
        return len(self.labels)
    
    def getvocab(self):
        return self.vocab
    
    def getTensor(self, sentence):
        tokens = sentence.split()
        tokens = torch.tensor([[self.vocab[token] if token in self.vocab else self.vocab['unk'] for token in tokens]])
        return tokens
        

In [16]:
def generate_batch(batch):
    label = torch.tensor([entry[0] * 1.0 for entry in batch])
    text = torch.tensor([entry[1] for entry in batch])
    offsets = [0] + [len(entry) for entry in text]
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = text.type(torch.int64)
    return text, offsets, label

In [17]:
N_EPOCHS = 10
BATCH_SIZE = 32

In [18]:
hidden_dim = 32
number_layer = 32
lstm_dataset = TwitterDataSet(train.text.values, train.target.values, embeddings_dict, True, number_layer)
lstModel = LSTMAndFeedForward(embedding_matrix, embedding_dim, hidden_dim, number_layer)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0]


In [19]:
#twitter_dateSet = TwitterDataSet(train.text.values, train.target.values, embeddings_dict, False, 0)
#ffmodel = FeedForwardModel(embedding_matrix, embedding_dim)

In [20]:
loss_fuction = nn.BCELoss()
optimizer = torch.optim.SGD(lstModel.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

In [21]:
def train_func(sub_train_, model):
    # Train the model
    train_loss = 0.0
    train_acc = 0.0
    count = 0.0
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,collate_fn=generate_batch)
    for i, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        output = model(text, offsets)
        loss = loss_fuction(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        outputLabel = (output > 0.5).float()
        outputLabel = outputLabel.view(-1)
        train_acc += ((outputLabel == cls).float()).sum()
        count = count + len(cls)
        #print('outputLength : %d train_acc %f count %f acc %f' %(len(outputLabel), train_acc, count, train_acc/count))
        #print('%f' %(train_acc/count))
        #print(train_acc)
        #print(train_acc)
    # Adjust the learning rate
    scheduler.step()

    return train_loss/count, train_acc/count


In [22]:
def valid_func(sub_valid_, model):
    valid_loss = 0.0
    valid_acc = 0.0
    count = 0.0
    data = DataLoader(sub_valid_, batch_size=BATCH_SIZE, shuffle=True,collate_fn=generate_batch)
    for i, (text, offsets, cls) in enumerate(data):
        output = model(text, offsets)
        loss = loss_fuction(output, cls)
        valid_loss += loss.item()
        outputLabel = (output > 0.5).float()
        outputLabel = outputLabel.view(-1)
        valid_acc += ((outputLabel == cls).float()).sum()
        count = count + len(cls)
    return valid_loss/count, valid_acc/count

In [23]:
import time
from torch.utils.data.dataset import random_split

train_len = int(len(lstm_dataset) * 0.20)
sub_train_, sub_valid_ = random_split(lstm_dataset, [train_len, len(lstm_dataset) - train_len])

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_, lstModel)
    valid_loss, valid_acc = valid_func(sub_valid_, lstModel)
    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60
    if (epoch % 2) == 0:
        print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
        print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
        print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')


  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


Epoch: 1  | time in 1 minutes, 23 seconds
	Loss: 0.1090(train)	|	Acc: 49.3%(train)
	Loss: 0.0977(valid)	|	Acc: 42.8%(valid)
Epoch: 3  | time in 1 minutes, 23 seconds
	Loss: 0.1241(train)	|	Acc: 49.5%(train)
	Loss: 0.1364(valid)	|	Acc: 57.2%(valid)
Epoch: 5  | time in 1 minutes, 22 seconds
	Loss: 0.0899(train)	|	Acc: 53.2%(train)
	Loss: 0.0956(valid)	|	Acc: 57.2%(valid)
Epoch: 7  | time in 1 minutes, 23 seconds
	Loss: 0.0854(train)	|	Acc: 50.0%(train)
	Loss: 0.0346(valid)	|	Acc: 57.2%(valid)
Epoch: 9  | time in 1 minutes, 21 seconds
	Loss: 0.0625(train)	|	Acc: 51.1%(train)
	Loss: 0.0321(valid)	|	Acc: 42.8%(valid)


In [24]:
def predict_fun(sentence):
    #print(" input " + sentence)
    if len(sentence) == 0:
        sentence = 'awesome'
    tensor = lstm_dataset.getTensor(sentence)
    t1 = lstModel(tensor, torch.tensor([0]))
    t1 = t1.view(-1)
    return t1[0].item()

In [25]:
train["modelPredict"] = train['text'].apply(lambda x : predict_fun(x))

In [26]:
train.to_csv("modelDebugging.csv")

In [27]:
test['target'] = test['text'].apply(lambda x : predict_fun(x))

In [28]:
submission['target'] = (test['target'] > 0.5).astype(int)

In [29]:
submission

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [30]:
submission.to_csv("submission.csv", index=False, header=True)

In [31]:
test[0:]

Unnamed: 0,id,keyword,location,text,target
0,0,,,just happened a terrible car crash,0.917844
1,2,,,heard about earthquake is different cities sta...,0.917802
2,3,,,there is a forest fire at spot pond geese are ...,0.917791
3,9,,,apocalypse lighting spokane wildfires,0.917981
4,11,,,typhoon soudelor kills in china and taiwan,0.917821
...,...,...,...,...,...
3258,10861,,,earthquake safety los angeles safety fasteners...,0.917821
3259,10865,,,storm in ri worse than last hurricane my city ...,0.917791
3260,10868,,,green line derailment in chicago,0.917888
3261,10874,,,meg issues hazardous weather outlook hwo,0.917844
