In [None]:
#ref - https://www.kaggle.com/marcovasquez/basic-nlp-with-tensorflow-and-wordcloud

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset, DataLoader
import re

In [2]:
# Natural Language Tool Kit 
import nltk  
nltk.download('stopwords') 
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rajku\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
submission = pd.read_csv('./sample_submission.csv')

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

def remove_html(text):
    no_html= pattern.sub('',text)
    return no_html

In [6]:
# Remove all text that start with html
train['text']=train['text'].apply(lambda x : remove_html(x))
test['text']=test['text'].apply(lambda x : remove_html(x))
train.loc[train['text'].str.contains('http')].target.value_counts()

0    1
Name: target, dtype: int64

In [12]:
def clean_text(dataset):
    corpus = []  
    for i in range(0, len(dataset)):  
        text = re.sub('[^a-zA-Z]', ' ', dataset['text'][i])  
        text = text.lower()  
        # split to array(default delimiter is " ") 
        text = text.split()
        cleantext = []
        for t in text:
            if (len(text)) <= 2:
                continue
            cleantext.append(t)
        text = ' '.join(cleantext)    
        corpus.append(text)  
        
    return corpus

In [46]:
train['text'] = clean_text(train)
test['text'] = clean_text(test)

In [48]:
train = train[len(train['text']) > 0]
test = test[len(test['text']) > 0]

KeyError: True

In [51]:
train

KeyError: True

In [14]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train.text.values)
idf = vectorizer.idf_
idf_vocab = vectorizer.vocabulary_

In [15]:
train.text.values

array(['our deeds are the reason of this earthquake may allah forgive us all',
       'forest fire near la ronge sask canada',
       'all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected',
       ..., 'm utc km s of volcano hawaii',
       'police investigating after an e bike collided with a car in little portugal e bike rider suffered serious non life threatening injuries',
       'the latest more homes razed by northern california wildfire abc news'],
      dtype=object)

In [16]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

In [17]:
embedding_dim = 25

In [18]:
model = Word2Vec([x.split() for x in train.text.values], min_count=1, size=embedding_dim)

In [19]:
model.wv['fire']

array([-3.3970828 ,  0.38387552,  0.5734416 ,  0.17056273,  0.6775423 ,
       -0.6026293 ,  0.33637473,  1.4864889 , -2.1781757 ,  0.95379776,
       -0.78529286, -0.86143005, -3.1887312 , -0.47411925, -0.10404182,
       -1.3211287 , -1.9210428 ,  0.319292  ,  0.26214236,  3.220421  ,
       -1.9736892 , -0.61150783,  0.5784559 ,  0.26379704, -0.60832703],
      dtype=float32)

In [20]:
model.wv[model.wv.vocab]

array([[-2.21071720e+00,  2.60898083e-01,  3.72877866e-01, ...,
         3.41873437e-01,  1.82068110e-01, -4.26021188e-01],
       [-3.90440114e-02,  8.47781356e-03,  3.31667485e-03, ...,
        -4.62460332e-04,  5.49084926e-03,  1.09213050e-02],
       [-3.80412698e+00,  5.28449774e-01,  5.73879004e-01, ...,
         4.61733639e-01,  3.36226344e-01, -6.95806146e-01],
       ...,
       [-2.32012607e-02, -4.11880529e-03,  2.03537159e-02, ...,
        -1.56162055e-02,  4.64843307e-03, -3.99249082e-04],
       [-2.39132028e-02,  1.21059758e-03, -7.33341929e-03, ...,
         1.48042040e-02,  1.10401539e-02, -1.51642901e-03],
       [-3.64758968e-02,  1.65267233e-02,  3.30372131e-03, ...,
         1.01493904e-02, -3.83380614e-03,  1.35103799e-02]], dtype=float32)

In [21]:
#model.wv.vocab

In [22]:
embeddings_dict = {}
i = 1
embedding = []
embeddings_dict['unk'] = 0
word_idx = {}
word_idx[0] = 'unk'
embedding.append(np.zeros(embedding_dim, dtype=int))
for key, _ in model.wv.vocab.items():
    embeddings_dict[key] = i
    word_idx[i] = key
    i += 1
    embedding.append(model.wv[key])

In [23]:
for key, value in idf_vocab.items():
    model.wv[key] = model.wv[key] * idf[value]

In [24]:
idf[idf_vocab['this']]

3.802179045860704

In [25]:
model.wv['fire']

array([-15.24171   ,   1.7223363 ,   2.5728636 ,   0.7652647 ,
         3.0399327 ,  -2.70382   ,   1.5092143 ,   6.669438  ,
        -9.772833  ,   4.2794094 ,  -3.5233777 ,  -3.8649828 ,
       -14.306897  ,  -2.1272333 ,  -0.46680498,  -5.927515  ,
        -8.619153  ,   1.4325691 ,   1.1761556 ,  14.44908   ,
        -8.855362  ,  -2.7436554 ,   2.5953612 ,   1.1835796 ,
        -2.7293842 ], dtype=float32)

In [26]:
model.wv.most_similar('fire')

[('emergency', 0.9998307824134827),
 ('new', 0.9998186826705933),
 ('on', 0.9997924566268921),
 ('people', 0.9997823238372803),
 ('destruction', 0.9997754096984863),
 ('the', 0.999773383140564),
 ('news', 0.9997650384902954),
 ('at', 0.999754786491394),
 ('from', 0.9997519254684448),
 ('crash', 0.99974524974823)]

In [27]:
class FeedForwardModel(nn.Module):
    def __init__(self, embedding_matrix, embed_dim):
        super().__init__()
        self.embedding = nn.EmbeddingBag.from_pretrained(torch.FloatTensor(embedding_matrix))
        self.fc1 = nn.Linear(embed_dim, 10)
        self.fc2 = nn.Linear(10, 1)
        self.output = nn.Sigmoid()
        self.init_weights()
        
    def __init__(self, num_embeddings, embed_dim):
        super().__init__()
        self.embedding = nn.EmbeddingBag(num_embeddings, embedding_dim)
        self.fc1 = nn.Linear(embed_dim, 10)
        self.fc2 = nn.Linear(10, 1)
        self.output = nn.Sigmoid()
        self.embedding.weight.data.uniform_(-0.5, 0.5)
        self.embedding.weight.requires_grad = True
        self.init_weights()
                  
        
    def forward(self, input, offsets):
        #print(input)
        #print(self.embedding.weight[1:3])
        embedded = self.embedding(input, offsets)
        h1 = F.tanh(self.fc1(embedded))
        h2 = self.fc2(h1)
        return self.output(h2)
    
    def init_weights(self):
        initrange = 0.5
        self.fc1.weight.data.uniform_(-initrange, initrange)
        self.fc2.weight.data.uniform_(-initrange, initrange)
        self.fc1.bias.data.zero_()
        self.fc2.bias.data.zero_()

In [28]:
class TwitterDataSet(Dataset):    
    def __init__(self, input, labels, vocab, isFixed, size):
        self.vocab = vocab
        self.data = []
        for i in range(len(input)):
            line = input[i]
            tokens = [self.vocab[token] if token in self.vocab else self.vocab['unk'] for token in line]
            self.data.append((labels[i], tokens))
        self.labels = labels
        
    def __getitem__(self, index):
        return self.data[index]
    
    def __getlabel__(self, index):
        return this.labels[index]
    
                
    
    def __len__(self):
        return len(self.labels)
    
    def getvocab(self):
        return self.vocab
    
    def getTensor(self, sentence):
        tokens = sentence.split()
        tokens = torch.tensor([[self.vocab[token] if token in self.vocab else self.vocab['unk'] for token in tokens]])
        return tokens

In [42]:
def generate_batch(batch):
    label = torch.tensor([entry[0] * 1.0 for entry in batch])
    text = [torch.tensor(entry[1]) for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    print(text)
    text = torch.cat(text)
    return text, offsets, label

In [30]:
twitter_dateSet = TwitterDataSet(train.text.values, train.target.values, embeddings_dict, False, 0)


In [31]:
#ffmodel = FeedForwardModel(embedding, embedding_dim)
ffmodel = FeedForwardModel(len(twitter_dateSet.vocab), embedding_dim)

In [32]:
loss_fuction = nn.BCELoss()
optimizer = torch.optim.SGD(ffmodel.parameters(), lr=2.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

In [33]:
twitter_dateSet.vocab['fire']

15

In [34]:
ffmodel.embedding.weight[15]

tensor([ 0.1566,  0.0740,  0.2386, -0.3681, -0.3893, -0.2133, -0.3776, -0.3680,
         0.3299,  0.3512,  0.4521,  0.2492,  0.1608,  0.0850,  0.1175,  0.2592,
        -0.2391,  0.4796, -0.4645,  0.2887,  0.1292,  0.1851,  0.1179,  0.4575,
        -0.1870], grad_fn=<SelectBackward>)

In [35]:
#ffmodel.embedding.weight[11:15]

In [36]:
#ffmodel.embedding.weight[11:15]

In [37]:
#ffmodel.fc1.weight

In [38]:
#After training
#ffmodel.embedding.weight[15]

In [43]:
def train_func(sub_train_):

    # Train the model
    train_loss = 0.0
    train_acc = 0.0
    count = 0.0
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,collate_fn=generate_batch)
    for i, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        output = ffmodel(text, offsets)
        loss = loss_fuction(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        outputLabel = (output > 0.5).float()
        outputLabel = outputLabel.view(-1)
        train_acc += ((outputLabel == cls).float()).sum()
        count = count + len(cls)
        #print('outputLength : %d train_acc %f count %f acc %f' %(len(outputLabel), train_acc, count, train_acc/count))
        #print('%f' %(train_acc/count))
        #print(train_acc)
        #print(train_acc)
    # Adjust the learning rate
    scheduler.step()

    return train_loss/count, train_acc/count


In [44]:
def valid_func(sub_valid_):
    valid_loss = 0.0
    valid_acc = 0.0
    count = 0.0
    data = DataLoader(sub_valid_, batch_size=BATCH_SIZE, shuffle=True,collate_fn=generate_batch)
    for i, (text, offsets, cls) in enumerate(data):
        output = ffmodel(text, offsets)
        loss = loss_fuction(output, cls)
        valid_loss += loss.item()
        outputLabel = (output > 0.5).float()
        outputLabel = outputLabel.view(-1)
        valid_acc += ((outputLabel == cls).float()).sum()
        count = count + len(cls)
    return valid_loss/count, valid_acc/count

In [45]:
import time
from torch.utils.data.dataset import random_split
BATCH_SIZE = 32
train_len = int(len(twitter_dateSet) * 0.90)
sub_train_, sub_valid_ = random_split(twitter_dateSet, [train_len, len(twitter_dateSet) - train_len])

for epoch in range(2):
    start_time = time.time()
    currentWeight = ffmodel.embedding.weight.clone()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = valid_func(sub_valid_)
    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60
    
    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')
    
    for i in range(len(ffmodel.embedding.weight)):
        diff = torch.abs(currentWeight[i] - ffmodel.embedding.weight[i]) > 0.001
        if (diff.any()):
            print(currentWeight[i])
            print(ffmodel.embedding.weight[i])
            print(i)
    print(diff)


[tensor([ 152, 1087, 2715,    0,  887,   76,  295, 1534,    0,  152, 1087,   52,
         152,    0,   76,    0,  413,   52,  469,  469,   52,    0,   86,   52,
        2185, 2715,    0,   76,   86,    0, 1534,   76,  376, 2715,    0,   52,
           0,  220,   52,  469,  887, 2715,  295,    0,  152,  789,    0,   77,
        2728,    0, 1087, 2715,   52, 1534,  152, 1087,    0,  152,  295, 2728,
           0, 1535, 2715,   76,  469,  887,    0,  413,   76,  152, 1087,    0,
          86,  789,   77, 2715, 1535,  789,  220, 2728,    0,  152, 1087,   52,
         152,    0,  413,   52,  469,  469,   52,    0, 1535, 2715,    0,   86,
         789,   77, 2715, 1535,  789,  220, 2728,    0, 2715, 1534,   86, 2715]), tensor([ 152,  789,    0,  469,   52, 2185,   76,  887,   52,  152, 2715,    0,
          76,  469, 2715, 2185,   76,  152,   52, 1535, 1534, 2715,    0,  764,
         763, 1087, 2715,   52, 2185,   52, 1534,    0,   76,  469,  152, 2715,
         295,  469,   52, 1534,    0,

RuntimeError: Expected object of scalar type Long but got scalar type Float for sequence element 6 in sequence argument at position #1 'tensors'

In [None]:
def predict_fun(sentence):
    #print(" input " + sentence)
    if len(sentence) == 0:
        sentence = 'awesome'
    tensor = twitter_dateSet.getTensor(sentence)
    t1 = model(tensor, torch.tensor([0]))
    t1 = t1.view(-1)
    return t1[0].item()

In [None]:
word_idx[473]

In [None]:
t1 = twitter_dateSet.getTensor('this is fire')

In [None]:
train["modelPredict"] = train['text'].apply(lambda x : predict_fun(x))

In [None]:
predict_fun('this is fire')

In [None]:
train.to_csv("modelDebugging.csv")

In [None]:
test['target'] = test['text'].apply(lambda x : predict_fun(x))

In [None]:
submission['target'] = (test['target'] > 0.5).astype(int)

In [None]:
submission

In [None]:
submission.to_csv("submission.csv", index=False, header=True)

In [None]:
test[0:]

In [None]:
t1[0].item()

In [None]:
print('%f'%(2.2/1.9))

In [None]:
t1 = torch.tensor([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8])
t2 = torch.tensor([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.9])

In [None]:
(t1 == t2).sum().item()

In [None]:
t2.float()