In [1]:
from tqdm import tqdm
import numpy as np
import random
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import string
from torch.utils.data import Dataset, DataLoader
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
nltk.download('stopwords')
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
! unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
replace /usr/share/nltk_data/corpora/wordnet/lexnames? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [3]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed

seed_everything(42)
MAX_LENGTH = 30
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [30]:
train.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [4]:
train= pd.read_csv('../input/nlp-getting-started/train.csv')
test= pd.read_csv("../input/nlp-getting-started/test.csv")

In [5]:
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', string)

def clean_sentence(sentence):
    # remove URLS 
    sentence = re.sub(r'http\S+', ' ', sentence)
    # remove emoji's
    sentence = remove_emoji(sentence)
    # remove punctuation
    sentence = re.sub("[^0-9A-Za-z ]", "" , sentence)
    # remove double spaces
    sentence = sentence.replace('  ',"")
    
    return sentence.strip()

def remove_stopwords(tokens, stopwords):
    clean_token = [word for word in tokens if word not in stopwords]
    return clean_token

# lemmalization 
def lemmatize(tokens, lemma):
    lemmatized_tokens = [lemma.lemmatize(token, pos = 'v') for token in tokens]
    return lemmatized_tokens

In [6]:
# fix sentence length
def trunc_padding(sentence):
    modify_sentence = sentence.copy()
    if len(modify_sentence) >= MAX_LENGTH:
        modify_sentence = modify_sentence[:MAX_LENGTH]
    else:
        modify_sentence.extend(list(["0"] * (MAX_LENGTH - len(modify_sentence))))
    return modify_sentence

In [7]:
stopwords = nltk.corpus.stopwords.words('english')
lemma = WordNetLemmatizer()

def processing(df, stopwords, lemma):
    df['text'] = df['text'].apply(lambda sentence: sentence.lower())
    df['text'] = df['text'].apply(lambda sentence: clean_sentence(sentence))
    # tokenization
    df['text'] = df['text'].apply(lambda sentence: sentence.split())
    # remove stopwords
    df['text'] = df['text'].apply(lambda sentence: remove_stopwords(sentence, stopwords))
    # lemmalization 
    df['text'] = df['text'].apply(lambda sentence: lemmatize(sentence, lemma))
    # sentence length before padding
    df['length'] = df['text'].apply(lambda x: len(x))
    # fix sentence length
    df['text'] = df['text'].apply(lambda sentence: trunc_padding(sentence))
    # sentence length after padding
    df['length_padding'] = df['text'].apply(lambda x: len(x))
    
processing(train, stopwords, lemma)
processing(test, stopwords, lemma)

In [8]:
train.head()

Unnamed: 0,id,keyword,location,text,target,length,length_padding
0,1,,,"[deeds, reason, earthquake, may, allah, forgiv...",1,7,30
1,4,,,"[forest, fire, near, la, ronge, sask, canada, ...",1,7,30
2,5,,,"[residents, ask, shelter, place, notify, offic...",1,11,30
3,6,,,"[13000, people, receive, wildfires, evacuation...",1,7,30
4,7,,,"[get, send, photo, ruby, alaska, smoke, wildfi...",1,9,30


In [9]:
test.head()

Unnamed: 0,id,keyword,location,text,length,length_padding
0,0,,,"[happen, terrible, car, crash, 0, 0, 0, 0, 0, ...",4,30
1,2,,,"[hear, earthquake, different, cities, stay, sa...",7,30
2,3,,,"[forest, fire, spot, pond, geese, flee, across...",10,30
3,9,,,"[apocalypse, light, spokane, wildfires, 0, 0, ...",4,30
4,11,,,"[typhoon, soudelor, kill, 28, china, taiwan, 0...",6,30


## Assign unique index to each word, used for word embedding

In [10]:
# get all vocabulary
vocab_list = []
for sentence in train['text']:
    vocab_list.append(sentence)
vocab = build_vocab_from_iterator(vocab_list, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

### Dataset and DataLoader

In [11]:
class TweetDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y 
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        sentence = self.x[idx]
        x = torch.tensor(vocab(sentence), dtype=torch.long).to(device)
        y = torch.tensor(self.y[idx], dtype=torch.long).to(device)
        return x, y

In [12]:
model_config = {
    'vocab_size': len(vocab),
    'hidden_dim' : 256,
    'embedding_dim' : 200,
    'num_classes' : 2,
    'n_layers': 2,
    'dropout': 0.2
}

In [13]:
train_data = TweetDataset(train['text'], train['target'])
train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True)

In [14]:
for batch in train_dataloader:
    print(batch)
    break

[tensor([[  191,  1122,  2990,  ...,     1,     1,     1],
        [    1,   274, 11380,  ...,     1,     1,     1],
        [   54,   518,  4206,  ...,     1,     1,     1],
        ...,
        [ 4056,   605,  1295,  ...,     1,     1,     1],
        [    3,    67,    24,  ...,     1,     1,     1],
        [  292,   483,  3151,  ...,     1,     1,     1]], device='cuda:0'), tensor([1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
        0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
        0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0], device='cuda:0')]


# Model

# LSTM

# LSTM + Atention

In [15]:
class LSTM_Attention(nn.Module):
    def __init__(self, config):
        super(LSTM_Attention, self).__init__()
        self.config = config
        self.embedding = nn.Embedding(config['vocab_size'], config['embedding_dim'])
        self.lstm = nn.LSTM(config['embedding_dim'], config['hidden_dim'], bidirectional=True)
        self.fc = nn.Linear(config['hidden_dim'] * 2, config['num_classes'])
        self.sigmoid = nn.Sigmoid()
    
    def attention(self, lstm_output, final_state):
        hidden = final_state.view(-1, self.config['hidden_dim'] * 2, 1)
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)
        soft_attn_weights = F.softmax(attn_weights, 1)
        context = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
        return context
    
    def forward(self, X):
        embeds = self.embedding(X).permute(1, 0, 2)
        hidden_state = Variable(torch.zeros(2, len(X), self.config['hidden_dim'])).to(device)
        cell_state = Variable(torch.zeros(2, len(X), self.config['hidden_dim'])).to(device)
        output, (final_hidden_state, final_cell_state) = self.lstm(embeds, (hidden_state, cell_state))
        output = output.permute(1, 0, 2)
        attn_output = self.attention(output, final_hidden_state)
        output = self.fc(attn_output)
        return self.sigmoid(output)

In [16]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    prediction = preds.argmax(dim=1)
    correct = (prediction == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

# Training

In [17]:
model = LSTM_Attention(model_config)
model.to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr = 0.001)

In [18]:
epoch_loss = 0
epoch_acc = 0
for epoch in range(20):
    model.train()
    optimizer.zero_grad()
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        inputs, target = batch
        outputs = model(inputs)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
        epoch_loss += loss
        
        acc = binary_accuracy(outputs, target)
        epoch_acc += acc.item()
    
    print("The training loss at epoch {} is {}; The training accuracy is {}".format(epoch, epoch_loss / len(train_dataloader), 
                                                                                    round(epoch_acc/len(train_dataloader), 3)))
    epoch_loss = 0
    epoch_acc = 0

100%|██████████| 119/119 [00:02<00:00, 56.88it/s]


The training loss at epoch 0 is 0.6793618202209473; The training accuracy is 0.583


100%|██████████| 119/119 [00:01<00:00, 78.40it/s]


The training loss at epoch 1 is 0.5985772609710693; The training accuracy is 0.703


100%|██████████| 119/119 [00:01<00:00, 78.36it/s]


The training loss at epoch 2 is 0.532684862613678; The training accuracy is 0.774


100%|██████████| 119/119 [00:01<00:00, 78.52it/s]


The training loss at epoch 3 is 0.4748460352420807; The training accuracy is 0.833


100%|██████████| 119/119 [00:01<00:00, 78.03it/s]


The training loss at epoch 4 is 0.4469597637653351; The training accuracy is 0.865


100%|██████████| 119/119 [00:01<00:00, 78.32it/s]


The training loss at epoch 5 is 0.4257085919380188; The training accuracy is 0.888


100%|██████████| 119/119 [00:01<00:00, 72.35it/s]


The training loss at epoch 6 is 0.41365182399749756; The training accuracy is 0.9


100%|██████████| 119/119 [00:01<00:00, 78.55it/s]


The training loss at epoch 7 is 0.4097965657711029; The training accuracy is 0.903


100%|██████████| 119/119 [00:01<00:00, 78.70it/s]


The training loss at epoch 8 is 0.3984375596046448; The training accuracy is 0.916


100%|██████████| 119/119 [00:01<00:00, 76.88it/s]


The training loss at epoch 9 is 0.39292415976524353; The training accuracy is 0.92


100%|██████████| 119/119 [00:01<00:00, 74.73it/s]


The training loss at epoch 10 is 0.3886856436729431; The training accuracy is 0.924


100%|██████████| 119/119 [00:01<00:00, 77.13it/s]


The training loss at epoch 11 is 0.3904612064361572; The training accuracy is 0.922


100%|██████████| 119/119 [00:01<00:00, 78.47it/s]


The training loss at epoch 12 is 0.4053955674171448; The training accuracy is 0.906


100%|██████████| 119/119 [00:01<00:00, 74.51it/s]


The training loss at epoch 13 is 0.38732877373695374; The training accuracy is 0.925


100%|██████████| 119/119 [00:01<00:00, 78.65it/s]


The training loss at epoch 14 is 0.3787517249584198; The training accuracy is 0.934


100%|██████████| 119/119 [00:01<00:00, 64.24it/s]


The training loss at epoch 15 is 0.3759597837924957; The training accuracy is 0.937


100%|██████████| 119/119 [00:01<00:00, 74.61it/s]


The training loss at epoch 16 is 0.37242498993873596; The training accuracy is 0.941


100%|██████████| 119/119 [00:01<00:00, 77.53it/s]


The training loss at epoch 17 is 0.3702971637248993; The training accuracy is 0.943


100%|██████████| 119/119 [00:01<00:00, 77.83it/s]


The training loss at epoch 18 is 0.3687030076980591; The training accuracy is 0.944


100%|██████████| 119/119 [00:01<00:00, 74.82it/s]

The training loss at epoch 19 is 0.36853986978530884; The training accuracy is 0.945





In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [45]:
def generate_prediction(inputs, model):
    inputs = torch.tensor(vocab(inputs), dtype=torch.long).to(device).unsqueeze(0)
    outpus =  model(inputs).argmax(dim=1).item()
    return outpus

test['target'] = test['text'].apply(lambda sentence: generate_prediction(sentence, model))

In [48]:
submission = test[['id', 'target']]
submission.to_csv('submission.csv', index=False)