In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/nlp-getting-started/train.csv


In [3]:
train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
train_data.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [4]:
# remove all NaN 
train_data = train_data.fillna('a')
test_data = test_data.fillna('a')

### Convert each tweet to words

A method to remove all html tags, and tokenize the tweet. Remove stop words

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import re
from bs4 import BeautifulSoup

def tweet_to_words(review):
    nltk.download("stopwords", quiet=True)
    stemmer = PorterStemmer()
    
    text = BeautifulSoup(review, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower()) # Convert to lower case
    words = text.split() # Split string into words
    words = [w for w in words if w not in stopwords.words("english")] # Remove stopwords
    words = [PorterStemmer().stem(w) for w in words] # stem
    
    return words

In [6]:
# testing on one tweet
tweet_to_words(train_data.text[10])

['three', 'peopl', 'die', 'heat', 'wave', 'far']

In [7]:
# Apply it on all tweets and keywords

words_train = [tweet_to_words(tweet) for tweet in train_data.text]
words_test = [tweet_to_words(tweet) for tweet in test_data.text]

keywords_train = [tweet_to_words(tweet) for tweet in train_data.keyword]
keywords_test = [tweet_to_words(tweet) for tweet in test_data.keyword]

### Create a word dictionary


In [8]:
#checking any one tweet
words_train[10]

['three', 'peopl', 'die', 'heat', 'wave', 'far']

In [9]:
# Get all the vocab from all the tweets
# list the top 500 frequent words. with the count of occurance

def build_dict(data, vocab_size = 500):
    """Construct and return a dictionary mapping each of the most frequently appearing words to a unique integer."""
    
    # Determine how often each word appears in `data`. Note that `data` is a list of sentences and that a
    # sentence is a list of words.
    '''
    Creating a flat list - 
    for sublist in words_train:
        for item in sublist:
            flat_list.append(item)
        
    '''
    flat_list = [item for sublist in data for item in sublist]
    word_value, frequency = np.unique(flat_list, return_counts=True)
    word_count = dict(zip(word_value, frequency)) # A dict storing the words that appear in the reviews along with how often they occur
    
    # TODO: Sort the words found in `data` so that sorted_words[0] is the most frequently appearing word and
    #       sorted_words[-1] is the least frequently appearing word.
    
    
    sorted_list = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
    sorted_words = [item[0] for item in sorted_list]
    
    word_dict = {} # This is what we are building, a dictionary that translates words into integers
    for idx, word in enumerate(sorted_words[:vocab_size - 2]): # The -2 is so that we save room for the 'no word'
        word_dict[word] = idx + 2                              # 'infrequent' labels
        
    return word_dict

In [10]:
word_dict = build_dict(words_train)
print(word_dict)

{'co': 2, 'http': 3, 'like': 4, 'fire': 5, 'get': 6, 'bomb': 7, 'new': 8, 'via': 9, '2': 10, 'one': 11, 'go': 12, 'peopl': 13, 'news': 14, 'kill': 15, 'burn': 16, 'year': 17, 'video': 18, 'flood': 19, 'time': 20, 'crash': 21, 'emerg': 22, 'disast': 23, 'bodi': 24, 'attack': 25, 'build': 26, 'day': 27, 'fatal': 28, 'look': 29, 'say': 30, 'home': 31, 'love': 32, 'polic': 33, 'would': 34, '3': 35, 'u': 36, 'make': 37, 'famili': 38, 'evacu': 39, 'still': 40, 'storm': 41, 'train': 42, 'see': 43, 'us': 44, 'come': 45, 'back': 46, 'know': 47, 'california': 48, 'suicid': 49, '1': 50, 'bag': 51, 'live': 52, 'watch': 53, 'want': 54, 'collaps': 55, 'man': 56, 'world': 57, 'car': 58, 'death': 59, 'derail': 60, 'scream': 61, 'got': 62, 'rt': 63, 'first': 64, 'take': 65, 'caus': 66, 'let': 67, 'think': 68, 'nuclear': 69, 'two': 70, 'drown': 71, 'today': 72, 'war': 73, 'need': 74, 'work': 75, 'accid': 76, 'dead': 77, 'wreck': 78, 'deton': 79, 'youtub': 80, 'destroy': 81, '4': 82, '5': 83, 'hijack': 8

In [11]:
# Transform all tweets with each word to an integer corresponding to its rank in the words_dict
# pad is 280 as the max word limit for each 

def convert_and_pad(word_dict, sentence, pad=280):
    NOWORD = 0 # We will use 0 to represent the 'no word' category
    INFREQ = 1 # and we use 1 to represent the infrequent words, i.e., words not appearing in word_dict
    
    working_sentence = [NOWORD] * pad
    
    for word_index, word in enumerate(sentence[:pad]):
        if word in word_dict:
            working_sentence[word_index] = word_dict[word]
        else:
            working_sentence[word_index] = INFREQ
            
    return working_sentence, min(len(sentence), pad)

def convert_and_pad_data(word_dict, data, pad=280):
    result = []
    lengths = []
    
    for sentence in data:
        converted, leng = convert_and_pad(word_dict, sentence, pad)
        result.append(converted)
        lengths.append(leng)
        
    return np.array(result), np.array(lengths)

In [12]:
train_X, train_X_len = convert_and_pad_data(word_dict, words_train, 100)
test_X, test_X_len = convert_and_pad_data(word_dict, words_test, 100)
keyword_train_rank, dummy = convert_and_pad_data(word_dict,keywords_train,1)
keyword_test_rank, dummy = convert_and_pad_data(word_dict,keywords_test,1)

In [13]:
print(train_X[1])
print(train_X_len[1])

[152   5 201   1   1   1   1   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]
7


In [14]:
# train_data['word_ranks'] = train_X
# train_data['tweet_size'] = train_X_len

train_df = pd.concat([pd.DataFrame(train_X_len), pd.DataFrame(keyword_train_rank), pd.DataFrame(train_X)], axis=1)
train_df

Unnamed: 0,0,0.1,0.2,1,2,3,4,5,6,7,...,90,91,92,93,94,95,96,97,98,99
0,7,0,1,1,231,98,1,1,44,0,...,0,0,0,0,0,0,0,0,0,0
1,7,0,152,5,201,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,11,0,1,1,1,432,1,235,39,1,...,0,0,0,0,0,0,0,0,0,0
3,8,0,1,1,13,1,113,39,337,48,...,0,0,0,0,0,0,0,0,0,0
4,9,0,62,1,148,1,1,227,113,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,11,0,70,1,1,1,320,55,1,31,...,0,0,0,0,0,0,0,0,0,0
7609,12,0,1,1,1,1,256,5,48,103,...,0,0,0,0,0,0,0,0,0,0
7610,11,0,1,1,1,1,1,1,441,1,...,0,0,0,0,0,0,0,0,0,0
7611,17,0,33,183,487,1,126,58,253,1,...,0,0,0,0,0,0,0,0,0,0


In [15]:
train_label = train_data.target
train_label.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [16]:
import torch
import torch.utils.data

# Turn the input pandas dataframe into tensors
train_y = torch.from_numpy(train_label.values).float().squeeze()
train_X = torch.from_numpy(train_df.values).long()

# Build the dataset
train_ds = torch.utils.data.TensorDataset(train_X, train_y)
# Build the dataloader
train_dl = torch.utils.data.DataLoader(train_ds)v

In [17]:
import torch.nn as nn

class LSTMClassifier(nn.Module):
    """
    This is the simple RNN model to check for real disaster
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        """
        Initialize the model by settingg up the various layers.
        """
        super(LSTMClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.dense = nn.Linear(in_features=hidden_dim, out_features=1)
        self.sig = nn.Sigmoid()
        
        self.word_dict = None

    def forward(self, x):
        """
        Perform a forward pass of our model on some input.
        """
        x = x.t()
        lengths = x[0,:]
        reviews = x[1:,:]
        embeds = self.embedding(reviews)
        lstm_out, _ = self.lstm(embeds)
        out = self.dense(lstm_out)
        out = out[lengths - 1, range(len(lengths))]
        return self.sig(out.squeeze())

In [18]:
def train(model, train_loader, epochs, optimizer, loss_fn, device):
    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0
        for batch in train_loader:         
            batch_X, batch_y = batch
            
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            
            optimizer.zero_grad()
            
            output = model.forward(batch_X)
            
            loss = loss_fn(output, batch_y)
            loss.backward()
            
            optimizer.step()
            
            total_loss += loss.data.item()
        print("Epoch: {}, BCELoss: {}".format(epoch, total_loss / len(train_loader)))

In [19]:
import torch.optim as optim
# from train.model import LSTMClassifier

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(102, 100, 500).to(device)
optimizer = optim.Adam(model.parameters())
loss_fn = torch.nn.BCELoss()

train(model, train_dl, 5, optimizer, loss_fn, device)

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


Epoch: 1, BCELoss: 0.5794119907909573
Epoch: 2, BCELoss: 0.4698074878912207
Epoch: 3, BCELoss: 0.39927378615749887
Epoch: 4, BCELoss: 0.3328204752278715
Epoch: 5, BCELoss: 0.2641004881464704


In [25]:
test_df = pd.concat([pd.DataFrame(test_X_len), pd.DataFrame(keyword_test_rank), pd.DataFrame(test_X)], axis=1)


# Turn the input pandas dataframe into tensors

test_X = torch.from_numpy(test_df.values).long()

# Build the dataset
test_ds = torch.utils.data.TensorDataset(test_X)
# Build the dataloader
test_dl = torch.utils.data.DataLoader(train_ds)

for batch in test_dl:
    prediction = model(batch)
    print(prediction)
# train_ds

AttributeError: 'DataLoader' object has no attribute 't'