In [15]:
import pandas as pd
import numpy as np

In [16]:
train_data = pd.read_csv('train.csv')
train_data.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [17]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [18]:
for i in range(50,60):
    print(train_data.text.iloc[i])
    print(train_data.target.iloc[i])

Deputies: Man shot before Brighton home set ablaze http://t.co/gWNRhMSO8k
1
Man wife get six years jail for setting ablaze niece
http://t.co/eV1ahOUCZA
1
SANTA CRUZ ÛÓ Head of the St Elizabeth Police Superintendent Lanford Salmon has r ... - http://t.co/vplR5Hka2u http://t.co/SxHW2TNNLf
0
Police: Arsonist Deliberately Set Black Church In North CarolinaåÊAblaze http://t.co/pcXarbH9An
1
Noches El-Bestia '@Alexis_Sanchez: happy to see my teammates and training hard ?? goodnight gunners.?????? http://t.co/uc4j4jHvGR'
0
#Kurds trampling on Turkmen flag later set it ablaze while others vandalized offices of Turkmen Front in #Diyala http://t.co/4IzFdYC3cg
1
TRUCK ABLAZE : R21. VOORTREKKER AVE. OUTSIDE OR TAMBO INTL. CARGO SECTION. http://t.co/8kscqKfKkF
1
Set our hearts ablaze and every city was a gift And every skyline was like a kiss upon the lips @Û_ https://t.co/cYoMPZ1A0Z
0
They sky was ablaze tonight in Los Angeles. I'm expecting IG and FB to be filled with sunset shots if I know my p

Some very basic cleanup work to start.  Mostly just set all URLs and @terms to respective keywords

In [19]:
from string import punctuation
import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

stopWords = set(stopwords.words('english'))

def clean_text(text):
    # change URL/@ tags to keywords
    text = ' '.join(['URL' if 'http://' in i or 'https://' in i else i for i in text.split(' ') ])
    text = ' '.join(['ATUSER' if len(i)>0 and i[0]=='@' else i for i in text.split(' ') ])
    #remove punctuation
    text = ''.join([c for c in text if c not in punctuation])
    #replace newline with space
    text = re.sub('\n',' ',text)
    #remove extra spaces
    text = re.sub(' +', ' ', text)
    #remove stop words
    text = ' '.join([i for i in text.lower().split(' ') if i not in stopWords])
    return text

text = [clean_text(i) for i in train_data.text]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\duran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
#inspect text
text

['deeds reason earthquake may allah forgive us',
 'forest fire near la ronge sask canada',
 'residents asked shelter place notified officers evacuation shelter place orders expected',
 '13000 people receive wildfires evacuation orders california ',
 'got sent photo ruby alaska smoke wildfires pours school ',
 'rockyfire update california hwy 20 closed directions due lake county fire cafire wildfires',
 'flood disaster heavy rain causes flash flooding streets manitou colorado springs areas',
 'im top hill see fire woods',
 'theres emergency evacuation happening building across street',
 'im afraid tornado coming area',
 'three people died heat wave far',
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding',
 'raining flooding florida tampabay tampa 18 19 days ive lost count ',
 'flood bago myanmar arrived bago',
 'damage school bus 80 multi car crash breaking ',
 'whats man',
 'love fruits',
 'summer lovely',
 'car fast',
 'goooooooaaaaaal',
 'r

In [21]:
all_words = ' '.join(text).split(' ')

In [22]:
# feel free to use this import 
from collections import Counter

## Build a dictionary that maps words to integers
counts = Counter(all_words)
vocab = sorted(counts,key=counts.get, reverse=True)
vocab_to_int = {word:i for i,word in enumerate(vocab,1)}

## use the dict to tokenize each review in reviews_split
## store the tokenized reviews in reviews_ints
tweet_ints = []
for tweet in text:
    tweet_ints.append([vocab_to_int[word] for word in tweet.split(' ')])

In [23]:
#how big is vocab?
len(counts.keys())

15817

In [24]:
vocab[0:20]

['url',
 'atuser',
 '',
 'like',
 'im',
 'amp',
 'fire',
 'get',
 'new',
 'via',
 'people',
 'one',
 'dont',
 'news',
 'video',
 'us',
 '2',
 'emergency',
 'disaster',
 'police']

In [38]:
import spacy
# Load the spacy model that you have installed
nlp = spacy.load('en_core_web_md')

#run vocab through spacy
doc = nlp(' '.join(vocab))

#use .vector to obtain vector of dim 300 for each word (if not in spacy vocab, use all zeros)
embedding_matrix = np.zeros((len(vocab)+1,300))
for index in range(len(vocab)):
    embedding_matrix[index+1] = doc[index].vector

spacy_embedding_dim = embedding_matrix.shape

(15818, 300)

In [11]:
# outlier review stats
tweet_lens = Counter([len(x) for x in tweet_ints])
print("Zero-length reviews: {}".format(tweet_lens[0]))
print("Maximum review length: {}".format(max(tweet_lens)))

Zero-length reviews: 0
Maximum review length: 25


In [12]:
# pad tweets to equal legnths
def pad_features(reviews_ints, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's 
        or truncated to the input seq_length.
    '''
    
    # getting the correct rows x cols shape
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)

    # for each review, I grab that review and 
    for i, row in enumerate(reviews_ints):
        features[i, -len(row):] = np.array(row)[:seq_length]
    
    return features

seq_length = 31

features = pad_features(tweet_ints, seq_length=seq_length)

In [13]:
split_frac = 0.8

## split data into training, validation data (features and labels, x and y)

split_idx = int(len(features)*split_frac)
train_x, test_x = features[:split_idx], features[split_idx:]
train_y, test_y = train_data.target.tolist()[:split_idx], train_data.target.tolist()[split_idx:]

print(train_x[0])
print(train_y[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0 4305  715  153   56
 1567 4306   16]
1


In [14]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(np.array(train_x)), torch.from_numpy(np.array(train_y)))
test_data = TensorDataset(torch.from_numpy(np.array(test_x)), torch.from_numpy(np.array(test_y)))

# dataloaders
batch_size = 64

# make sure the SHUFFLE your training data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size,drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size,drop_last=True)

In [15]:
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([64, 31])
Sample input: 
 tensor([[    0,     0,     0,  ...,     6,  3052,     1],
        [    0,     0,     0,  ...,  9705,  9706,  9707],
        [    0,     0,     0,  ...,  2824,   958,   350],
        ...,
        [    0,     0,     0,  ...,   248,   304,     1],
        [    0,     0,     0,  ...,  1291,   284,  2164],
        [    0,     0,     0,  ...,   287, 12549,     1]], dtype=torch.int32)

Sample label size:  torch.Size([64])
Sample label: 
 tensor([1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
        1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0,
        1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1], dtype=torch.int32)


In [16]:
train_on_gpu=torch.cuda.is_available()

In [18]:
import torch.nn as nn

class ClassifierRNN(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(ClassifierRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True,bidirectional=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.5)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
      
    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        #batch_size = x.size(0)

        # embeddings and lstm_out
        x = x.long()
        embeds = self.embedding(x)

        lstm_out, hidden = self.lstm(embeds, hidden)
    
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers*2, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers*2, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

In [19]:
# Instantiate the model w/ hyperparams
vocab_size = len(vocab_to_int)+1 # +1 for the 0 padding + our word tokens
output_size = 1
embedding_dim = spacy_embedding_dim[1]
hidden_dim = 512
n_layers = 3

net = ClassifierRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

print(net)

ClassifierRNN(
  (embedding): Embedding(15818, 800)
  (lstm): LSTM(800, 512, num_layers=3, batch_first=True, dropout=0.5, bidirectional=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [20]:
# loss and optimization functions
lr=0.00001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


In [21]:
# training params

epochs = 20

counter = 0
print_every = 100
clip=1 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()
        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in test_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                if(train_on_gpu):
                    inputs, labels = inputs.cuda(), labels.cuda()

                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            if val_losses[-1] < 0.55:
                break
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 2/20... Step: 100... Loss: 0.702789... Val Loss: 0.687354
Epoch: 3/20... Step: 200... Loss: 0.666566... Val Loss: 0.684619
Epoch: 4/20... Step: 300... Loss: 0.687817... Val Loss: 0.677594
Epoch: 5/20... Step: 400... Loss: 0.601820... Val Loss: 0.649357
Epoch: 6/20... Step: 500... Loss: 0.654787... Val Loss: 0.643117
Epoch: 7/20... Step: 600... Loss: 0.618338... Val Loss: 0.643223
Epoch: 8/20... Step: 700... Loss: 0.687192... Val Loss: 0.623717
Epoch: 9/20... Step: 800... Loss: 0.612813... Val Loss: 0.622212
Epoch: 10/20... Step: 900... Loss: 0.582900... Val Loss: 0.632042
Epoch: 11/20... Step: 1000... Loss: 0.454386... Val Loss: 0.602340
Epoch: 18/20... Step: 1400... Loss: 0.417614... Val Loss: 0.565657
Epoch: 19/20... Step: 1500... Loss: 0.364351... Val Loss: 0.547358
Epoch: 20/20... Step: 1600... Loss: 0.304584... Val Loss: 0.561751


In [22]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# iterate over test data
for inputs, labels in test_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()
    
    # get predicted outputs
    output, h = net(inputs, h)
    
    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer
    
    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.565
Test accuracy: 0.738
