In [24]:
import pandas as pd
import numpy as np
from random import shuffle
import random
import matplotlib.pyplot as plt
#import seaborn as sns
#from wordcloud import WordCloud, STOPWORDS 
import re
import os
from tqdm.notebook import tqdm
from collections import Counter
import warnings
import nltk
warnings.filterwarnings('ignore')
plt.style.use('ggplot')
import pickle

In [25]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

In [26]:
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('punkt_tab')

In [27]:
#!pip install nltk==3.8.1 > /dev/null

In [28]:
# Notebook configuration: input CSV, vocabulary size, and the output folder
# whose name encodes the vocabulary size.
size_vocab = 5000
file_name = 'data/train.csv'
folder_path = f'tweets_{size_vocab}'

In [29]:
def data_preprocess(file_name, vocab_size=None, random_state=None):
    '''
    Load the tweet CSV, balance the two classes, split the data, and encode the
    training and validation tweets as fixed-length integer vectors.

    input:
        file_name: csv file with keyword, location, text, and target columns
        vocab_size: number of most frequent training words kept in the vocabulary;
            when None, the notebook-level `size_vocab` is used
        random_state: optional seed for the oversampling draw; when None, the draw
            uses NumPy's global random state
    output: x_train_padded, y_train, x_valid_padded, y_valid, X_test, y_test, word_list
        x_train_padded: training data (60% of the balanced data), integer-encoded and padded
        y_train: labels of the training data
        x_valid_padded: validation data (20% of the balanced data), integer-encoded and padded
        y_valid: labels of the validation data
        X_test: hold-out test data (20% of the balanced data), returned as raw tweet strings
        y_test: labels of the hold-out test data
        word_list: every token (repeats included) of the tokenized training tweets
            after stop-word removal; the vocabulary is built from this list
    '''
    if vocab_size is None:
        vocab_size = size_vocab  # fall back to the notebook-level setting

    # There is an imbalance in true and false data. Disaster tweets are fewer than non-disaster ones.
    # By adding randomly selected disaster tweets (duplicates), the total data set will be balanced.
    def pos_oversampling(df):
        # reset_index(drop=True) returns a fresh frame, so the filtered slice is not
        # mutated and no stray "index" column is carried into the result.
        pos_df = df[df["target"] == 1].reset_index(drop=True)
        neg_df = df[df["target"] == 0]
        # Derive the number of duplicates from the data instead of hard-coding it.
        add_pos = len(neg_df) - len(pos_df)
        frames = [neg_df, pos_df]
        if add_pos > 0 and len(pos_df) > 0:
            rng = np.random if random_state is None else np.random.RandomState(random_state)
            # Draw without replacement while that is possible; fall back to drawing
            # with replacement when more duplicates are needed than positives exist.
            pos_add_indices = rng.choice(len(pos_df), add_pos, replace=add_pos > len(pos_df))
            frames.append(pos_df.iloc[pos_add_indices])
        return pd.concat(frames, ignore_index=True)

    def tokenize_tweets(tweets):
        tokenized_tweets = []
        for tweet in tweets:
            tweet = re.sub(r'[,!?;-]', '.', tweet)  # Punctuations are replaced by "."
            tokenized_tweet = nltk.word_tokenize(tweet)
            # Lower case and drop non-alphabetical tokens (the "." marker is kept)
            tokenized_tweet = [token.lower() for token in tokenized_tweet
                               if token.isalpha() or token == '.']
            tokenized_tweets.append(tokenized_tweet)
        return tokenized_tweets

    def stopwords_tweets(tokenized_tweets):
        # A set gives constant-time membership checks for every token.
        english_stopwords = set(stopwords.words('english'))
        return [[t for t in tokenized_tweet if t not in english_stopwords]
                for tokenized_tweet in tokenized_tweets]

    # flatten the embedded lists to create one long word list.
    def flatten(xss):
        return [x for xs in xss for x in xs]

    # Here tweet in the argument is a tokenized tweet without stop words.
    def padded_vector(tweet, vocab_dict, max_len):
        # Truncate to max_len, drop out-of-vocabulary words, then right-pad with 0.
        int_tweet = [vocab_dict[word] for word in tweet[:max_len] if word in vocab_dict]
        return int_tweet + [0] * (max_len - len(int_tweet))

    df = pd.read_csv(file_name)

    # oversample the positive tweets
    # NOTE(review): oversampling happens before the split below, so a duplicated
    # tweet can end up in both the training set and the validation/test sets.
    balanced_df = pos_oversampling(df)

    # replace empty cells of the text columns with the string "NA"
    text_columns = ['keyword', 'location', 'text']
    balanced_df[text_columns] = balanced_df[text_columns].fillna('NA')

    # Concatenate keyword, location, and text and name the new column as tweet.
    balanced_df['tweet'] = balanced_df['keyword'] + ' ' + balanced_df['location'] + ' ' + balanced_df['text']

    # splitting data; train 60%, valid 20%, and test 20%
    X = balanced_df['tweet'].values
    y = balanced_df['target'].values
    X_tr, X_test, y_tr, y_test = train_test_split(X, y, test_size=0.20, random_state=38)
    X_train, X_valid, y_train, y_valid = train_test_split(X_tr, y_tr, test_size=0.25, random_state=28)

    # Create word corpus using X_train only
    tokenized_tweets_train_wo_stopwords = stopwords_tweets(tokenize_tweets(X_train))
    word_list = flatten(tokenized_tweets_train_wo_stopwords)
    vocab_count = Counter(word_list)

    # Keep the vocab_size most frequent words; index 0 is reserved for padding.
    vocab = sorted(vocab_count, key=vocab_count.get, reverse=True)[:vocab_size]
    vocab_dict = {w: i + 1 for i, w in enumerate(vocab)}

    # Tokenize the validation and test tweets; they are needed for the length statistic.
    tokenized_tweets_valid_wo_stopwords = stopwords_tweets(tokenize_tweets(X_valid))
    tokenized_tweets_test_wo_stopwords = stopwords_tweets(tokenize_tweets(X_test))

    # maximum length of tweets over all three splits (0 when there are no tweets)
    max_len = max(
        (len(w) for w in tokenized_tweets_train_wo_stopwords
         + tokenized_tweets_valid_wo_stopwords
         + tokenized_tweets_test_wo_stopwords),
        default=0,
    )

    # Using this max_len, let's build padded vectors
    x_train_padded = [padded_vector(x, vocab_dict, max_len) for x in tokenized_tweets_train_wo_stopwords]
    x_valid_padded = [padded_vector(x, vocab_dict, max_len) for x in tokenized_tweets_valid_wo_stopwords]

    return x_train_padded, y_train, x_valid_padded, y_valid, X_test, y_test, word_list

In [30]:
# Run the preprocessing pipeline once. The train/validation tweets come back
# integer-encoded and padded; the test tweets (X_test) come back as raw text.
x_train_padded, y_train, x_valid_padded, y_valid, X_test, y_test, word_list = data_preprocess(file_name)

In [31]:
# Sanity check: report how many padded tweets are in the train and validation splits.
print(f" The length of train padded: {len(x_train_padded)}")
print(f" The length of valid padded: {len(x_valid_padded)}")

 The length of train padded: 5210
 The length of valid padded: 1737


In [32]:
def list_to_csv(list_data, folder_path, file_name):
    '''
    Write a list (or array) to `folder_path/file_name` as a CSV file without the index column.

    input:
        list_data: data accepted by the pandas DataFrame constructor (flat or nested list, array)
        folder_path: destination folder; it is created if it does not exist yet
        file_name: name of the CSV file inside folder_path
    output: None
    '''
    # to_csv does not create missing directories, so make sure the folder exists first.
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)
    pd.DataFrame(list_data).to_csv(os.path.join(folder_path, file_name), index=False)

In [None]:
# Save the pre-processed data as csv files for later use.

In [33]:
# Persist every preprocessing artifact under folder_path, one CSV per artifact.
# The dict keeps insertion order, so files are written in the order listed here.
artifacts_by_file = {
    'x_train_padded.csv': x_train_padded,
    'y_train.csv': y_train,
    'x_valid_padded.csv': x_valid_padded,
    'y_valid.csv': y_valid,
    'X_test.csv': X_test,
    'y_test.csv': y_test,
    'word_list.csv': word_list,
}
for csv_name, artifact in artifacts_by_file.items():
    list_to_csv(artifact, folder_path, csv_name)

### Create final dataset for training

In [34]:
# Number of tweets per mini-batch; passed to both DataLoaders below.
batch_size = 50

In [35]:
# Convert the lists of padded integer vectors to NumPy arrays so that
# torch.from_numpy can wrap them in the next cell.
x_train_padded = np.array(x_train_padded)
x_valid_padded = np.array(x_valid_padded)

In [36]:
# Create tensor datasets: each item pairs one padded tweet vector with its label.
train_data = TensorDataset(torch.from_numpy(x_train_padded), torch.from_numpy(y_train))
valid_data = TensorDataset(torch.from_numpy(x_valid_padded), torch.from_numpy(y_valid))

In [37]:
# Set up dataloaders with shuffle on. drop_last=True discards the final batch
# when it holds fewer than batch_size samples, so every batch has the same size.
# NOTE(review): the validation loader also shuffles and drops its last partial
# batch, so some validation samples are skipped each pass — confirm this is intended.
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, drop_last=True)
valid_dataloader = DataLoader(valid_data, batch_size=batch_size, shuffle=True, drop_last=True)

In [38]:
# Pull one batch from the training loader and display the shapes of its
# feature and label tensors.
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

Feature batch shape: torch.Size([50, 28])
Labels batch shape: torch.Size([50])


In [39]:
# Show the first example of that batch: its padded token ids and its label.
tweet_0 = train_features[0].squeeze()
label_0 = train_labels[0]
print(f"Tweet: {tweet_0}\nLabel: {label_0}")

Tweet: tensor([ 220,  691,  507,   15,    1, 1681,    1,  566,  220,  690,  909,    1,
        1326,  647,  551, 1682,  460,    1,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0])
Label: 0
