In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/nlp-getting-started/sample_submission.csv


In [12]:
# Load the training and test splits of the competition data into DataFrames.
train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
# Preview the first ten training rows.
train_data.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


### Convert each tweet to words

A helper that strips HTML tags from a tweet, lower-cases it, splits it into words, removes English stop words, and stems the remaining words.

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import re
from bs4 import BeautifulSoup

_ENGLISH_STOPWORDS = None  # lazily-filled cache; populated on the first call below


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Run the download check and read the corpus a single time per kernel
        # instead of once per tweet (download) and once per word (corpus read).
        nltk.download("stopwords", quiet=True)
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def tweet_to_words(review):
    """Convert a raw tweet into a list of cleaned, stemmed word tokens.

    Steps: strip HTML markup, lower-case, replace every character that is not
    an ASCII letter or digit with a space, split on whitespace, drop English
    stop words, and Porter-stem what is left.

    Args:
        review: the raw tweet text (the parameter name is kept for
            backward compatibility).

    Returns:
        list of stemmed tokens, in their original order.
    """
    stemmer = PorterStemmer()           # one stemmer reused for every word of this tweet
    stop_words = _english_stopwords()   # set membership test instead of a list scan

    text = BeautifulSoup(review, "html.parser").get_text()  # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())       # Lower-case, keep only alphanumerics
    # Remove stop words first, then stem the survivors (same order as before).
    return [stemmer.stem(w) for w in text.split() if w not in stop_words]

In [4]:
# Sanity check: run the cleaning pipeline on a single tweet (index 10 of the training text)
tweet_to_words(train_data.text[10])

['three', 'peopl', 'die', 'heat', 'wave', 'far']

In [18]:
# Tokenise every tweet in both splits; each entry is the word list of one tweet

words_train = list(map(tweet_to_words, train_data.text))
words_test = list(map(tweet_to_words, test_data.text))

### Create a word dictionary


In [9]:
# Spot-check one tokenised tweet (position 10 in the training list)
words_train[10]

['three', 'peopl', 'die', 'heat', 'wave', 'far']

In [14]:
# Collect the vocabulary from all the tweets and count how often each word occurs.
# Keep only the most frequent words (vocab_size - 2 of them, i.e. 498 by default) and give each an integer id.

def build_dict(data, vocab_size = 500):
    """Map the most frequently appearing words in `data` to unique integer ids.

    Ids 0 and 1 are left free for the 'no word' and 'infrequent' labels, so the
    returned dictionary has at most ``vocab_size - 2`` entries: the most
    frequent word gets id 2, the next one id 3, and so on.

    Args:
        data: iterable of sentences, where each sentence is an iterable of words.
        vocab_size: total vocabulary size including the two reserved ids.
            Values below 2 leave no room for real words and return ``{}``.

    Returns:
        dict mapping each kept word (exactly as it appears in `data`) to its
        integer id. Words with equal counts are ordered alphabetically.
    """
    from collections import Counter  # local import keeps this cell self-contained

    # Flatten the list of sentences and count every word in one pass.
    word_count = Counter(word for sentence in data for word in sentence)

    # Most frequent first; ties are broken alphabetically so the ids are deterministic.
    sorted_words = sorted(word_count, key=lambda word: (-word_count[word], word))

    # Clamp the number of kept words: without this, vocab_size < 2 would become a
    # negative slice bound and silently keep almost the whole vocabulary.
    n_words = max(vocab_size - 2, 0)

    # The +2 offset saves room for the 'no word' and 'infrequent' labels.
    return {word: idx + 2 for idx, word in enumerate(sorted_words[:n_words])}

In [16]:
# Build the word -> id vocabulary from the tokenised training tweets only,
# then print the whole mapping for inspection.
word_dict = build_dict(words_train)
print(word_dict)

{'co': 2, 'http': 3, 'like': 4, 'fire': 5, 'get': 6, 'bomb': 7, 'new': 8, 'via': 9, '2': 10, 'one': 11, 'go': 12, 'peopl': 13, 'news': 14, 'kill': 15, 'burn': 16, 'year': 17, 'video': 18, 'flood': 19, 'time': 20, 'crash': 21, 'emerg': 22, 'disast': 23, 'bodi': 24, 'attack': 25, 'build': 26, 'day': 27, 'fatal': 28, 'look': 29, 'say': 30, 'home': 31, 'love': 32, 'polic': 33, 'would': 34, '3': 35, 'u': 36, 'make': 37, 'famili': 38, 'evacu': 39, 'still': 40, 'storm': 41, 'train': 42, 'see': 43, 'us': 44, 'come': 45, 'back': 46, 'know': 47, 'california': 48, 'suicid': 49, '1': 50, 'bag': 51, 'live': 52, 'watch': 53, 'want': 54, 'collaps': 55, 'man': 56, 'world': 57, 'car': 58, 'death': 59, 'derail': 60, 'scream': 61, 'got': 62, 'rt': 63, 'first': 64, 'take': 65, 'caus': 66, 'let': 67, 'think': 68, 'nuclear': 69, 'two': 70, 'drown': 71, 'today': 72, 'war': 73, 'need': 74, 'work': 75, 'accid': 76, 'dead': 77, 'wreck': 78, 'deton': 79, 'youtub': 80, 'destroy': 81, '4': 82, '5': 83, 'hijack': 8

In [17]:
# Transform every tweet into a list of integers by replacing each word with its id from word_dict.
# Each tweet is padded (or truncated) to `pad` = 280 entries, used here as the maximum number of words per tweet.

def convert_and_pad(word_dict, sentence, pad=280):
    """Encode one tokenised sentence as a fixed-length list of integer ids.

    Words found in `word_dict` are replaced by their id, every other word by 1
    ('infrequent'). The sentence is cut to `pad` words and, when shorter,
    filled up with 0 ('no word').

    Returns:
        (ids, length): the list of `pad` ids and the number of real words
        kept, i.e. ``min(len(sentence), pad)``.
    """
    NOWORD = 0  # id of the 'no word' (padding) category
    INFREQ = 1  # id of words that do not appear in word_dict

    # Encode the words that fit, falling back to INFREQ for unknown ones.
    encoded = [word_dict.get(token, INFREQ) for token in sentence[:pad]]

    # Fill the remaining positions with the padding id.
    encoded.extend([NOWORD] * (pad - len(encoded)))

    return encoded, min(len(sentence), pad)

def convert_and_pad_data(word_dict, data, pad=280):
    """Encode every sentence in `data` with `convert_and_pad`.

    Returns:
        (ids, lengths): a NumPy array built from the per-sentence id lists and
        a NumPy array of the per-sentence lengths reported by `convert_and_pad`.
    """
    encoded_pairs = [convert_and_pad(word_dict, sentence, pad) for sentence in data]

    # Split the (ids, length) pairs back into two parallel lists.
    id_rows = [ids for ids, _ in encoded_pairs]
    row_lengths = [length for _, length in encoded_pairs]

    return np.array(id_rows), np.array(row_lengths)

In [19]:
# Encode both splits with the vocabulary built from the training tweets;
# each call returns the padded id matrix and the per-tweet word counts.
train_X, train_X_len = convert_and_pad_data(word_dict, words_train)
test_X, test_X_len = convert_and_pad_data(word_dict, words_test)

In [None]:
# Inspect one encoded tweet: its padded id vector, then its number of real (unpadded) words.
print(train_X[200])
print(train_X_len[200])