## Importing the necessary libraries

In [30]:
#basic libraries
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import math
import pandas as pd
import re

In [31]:
#text processing libraries
import nltk

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer


#stopwords
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from string import punctuation
import gensim
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

## Loading the training dataset

Here the dataset of all tweets is loaded and blank fields are filled with the empty string. Also, two other columns *tokenized_text* and *tokenized_key* are created to store the tokenized words for both text sentence of the tweet and the keywords, respectively.

In [32]:
data = pd.read_csv('./train.csv')
data.fillna('', inplace=True)

data['tokenized_text'] = " "
data['tokenized_key'] = " "

## Creating the STOPWORDS

*stop_words*, *punctuation*, *gensimwords*, *sklearnwords*, *num_pattern* are regarded as general words like she, he, I will be ignored. Also, *punctuation* referred for the punctuations like ",?!#@, etc. *gensimwords* and *sklearnwords* are the extended version of words like nonetheless, although, otherwise, etc. *num_pattern* finds the numbers and replaces with the empty string.

In [33]:
stop_words = set(stopwords.words('english'))
punctuation = list(punctuation)
gensimwords = STOPWORDS
sklearnwords = ENGLISH_STOP_WORDS
num_pattern = r'[0-9]'

# Function to tokenize/split the words and ignore the STOPWORDS

Here *PorterStemmer* is an object to get the root or basis of words. For instance, playing will be considered as play, cats as cat. Using the functional library *word_tokenize*, the sentence is converted into the tokens or split words and afterwards, the STOPWORDS and stemming are done.

In [34]:

porter = PorterStemmer()
def tokenized_stop(string):
    string = re.sub(num_pattern, '', string)
    string = re.sub(r'http\S+', '', string)
    
    #tokenizing the words
    string = word_tokenize(string)
    
    #ignoring the unnecessary words
    string_list = []
    for words in string:
        words = words.casefold()
        if (words in stop_words) or (words in punctuation) or (words in gensimwords) or (words in sklearnwords):
            pass
        else:
            words = porter.stem(words)
            string_list.append(words)
             
    return string_list



## Storing the tokenized sentence and keywords to newly created columns

In [35]:
for i in range(len(data['text'])):
    data.at[i,'tokenized_text'] = tokenized_stop(data['text'][i])
    data.at[i,'tokenized_key'] = tokenized_stop(data['keyword'][i])

## Combining all the sentences in a list for the categorical features assignment

In [36]:
all_sents=[]
for i in range(len(data['tokenized_text'])):
    string=''
    for j in data['tokenized_text'][i]:
        string = string  + j + ' '
    all_sents.append(string)


## Example of assigning the numerics to the words using *sklearn*

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
#vectorizer.fit(all_words)
X_train_counts = vectorizer.fit_transform(all_sents)

## Example of converting the numerics into weighted average values using *sklearn*

In [38]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

## Training input and output variables

In [39]:
x_train = all_sents
y_train = data['target']


## Pipeline of the model

This pipeline contains categorical feature conversion and grid search model of KNeighbors classifier. The pipeline simplifies the long textual code into single line.

In [49]:
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

text_clf = Pipeline([('vect', CountVectorizer()),\
                     ('tfidf', TfidfTransformer()),\
                     ('clf', KNeighborsClassifier())])

Assigning the variables for all parameters given in the pipeline model.

In [58]:
parameters = {'tfidf__use_idf': (True, False),\
               'clf__weights': ('uniform','distance'),\
                'clf__algorithm': ('ball_tree','kd_tree','brute','auto'),\
             'clf__n_neighbors': (300,1000) }

## Running the KNeighbors pipeline for the training dataset using GridSearchCV

In [59]:
grid_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
grid_clf = grid_clf.fit(x_train, y_train)



## Best score of the training sample and best parameters

In [60]:
print(grid_clf.best_score_)
print(grid_clf.best_params_)

0.6990759299156257
{'clf__algorithm': 'ball_tree', 'clf__n_neighbors': 300, 'clf__weights': 'distance', 'tfidf__use_idf': True}


## Loading the test dataset

In [61]:
test_data = pd.read_csv('./test.csv')
test_data.fillna('', inplace=True)

test_data['tokenized_text'] = " "
test_data['tokenized_key'] = " "


## Implementing the same text processing to the test dataset

In [62]:
for i in range(len(test_data['text'])):
    test_data.at[i,'tokenized_text'] = tokenized_stop(test_data['text'][i])
    test_data.at[i,'tokenized_key'] = tokenized_stop(test_data['keyword'][i])

In [63]:
all_sents=[]
for i in range(len(test_data['tokenized_text'])):
    string=''
    for j in test_data['tokenized_text'][i]:
        string = string  + j + ' '
    all_sents.append(string)

## Test data input

In [64]:
x_test = all_sents

## Prediction of the accidents or natural disasters from the textual tweets

In [65]:
predicted = grid_clf.predict(x_test)
print(predicted)

[1 1 1 ... 1 1 0]
