In [None]:
# NLTK Stands for Natural Language Toolkit
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

# Import OS Library
import os

# Import string library
import string

# Import pandas
import pandas as pd

# Import numpy
import numpy as np
np.random.seed(1234)

# Import math
import math

# Import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer
)

sw = stopwords.words('English')
stemmer = PorterStemmer()

import warnings
warnings.filterwarnings('ignore')

## Preparing Features

### Loading the Data

loading disaster tweets:

In [None]:
tweets_data = pd.read_csv('data/tweets_data.csv')

Let's see how many examples we have for each class:

In [None]:
tweets_data.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

***
Now, I'll define a function that will contain several choices regarding the NLP
Pipeline.
<br>
<br>
Ideally, its usually done by building several functions in this pipeline, probably as a class, but I had done this with a single big function that can be controled using parameters to make it easier.
***

### Building the Experiments

In [None]:
def train_model(
    data,
    stop_words,
    stem,
    remove_punct,
    vectorizer,
    min_df,
    ngram
):
    '''
    Builds pipeline, trains a model and evaluates 
    on training and test set.
    
    Each step is fired according to values coming from the
    parameters.
    '''
    
    # Tokenizing the Text
    tokenized_tweets = (
        data["text"].apply(word_tokenize)
    )
    
    # Check if Stop Words should be removed
    if stop_words:
        tokenized_tweets = (
            tokenized_tweets.apply(lambda x: [word.lower() for word in x if word not in sw])
        )
        
    # Check if we should stem words
    if stem:
        tokenized_tweets = (
            tokenized_tweets.apply(lambda x: [stemmer.stem(word) for word in x])
        )

    tokenized_tweets_sentence = (
        tokenized_tweets.apply(lambda x: ' '.join(x))
    )
    
    # Check if we should remove punctuation
    if remove_punct:

        tokenized_tweets_sentence = tokenized_tweets_sentence.apply(lambda x: 
            x.translate(
                str.maketrans('', '', string.punctuation)
            )
        )

    # Checks what vectorizer we should apply
    if vectorizer == 'count':
        cv = CountVectorizer(min_df = min_df, ngram_range = ngram)
        text_data = cv.fit_transform(tokenized_tweets_sentence).todense()
        
    if vectorizer == 'tfidf':
        cv = TfidfVectorizer(min_df = min_df, ngram_range = ngram)
        text_data = cv.fit_transform(tokenized_tweets_sentence).todense()  
        
    if vectorizer == 'binary':
        cv = CountVectorizer(binary=True, min_df = min_df, ngram_range = ngram)
        text_data = cv.fit_transform(tokenized_tweets_sentence).todense()

    # Builds features
    X = pd.DataFrame(
        text_data,
        columns=cv.get_feature_names()
    )
    # Builds target
    y = data.target

    # Divide into train and test
    X_train, X_test, y_train, y_test = (
        train_test_split(X, y, test_size = 0.2, random_state=1234)
    )
    # Build Model
    lm = LogisticRegression(random_state=1234)
    lm.fit(X_train, y_train)
    y_train_pred = lm.predict(X_train)
    y_test_pred = lm.predict(X_test)

    # Check the accuracy score for train and test
    train_score = accuracy_score(y_train, y_train_pred)
    test_score = accuracy_score(y_test, y_test_pred)
    
    return train_score, test_score

In [None]:
# Build experiments with different parameters
experiments = {
    '1': {'stop_words': True, 'stem': True, 'remove_punct': True, 'vectorizer': 'count', 'min_df': 0.02, 'ngram' : (1,1)},
    '2': {'stop_words': True, 'stem': True, 'remove_punct': False, 'vectorizer': 'count', 'min_df': 0.02, 'ngram' : (1,1)},
    '3': {'stop_words': True, 'stem': True, 'remove_punct': False, 'vectorizer': 'count', 'min_df': 0.005, 'ngram' : (1,1)},
    '4': {'stop_words': True, 'stem': True, 'remove_punct': False, 'vectorizer': 'count', 'min_df': 2, 'ngram' : (1,1)},
    '5': {'stop_words': True, 'stem': True, 'remove_punct': False, 'vectorizer': 'tfidf', 'min_df': 0.005, 'ngram' : (1,1)},
    '6': {'stop_words': True, 'stem': True, 'remove_punct': False, 'vectorizer': 'tfidf', 'min_df': 0, 'ngram' : (1,1)},
    '7': {'stop_words': True, 'stem': False, 'remove_punct': False, 'vectorizer': 'tfidf', 'min_df': 0, 'ngram' : (1,1)},
    '8': {'stop_words': False, 'stem': False, 'remove_punct': False, 'vectorizer': 'tfidf', 'min_df': 0, 'ngram' : (1,1)},
    '9': {'stop_words': False, 'stem': True, 'remove_punct': False, 'vectorizer': 'tfidf', 'min_df': 0, 'ngram' : (1,1)},
    '10': {'stop_words': False, 'stem': True, 'remove_punct': False, 'vectorizer': 'binary', 'min_df': 0, 'ngram' : (1,1)}
}

In [None]:
# Run experiments for multiple parameters
for exp in experiments.items():
    parameters = exp[1]
    print('for experiment '+exp[0]+' the results are '+str(train_model(
        tweets_data,
        parameters['stop_words'],
        parameters['stem'],
        parameters['remove_punct'],
        parameters['vectorizer'],
        parameters['min_df'],
        parameters['ngram']
    )))
    

for experiment 1 the results are (0.6336617405582923, 0.6250820748522653)
for experiment 2 the results are (0.6366174055829228, 0.6172028890347997)
for experiment 3 the results are (0.8060755336617406, 0.7846355876559422)
for experiment 4 the results are (0.9313628899835796, 0.8030203545633617)
for experiment 5 the results are (0.7986863711001642, 0.7846355876559422)
for experiment 6 the results are (0.8863711001642036, 0.8082731451083388)
for experiment 7 the results are (0.8893267651888341, 0.8030203545633617)
for experiment 8 the results are (0.8906403940886699, 0.7984241628365069)
for experiment 9 the results are (0.8855500821018062, 0.8082731451083388)
for experiment 10 the results are (0.9650246305418719, 0.8056467498358503)


The experiment with best score on the training set is **experiment number 10.**
<br>
The experiment with best balance between training and test accuracy is **experiment number 6.**