# model selection

## import libs

In [1]:
import re
import string
from abc import ABC

from nltk.corpus import stopwords

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

The goal here is to try several models on this data set, train them, and validate the results against multiple metrics. We also want to experiment with different model inputs to see how they affect the results, and to tune the models' hyperparameters. Because the models will be trained and evaluated many times with different configurations, it is worth preparing some reusable code up front so that nothing has to be rewritten for each experiment.

## train, validation and test data sets split

In [2]:
# Load the full training CSV; the train/validation/test frames are all split from it.
df = pd.read_csv('./data/train.csv')

In [3]:
from sklearn.model_selection import train_test_split

# Fixed seed shared by both splits so they are reproducible across runs.
random_state_seed = 22
# First hold out 20% of the rows as the test set ...
df_train_validation, df_test = train_test_split(df, test_size=0.2, random_state=random_state_seed)
# ... then take 25% of the remaining 80% for validation, which gives roughly a
# 60/20/20 train/validation/test split of the original frame.
df_train, df_validation = train_test_split(df_train_validation, test_size=0.25, random_state=random_state_seed)

### converting data frame to desired input of the model

In [4]:
class InputProvider(object):
    """Interface for objects that turn a data frame into model inputs.

    Subclasses override both methods and return an ``(x, y)`` pair: the
    model inputs and the matching targets. The base implementations raise
    instead of silently returning ``None``, so a forgotten override fails
    with a clear message rather than with an unpacking error at the caller.
    """

    def get_train_inputs(self, df: pd.DataFrame) -> tuple:
        """Build the ``(x, y)`` pair used to fit a model.

        :param df: frame holding the training rows.
        :return: tuple ``(x, y)``.
        :raises NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            f'{type(self).__name__} must implement get_train_inputs()')

    def get_test_inputs(self, df: pd.DataFrame) -> tuple:
        """Build the ``(x, y)`` pair used to evaluate a fitted model.

        :param df: frame holding the validation/test rows.
        :return: tuple ``(x, y)``.
        :raises NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            f'{type(self).__name__} must implement get_test_inputs()')

In [5]:
def _english_stopwords() -> frozenset:
    """Return NLTK's English stop words as a set, loading them only once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        # Evaluate ``stopwords.words`` a single time (instead of once per
        # word) and keep a frozenset so membership tests are O(1).
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def get_clean_text(mess: str) -> str:
    """Normalise a text: drop punctuation, lowercase, remove stop words.

    :param mess: raw text.
    :return: the remaining words joined by single spaces (may be empty).

    Punctuation is removed *before* the stop-word check, so a stop-word
    entry that itself contains punctuation can never match a word here.
    """
    # One pass that deletes every character listed in string.punctuation.
    without_punctuation = mess.translate(str.maketrans('', '', string.punctuation))
    words = without_punctuation.lower().strip().split()

    stop_words = _english_stopwords()
    return ' '.join(word for word in words if word not in stop_words)

In [26]:
def add_new_features_from_text(df_original: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df_original`` with hand-crafted text features added.

    The input frame must have ``text`` and ``location`` columns and is not
    modified. In the returned copy ``location`` is replaced by the boolean
    ``has_location``; the other ``has_*`` columns are 0/1 integers.

    :param df_original: frame with at least ``text`` and ``location``.
    :return: new frame with the feature columns appended.
    """
    df = df_original.copy()
    # NOTE(review): despite the name this is the character length of the
    # text, not a word count. Left unchanged so model inputs stay the same.
    df['words_count'] = df.text.apply(len)

    df['has_location'] = df['location'].notnull()
    del df['location']
    # Raw strings keep the regex escapes intact; the non-raw spellings
    # ('\?', '\!', '\#', '\(') are invalid string escape sequences that
    # newer Python versions warn about. The pattern values are unchanged.
    df['has_question_mark'] = df['text'].str.contains(r'\?').astype(int)
    df['has_exclamation_mark'] = df['text'].str.contains(r'\!').astype(int)
    df['has_hashtag'] = df['text'].str.contains(r'\#').astype(int)
    # NOTE(review): ``isupper`` is true only when *every* cased character of
    # the whole text is uppercase, which is stricter than the column name
    # suggests. Left unchanged on purpose.
    df['has_capital_words'] = df['text'].apply(lambda x: str(x).isupper()).astype(int)
    df['has_link'] = df['text'].str.contains(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+').astype(int)

    return df


def add_clean_text_features(df_original: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the frame extended with cleaned-text columns.

    ``clean_text`` is ``get_clean_text`` applied to every ``text`` value and
    ``clean_words_count`` is the length (in characters) of that cleaned
    string. The input frame itself is left untouched.
    """
    enriched = df_original.copy()
    cleaned = enriched['text'].apply(get_clean_text)
    enriched['clean_text'] = cleaned
    enriched['clean_words_count'] = cleaned.apply(len)
    return enriched

In [27]:
class InputProvider1(InputProvider):
    """Provider that passes every column of the feature-augmented frame to the model."""

    @staticmethod
    def _get_input_base(df: pd.DataFrame) -> tuple:
        """Split the feature-augmented frame into ``(x, y)``.

        ``y`` is the ``target`` column; ``x`` is the ``.values`` array of all
        remaining columns after ``add_new_features_from_text`` has run.
        """
        features = add_new_features_from_text(df)
        labels = features.target
        inputs = features.drop(columns='target').values
        return inputs, labels

    def get_train_inputs(self, df: pd.DataFrame) -> tuple:
        """Return ``(x, y)`` for fitting; no state is learned from the data."""
        return self._get_input_base(df)

    def get_test_inputs(self, df: pd.DataFrame) -> tuple:
        """Return ``(x, y)`` for evaluation, built exactly like the train inputs."""
        return self._get_input_base(df)

In [19]:
from sklearn.feature_extraction.text import CountVectorizer


class InputProvider2(InputProvider):
    """Provider whose inputs are a bag-of-words encoding of the cleaned text."""

    def __init__(self):
        # Fitted in get_train_inputs and reused, without refitting, in
        # get_test_inputs so both share one vocabulary.
        self.vect = CountVectorizer()

    @staticmethod
    def _prepare(df: pd.DataFrame) -> pd.DataFrame:
        """Run the shared feature pipeline that produces ``clean_text``."""
        # NOTE(review): the hand-crafted columns added here are computed but
        # not yet part of ``x``; only ``clean_text`` is vectorised.
        return add_clean_text_features(add_new_features_from_text(df))

    def get_train_inputs(self, df: pd.DataFrame) -> tuple:
        """Fit the vectoriser on the training text and return ``(x, y)``.

        :param df: training frame with ``text``, ``location`` and ``target``.
        :return: dense count matrix and the ``target`` column.
        """
        df = self._prepare(df)
        representation = self.vect.fit_transform(df['clean_text']).toarray()
        return representation, df['target']

    def get_test_inputs(self, df: pd.DataFrame) -> tuple:
        """Encode evaluation text with the already-fitted vectoriser.

        Must be called after ``get_train_inputs``; the vocabulary is not
        refitted here so no information leaks from the evaluation rows.

        :param df: evaluation frame with ``text``, ``location`` and ``target``.
        :return: dense count matrix and the ``target`` column.
        """
        df = self._prepare(df)
        representation = self.vect.transform(df['clean_text']).toarray()
        return representation, df['target']

In [30]:
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, roc_curve, confusion_matrix, \
    ConfusionMatrixDisplay


class MetricsCalculator(object):
    """Binary-classification metrics from true labels and predicted scores.

    :param y_actual: true labels.
    :param y_probabilities: predicted scores. A 1-D sequence is used as is;
        a 2-D array with one or two columns is reduced to its last column
        (assumed to be the positive-class probability, as returned by
        scikit-learn's ``predict_proba`` for 0/1 labels - confirm for other
        models).
    :param threshold: scores greater than or equal to this value count as
        class 1 for the label-based metrics.

    Attributes: ``y_true`` (labels), ``y_pred`` (1-D scores, used for the
    ROC/AUC), ``y_label`` (thresholded 0/1 predictions, used for accuracy,
    recall, precision and the confusion matrix).
    """

    def __init__(self, y_actual, y_probabilities, threshold: float = 0.5):
        self.y_true = y_actual
        scores = np.asarray(y_probabilities)
        if scores.ndim == 2 and scores.shape[1] in (1, 2):
            scores = scores[:, -1]
        self.y_pred = scores
        self.threshold = threshold
        # Label-based metrics cannot be computed from continuous scores,
        # so derive hard predictions once.
        self.y_label = (scores >= threshold).astype(int)

    def get_auc(self):
        """Return the area under the ROC curve of the scores."""
        return roc_auc_score(self.y_true, self.y_pred)

    def get_accuracy(self):
        """Return the accuracy of the thresholded predictions."""
        return accuracy_score(self.y_true, self.y_label)

    def get_recall(self):
        """Return the recall of the thresholded predictions."""
        return recall_score(self.y_true, self.y_label)

    def get_precision(self):
        """Return the precision of the thresholded predictions."""
        return precision_score(self.y_true, self.y_label)

    def get_confusion_matrix(self):
        """Return the confusion matrix of the thresholded predictions."""
        return confusion_matrix(self.y_true, self.y_label)

    def show_confusion_matrix(self):
        """Plot the confusion matrix."""
        ConfusionMatrixDisplay(self.get_confusion_matrix()).plot()
        plt.show()

    def show_roc_curve(self):
        """Plot the ROC curve together with the chance diagonal."""
        fpr, tpr, _ = roc_curve(self.y_true, self.y_pred)
        plt.figure(figsize=(5, 5))
        plt.plot(fpr, tpr)
        plt.plot([0, 1], [0, 1])
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.show()

array([['crashed', 119, 1, ..., 1, 0, 0],
       ['crash', 109, 1, ..., 0, 0, 1],
       ['rescuers', 140, 1, ..., 0, 0, 1],
       ...,
       ['war%20zone', 31, 1, ..., 0, 0, 0],
       ['refugees', 101, 1, ..., 1, 0, 1],
       ['ambulance', 64, 0, ..., 0, 0, 0]], dtype=object)

In [31]:
import abc


class ModelInterface(abc.ABC):
    """Minimal interface a model must offer to be used by the workflow.

    Deriving from ``abc.ABC`` is what makes the ``abstractmethod`` markers
    effective: a subclass that does not override ``fit`` and ``predict``
    cannot be instantiated.
    """

    @abc.abstractmethod
    def fit(self, x, y):
        """Train the model on inputs ``x`` and targets ``y``."""

    @abc.abstractmethod
    def predict(self, x):
        """Return the predictions for inputs ``x``."""

    def predict_proba(self, x):
        """Return the predicted probabilities for inputs ``x``.

        Optional: models that cannot produce probabilities may leave this
        unimplemented.

        :raises NotImplementedError: unless overridden by a subclass.
        """
        raise NotImplementedError(
            f'{type(self).__name__} does not implement predict_proba()')

array([['crashed', 119, 1, ..., 1, 0, 0],
       ['crash', 109, 1, ..., 0, 0, 1],
       ['rescuers', 140, 1, ..., 0, 0, 1],
       ...,
       ['war%20zone', 31, 1, ..., 0, 0, 0],
       ['refugees', 101, 1, ..., 1, 0, 1],
       ['ambulance', 64, 0, ..., 0, 0, 0]], dtype=object)

In [11]:
class TrainValidationWorkflow(object):
    """Train a model on one frame and score it on a validation frame.

    :param model: object offering ``fit`` and ``predict_proba``.
    :param input_provider: converts a frame into the ``(x, y)`` pair.
    :param df_train: rows used for fitting.
    :param df_validation: rows used for scoring; only required by
        ``get_metrics_calculator``.

    Every call to ``get_model`` or ``get_metrics_calculator`` refits the
    model on ``df_train``.
    """

    def __init__(
            self,
            model: ModelInterface,
            input_provider: InputProvider,
            df_train: pd.DataFrame,
            df_validation: pd.DataFrame = None
    ):
        self.model = model
        self.input_provider = input_provider
        self.df_train = df_train
        self.df_validation = df_validation

    def get_model(self):
        """Fit the model on the training frame and return it."""
        self._train_flow()
        return self.model

    def get_metrics_calculator(self) -> MetricsCalculator:
        """Fit the model and wrap its validation scores in a MetricsCalculator.

        :raises ValueError: if no validation frame was supplied.
        """
        # Check before training so a missing frame fails immediately, and
        # raise explicitly because ``assert`` is stripped under ``python -O``.
        if self.df_validation is None:
            raise ValueError('you should provide the validation dataframe')
        self._train_flow()
        x_validation, y_validation = self.input_provider.get_test_inputs(self.df_validation)
        probabilities = np.asarray(self.model.predict_proba(x_validation))
        # A two-column result is one column per class; keep the second one
        # (assumed to be the positive class, as with scikit-learn 0/1 labels)
        # so the scores are one value per row.
        if probabilities.ndim == 2 and probabilities.shape[1] == 2:
            probabilities = probabilities[:, 1]
        return MetricsCalculator(y_validation, probabilities)

    def _train_flow(self):
        """Build the training inputs and fit the model on them."""
        x_train, y_train = self.input_provider.get_train_inputs(self.df_train)
        self.model.fit(x_train, y_train)

## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the library's default hyperparameters.
lrmodel = LogisticRegression()
# NOTE(review): input_provider=None is a placeholder. The workflow calls
# input_provider.get_train_inputs(...) when it trains, so an InputProvider
# instance must be supplied before get_model()/get_metrics_calculator() can run.
lrflow = TrainValidationWorkflow(model=lrmodel, input_provider=None, df_train=df_train, df_validation=df_validation)