# Feature Engineering

## import libs

In [1]:
import re
import string
from nltk.corpus import stopwords

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

## prepare data

In [4]:
# Load the training set, using the `id` column as the row index.
df = pd.read_csv('data/train.csv', index_col='id')
# NOTE: despite its name, `words_count` holds the length of each text in
# characters (len() of the string), not the number of words.
df['words_count'] = df['text'].map(len)

In [3]:
def text_process(mess: str):
    """Return a cleaned, lower-cased version of ``mess`` without stop words.

    Steps:
      1. remove every character listed in ``string.punctuation``;
      2. lower-case the text and strip leading/trailing whitespace;
      3. split on whitespace and drop NLTK's English stop words;
      4. re-join the remaining words with single spaces.

    :param mess: raw text to clean.
    :return: the cleaned text as a single space-separated string.
    """
    # Build the stop-word lookup once per call instead of once per word; a set
    # also gives constant-time membership tests.
    stop_words = set(stopwords.words('english'))

    # Delete all ASCII punctuation characters in a single pass.
    nopunc = mess.translate(str.maketrans('', '', string.punctuation))
    nopunc = nopunc.lower().strip()

    # The text is already lower-cased, so words can be compared directly.
    return ' '.join(word for word in nopunc.split() if word not in stop_words)


# Clean every tweet, then record the length (in characters) of the cleaned text.
df['clean_text'] = df['text'].map(text_process)
df['clean_words_count'] = df['clean_text'].map(len)

## add some features

Because the `location` column has a lot of missing values (49%), we use a `has_location` flag instead and then delete the original column.

In [4]:
# Replace the raw `location` column by a boolean "was a location given?" flag:
# pop() removes the column from the frame and hands it back in one step.
df['has_location'] = df.pop('location').notnull()

Additionally, the text contains other symbols (question marks, exclamation marks, hashtags, links) that can be used as a means to predict whether a tweet is about a disaster or not.

In [5]:
# 0/1 indicator columns derived from the raw tweet text.
# Single characters are matched literally (regex=False), so no escaping is needed.
df['has_question_mark'] = df['text'].str.contains('?', regex=False).astype(int)
df['has_exclamation_mark'] = df['text'].str.contains('!', regex=False).astype(int)
df['has_hashtag'] = df['text'].str.contains('#', regex=False).astype(int)
# NOTE: str.isupper() is True only when *all* cased characters of the whole text
# are upper-case, so this flags fully upper-case tweets rather than tweets that
# merely contain a capitalised word.
df['has_capital_words'] = df['text'].apply(lambda x: str(x).isupper()).astype(int)
# Raw string: keeps the backslashes of the pattern without relying on invalid
# escape sequences such as '\(' in a normal string literal.
df['has_link'] = df['text'].str.contains(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+').astype(int)

In [6]:
# Preview the first rows with the newly added feature columns.
df.head()

Unnamed: 0_level_0,keyword,text,target,words_count,clean_text,clean_words_count,has_location,has_question_mark,has_exclamation_mark,has_hashtag,has_capital_words,has_link
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,,Our Deeds are the Reason of this #earthquake M...,1,69,deeds reason earthquake may allah forgive us,44,False,0,0,1,0,0
4,,Forest fire near La Ronge Sask. Canada,1,38,forest fire near la ronge sask canada,37,False,0,0,0,0,0
5,,All residents asked to 'shelter in place' are ...,1,133,residents asked shelter place notified officer...,88,False,0,0,0,0,0
6,,"13,000 people receive #wildfires evacuation or...",1,65,13000 people receive wildfires evacuation orde...,59,False,0,0,1,0,0
7,,Just got sent this photo from Ruby #Alaska as ...,1,88,got sent photo ruby alaska smoke wildfires pou...,55,False,0,0,1,0,0


In order to work with text data, we must convert it into numerical features that machine learning models can understand. Just as `DictVectorizer` from the `sklearn` package converts categorical (enum) columns into numerical features, `CountVectorizer` can be used to convert text into numerical features. Each word gets its own column/feature, and the value in a row is the number of times that word occurs in the row's text (0 if the word is absent).

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary of the cleaned tweets and encode them in a single pass.
vect = CountVectorizer()
# .toarray() turns the sparse result into a dense numpy array so that single
# rows can be inspected directly.
representation = vect.fit_transform(df['clean_text']).toarray()

In [37]:
print(df.iloc[0]['text'])
firstRowText = representation[0]
# CountVectorizer stores occurrence counts, so a token that appears more than
# once has a value > 1; select every non-zero entry rather than only the 1s.
tokenIndices = [i for i, count in enumerate(firstRowText) if count > 0]
vect.get_feature_names_out()[tokenIndices]

Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all


array(['allah', 'deeds', 'earthquake', 'forgive', 'may', 'reason', 'us'],
      dtype=object)

As we can see, the first row's text was "Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all".
After lowercasing and removing punctuation and stop words, the remaining tokens are "allah", "deeds", "earthquake", "forgive", "may", "reason" and "us".

## train, validation and test data sets split

In [2]:
# Reload the raw training data for the modelling part. No `index_col` is given
# here, so `id` stays a regular column; the input providers below drop it
# explicitly.
df = pd.read_csv('./data/train.csv')

In [3]:
from sklearn.model_selection import train_test_split

# Fixed seed so that both splits are reproducible.
random_state_seed = 22
# Step 1: hold out 20% of all rows as the test set.
df_train_validation, df_test = train_test_split(df, test_size=0.2, random_state=random_state_seed)
# Step 2: take 25% of the remaining 80% (= 20% of all rows) for validation,
# which leaves 60% of the rows for training (60/20/20 split).
df_train, df_validation = train_test_split(df_train_validation, test_size=0.25, random_state=random_state_seed)

### converting data frame to desired input of the model

In [4]:
class InputProvider(object):
    """Base interface for objects that turn a data frame into model inputs.

    Subclasses override both methods and return an ``(x, y)`` tuple: the
    feature matrix and the target values. The base implementations raise
    ``NotImplementedError`` so that a missing override fails loudly instead of
    silently returning ``None``.
    """

    def get_train_inputs(self, df: pd.DataFrame) -> tuple:
        """Build ``(x, y)`` from training data; may fit internal encoders.

        :param df: training rows.
        :return: tuple ``(x, y)``.
        :raises NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            f'{type(self).__name__} must implement get_train_inputs()')

    def get_test_inputs(self, df: pd.DataFrame) -> tuple:
        """Build ``(x, y)`` from validation/test data using already fitted encoders.

        :param df: validation or test rows.
        :return: tuple ``(x, y)``.
        :raises NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            f'{type(self).__name__} must implement get_test_inputs()')

In [5]:
def get_clean_text(mess: str):
    """Return a cleaned, lower-cased version of ``mess`` without stop words.

    Steps:
      1. remove every character listed in ``string.punctuation``;
      2. lower-case the text and strip leading/trailing whitespace;
      3. split on whitespace and drop NLTK's English stop words;
      4. re-join the remaining words with single spaces.

    :param mess: raw text to clean.
    :return: the cleaned text as a single space-separated string.
    """
    # Build the stop-word lookup once per call instead of once per word; a set
    # also gives constant-time membership tests.
    stop_words = set(stopwords.words('english'))

    # Delete all ASCII punctuation characters in a single pass.
    nopunc = mess.translate(str.maketrans('', '', string.punctuation))
    nopunc = nopunc.lower().strip()

    # The text is already lower-cased, so words can be compared directly.
    return ' '.join(word for word in nopunc.split() if word not in stop_words)

In [6]:
def add_new_features_from_text(df_original: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df_original`` with hand-crafted features added.

    The input frame is left untouched. It must contain a ``text`` and a
    ``location`` column. In the returned copy ``location`` is removed and the
    following columns are appended, in this order:

    - ``words_count``: length of ``text`` in characters (``len`` of the string),
    - ``has_location``: 1 if ``location`` was not null, else 0,
    - ``has_question_mark`` / ``has_exclamation_mark`` / ``has_hashtag``:
      1 if the text contains ``?`` / ``!`` / ``#``, else 0,
    - ``has_capital_words``: 1 if ``str.isupper()`` holds for the whole text,
    - ``has_link``: 1 if the text contains an http(s) URL, else 0.

    :param df_original: frame with at least ``text`` and ``location`` columns.
    :return: new frame with the feature columns added and ``location`` dropped.
    """
    df = df_original.copy()
    text = df['text']
    df['words_count'] = text.apply(len)

    df['has_location'] = df['location'].notnull().astype(int)
    del df['location']
    # Single characters are matched literally (regex=False), which avoids the
    # invalid escape sequences '\?', '\!' and '\#' in normal string literals.
    df['has_question_mark'] = text.str.contains('?', regex=False).astype(int)
    df['has_exclamation_mark'] = text.str.contains('!', regex=False).astype(int)
    df['has_hashtag'] = text.str.contains('#', regex=False).astype(int)
    # NOTE: str.isupper() is True only when *all* cased characters of the text
    # are upper-case, i.e. this flags fully upper-case tweets.
    df['has_capital_words'] = text.apply(lambda x: str(x).isupper()).astype(int)
    # Raw string so the regex backslashes are kept verbatim.
    df['has_link'] = text.str.contains(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+').astype(int)

    return df


def add_clean_text_features(df_original: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df_original`` extended with cleaned-text columns.

    Adds ``clean_text`` (the result of ``get_clean_text`` applied to ``text``)
    and ``clean_words_count`` (the length of ``clean_text`` in characters).
    The input frame itself is not modified.

    :param df_original: frame with a ``text`` column.
    :return: new frame containing the two additional columns.
    """
    cleaned = df_original['text'].apply(get_clean_text)
    # assign() builds a new frame, so the caller's frame stays untouched.
    return df_original.assign(
        clean_text=cleaned,
        clean_words_count=cleaned.apply(len),
    )

InputProvider1 feature list:
- words_count
- has_location
- has_question_mark
- has_exclamation_mark
- has_hashtag
- has_capital_words
- has_link

dimension input matrix: n_rows * 7

In [7]:
class InputProvider1(InputProvider):
    """Provider that uses only the hand-crafted features from the tweet text.

    Nothing is fitted, so training and test inputs are built the same way.
    """

    @staticmethod
    def _get_input_base(df: pd.DataFrame) -> tuple:
        """Return ``(x, y)``: engineered feature values and the ``target`` column."""
        enriched = add_new_features_from_text(df)
        labels = enriched.target
        # Everything that is neither an identifier, raw text nor the label is a feature.
        feature_frame = enriched.drop(columns=['id', 'text', 'keyword', 'target'])
        return feature_frame.values, labels

    def get_train_inputs(self, df: pd.DataFrame) -> tuple:
        """Build ``(x, y)`` for the training rows."""
        return self._get_input_base(df)

    def get_test_inputs(self, df: pd.DataFrame) -> tuple:
        """Build ``(x, y)`` for validation/test rows."""
        return self._get_input_base(df)

In [8]:
np.shape(InputProvider1().get_train_inputs(df_train)[0])

(4567, 7)

InputProvider2 feature list:
- words_count
- has_location
- has_question_mark
- has_exclamation_mark
- has_hashtag
- has_capital_words
- has_link
- keywords **(one hot encoding)** 

dimension input matrix: n_rows * 229

In [9]:
from sklearn.feature_extraction import DictVectorizer


class InputProvider2(InputProvider):
    """Provider using the hand-crafted features plus the one-hot encoded keyword.

    All remaining columns are passed to a ``DictVectorizer`` as records: string
    values (the keyword) are one-hot encoded, numeric values are kept as they are.
    """

    # Category used for rows without a keyword. A missing keyword is a float
    # NaN, which DictVectorizer would treat as a *numeric* feature and copy
    # into the matrix as NaN; replacing it by a string makes it a regular
    # one-hot category instead.
    MISSING_KEYWORD = '__missing__'

    def __init__(self):
        self.vect = DictVectorizer()

    def _to_records(self, df: pd.DataFrame) -> tuple:
        """Return ``(records, y)``: one feature dict per row and the target column."""
        enriched = add_new_features_from_text(df)
        y = enriched.target
        features = (
            enriched
            .drop(columns=['id', 'text', 'target'])
            .assign(keyword=lambda frame: frame['keyword'].fillna(self.MISSING_KEYWORD))
        )
        return features.to_dict(orient='records'), y

    def get_train_inputs(self, df: pd.DataFrame) -> tuple:
        """Fit the vectorizer on ``df`` and return its encoded ``(x, y)``."""
        records, y = self._to_records(df)
        x = self.vect.fit_transform(records)
        return x, y

    def get_test_inputs(self, df: pd.DataFrame) -> tuple:
        """Encode ``df`` with the already fitted vectorizer and return ``(x, y)``."""
        records, y = self._to_records(df)
        x = self.vect.transform(records)
        return x, y

In [10]:
np.shape(InputProvider2().get_train_inputs(df_train)[0])

(4567, 229)

InputProvider3 feature list:
- clean text tokens

dimension input matrix: n_rows * 15699

In [11]:
from sklearn.feature_extraction.text import CountVectorizer


class InputProvider3(InputProvider):
    """Provider whose only features are the token counts of the cleaned text."""

    def __init__(self):
        self.vect = CountVectorizer()

    def get_train_inputs(self, df: pd.DataFrame) -> tuple:
        """Learn the vocabulary from ``df`` and return ``(token_counts, target)``."""
        prepared = add_clean_text_features(df)
        token_counts = self.vect.fit_transform(prepared['clean_text'])
        return token_counts, prepared.target

    def get_test_inputs(self, df: pd.DataFrame) -> tuple:
        """Encode ``df`` with the learned vocabulary and return ``(token_counts, target)``."""
        prepared = add_clean_text_features(df)
        token_counts = self.vect.transform(prepared['clean_text'])
        return token_counts, prepared.target

In [12]:
np.shape(InputProvider3().get_train_inputs(df_train)[0])

(4567, 15699)

InputProvider4 feature list:
- words_count
- clean_words_count
- has_location
- has_question_mark
- has_exclamation_mark
- has_hashtag
- has_capital_words
- has_link
- clean text tokens 

dimension input matrix: n_rows * 15707

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix, hstack


class InputProvider4(InputProvider):
    """Provider combining token counts of the cleaned text with the engineered columns.

    The token matrix comes first, followed by every column that remains after
    dropping the raw text, the cleaned text, the keyword, the id and the target.
    """

    def __init__(self):
        self.vect = CountVectorizer()

    @staticmethod
    def _prepare(df: pd.DataFrame) -> tuple:
        """Return ``(clean_text, engineered_features_as_csr, target)`` for ``df``."""
        enriched = add_new_features_from_text(add_clean_text_features(df))
        engineered = enriched.drop(columns=['text', 'clean_text', 'keyword', 'id', 'target'])
        return enriched['clean_text'], csr_matrix(engineered.values), enriched.target

    def get_train_inputs(self, df: pd.DataFrame) -> tuple:
        """Learn the vocabulary from ``df`` and return the stacked ``(x, y)``."""
        clean_text, engineered, y = self._prepare(df)
        token_counts = self.vect.fit_transform(clean_text)
        return hstack([token_counts, engineered]), y

    def get_test_inputs(self, df: pd.DataFrame) -> tuple:
        """Encode ``df`` with the learned vocabulary and return the stacked ``(x, y)``."""
        clean_text, engineered, y = self._prepare(df)
        token_counts = self.vect.transform(clean_text)
        return hstack([token_counts, engineered]), y

In [33]:
np.shape(InputProvider4().get_train_inputs(df_train)[0])

(4567, 15707)

InputProvider5 feature list:
- words_count
- clean_words_count
- has_location
- has_question_mark
- has_exclamation_mark
- has_hashtag
- has_capital_words
- has_link
- clean text tokens 
- keywords **(one hot encoding)**

dimension input matrix: n_rows *

In [34]:
class InputProvider5(InputProvider):
    """Provider combining token counts, engineered columns and the one-hot keyword.

    Column blocks of the resulting sparse matrix, in order:
      1. token counts of ``clean_text`` (``CountVectorizer``),
      2. the engineered columns left after dropping text/keyword/id/target,
      3. the one-hot encoded ``keyword`` (``DictVectorizer``).
    """

    # Category used for rows without a keyword, so that a missing value becomes
    # a regular one-hot column instead of a NaN-valued numeric feature.
    MISSING_KEYWORD = '__missing__'

    def __init__(self):
        self.count_vect = CountVectorizer()
        self.dict_vect = DictVectorizer()

    def _prepare(self, df: pd.DataFrame) -> tuple:
        """Return ``(clean_text, engineered_csr, keyword_records, target)`` for ``df``."""
        enriched = add_new_features_from_text(add_clean_text_features(df))
        engineered = csr_matrix(
            enriched.drop(columns=['text', 'clean_text', 'keyword', 'id', 'target']).values)
        # One single-key dict per row; string values are one-hot encoded by DictVectorizer.
        keyword_records = [
            {'keyword': keyword}
            for keyword in enriched['keyword'].fillna(self.MISSING_KEYWORD)
        ]
        return enriched['clean_text'], engineered, keyword_records, enriched.target

    def get_train_inputs(self, df: pd.DataFrame) -> tuple:
        """Fit both vectorizers on ``df`` and return the stacked ``(x, y)``."""
        clean_text, engineered, keyword_records, y = self._prepare(df)
        tokens = self.count_vect.fit_transform(clean_text)
        keywords = self.dict_vect.fit_transform(keyword_records)
        x = hstack([tokens, engineered, keywords])
        return x, y

    def get_test_inputs(self, df: pd.DataFrame) -> tuple:
        """Encode ``df`` with the already fitted vectorizers and return the stacked ``(x, y)``."""
        clean_text, engineered, keyword_records, y = self._prepare(df)
        tokens = self.count_vect.transform(clean_text)
        keywords = self.dict_vect.transform(keyword_records)
        x = hstack([tokens, engineered, keywords])
        return x, y