# Feature Engineering

## import libs

In [7]:
import re
import string

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

## prepare data

In [8]:
# Load the training set, using the `id` column as the row index.
df = pd.read_csv(
    'data/train.csv',
    index_col='id',
)
# NOTE(review): len() of a string is its character count, so despite the
# column name this stores the number of characters per tweet, not words.
df['words_count'] = df['text'].map(len)

In [9]:
# Stop-word sets already loaded, keyed by language name, so the NLTK corpus
# is read at most once per language instead of once per word.
_STOPWORD_CACHE = {}


def _stopword_set(language: str = 'english') -> frozenset:
    """Return the NLTK stop words for ``language`` as a cached frozenset."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def text_process(mess: str):
    """Normalise a piece of text for vectorisation.

    Steps, in order:
      1. remove every character listed in ``string.punctuation``;
      2. lower-case the result;
      3. split on whitespace and drop English stop words;
      4. re-join the remaining words with single spaces.

    Parameters
    ----------
    mess : str
        Raw text.

    Returns
    -------
    str
        The cleaned, lower-cased text; an empty string when nothing is left.

    Notes
    -----
    Punctuation is removed *before* the stop-word lookup.
    NOTE(review): stop words that contain an apostrophe presumably no longer
    match once the apostrophe has been stripped -- confirm whether that
    matters for this feature.
    """
    # One translate pass deletes all punctuation characters.
    without_punct = mess.translate(str.maketrans('', '', string.punctuation))

    # split() with no argument already ignores leading/trailing whitespace.
    tokens = without_punct.lower().split()

    # Set membership is O(1); the set itself is built once and reused.
    stop_words = _stopword_set()
    return ' '.join(token for token in tokens if token not in stop_words)


# Clean every tweet (punctuation and stop words removed, lower-cased).
df['clean_text'] = df['text'].map(text_process)
# NOTE(review): len() counts characters, so this is the cleaned text's
# character length rather than a word count.
df['clean_words_count'] = df['clean_text'].map(len)

## add some features

Because the `location` column has a lot of missing values (49%), we use a `has_location` flag instead and then delete the original column.

In [10]:
# Record whether a location was supplied, then discard the raw column.
df['has_location'] = df['location'].notna()
df = df.drop(columns=['location'])

Additionally, the text contains other symbols that can be used as a means of predicting whether a tweet is about a disaster or not.

In [11]:
# Single-character checks use literal matching (regex=False): no escaping is
# needed, and the invalid '\?', '\!' and '\#' string escapes are avoided.
df['has_question_mark'] = df['text'].str.contains('?', regex=False).astype(int)
df['has_exclamation_mark'] = df['text'].str.contains('!', regex=False).astype(int)
df['has_hashtag'] = df['text'].str.contains('#', regex=False).astype(int)
# NOTE(review): str.isupper() is True only when every cased character of the
# tweet is upper-case, so this flags fully upper-case tweets rather than
# tweets that merely contain a capitalised word -- confirm this is intended.
df['has_capital_words'] = df['text'].apply(lambda x: str(x).isupper()).astype(int)
# Raw string: the backslashes belong to the regex, not to Python escapes.
df['has_link'] = df['text'].str.contains(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+').astype(int)

In [12]:
# Preview the first five rows to sanity-check the engineered columns.
df.head(n=5)

Unnamed: 0_level_0,keyword,text,target,words_count,clean_text,clean_words_count,has_location,has_question_mark,has_exclamation_mark,has_hashtag,has_capital_words,has_link
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,,Our Deeds are the Reason of this #earthquake M...,1,69,deeds reason earthquake may allah forgive us,44,False,0,0,1,0,0
4,,Forest fire near La Ronge Sask. Canada,1,38,forest fire near la ronge sask canada,37,False,0,0,0,0,0
5,,All residents asked to 'shelter in place' are ...,1,133,residents asked shelter place notified officer...,88,False,0,0,0,0,0
6,,"13,000 people receive #wildfires evacuation or...",1,65,13000 people receive wildfires evacuation orde...,59,False,0,0,1,0,0
7,,Just got sent this photo from Ruby #Alaska as ...,1,88,got sent photo ruby alaska smoke wildfires pou...,55,False,0,0,1,0,0


In order to work with text data, we should convert it into numerical features so that it can be understood by machine learning models. Just as `DictVectorizer` from the `sklearn` package converts categorical (enum-like) columns into numerical features, `CountVectorizer` can be used to convert text data into numerical features. Each word gets its own column/feature, and the value in a row is the number of times that word occurs in that row's text (0 if the word is absent). To get a strict 1/0 presence indicator instead, pass `binary=True` to `CountVectorizer`.

In [15]:
# Learn the vocabulary and build the document-term matrix in a single pass;
# fit_transform avoids tokenising the whole corpus twice (fit, then transform).
vect = CountVectorizer()
clean_text_counts = vect.fit_transform(df['clean_text'])

# Keep the matrix in a variable for later cells and show its summary.
clean_text_counts

<7613x22310 sparse matrix of type '<class 'numpy.int64'>'
	with 73854 stored elements in Compressed Sparse Row format>