In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## Sklearn modelling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

1. We will start by removing outliers. So far, we have discussed different methods for removing outliers. Choose the one you feel most comfortable with, define a function for it, and apply that function to the dataframe.
2. Create a copy of the dataframe for the data wrangling.
3. Normalize the continuous variables. You can use any one method you want.
4. Encode the categorical variables.
5. The time variable can be useful. Try to transform it into useful features. Hint: the day, week, and month as integers might be useful.
6. Since the model will only accept numerical data, check that every column is numerical; if some are not, convert them using encoding.

In [10]:
## Functions

In [11]:
## Load the raw customer-value CSV from the sibling lab folder
path = '../lab-cleaning-categorical-data/files_for_lab/'
file_name = 'we_fn_use_c_marketing_customer_value_analysis.csv'
data = pd.read_csv(f'{path}{file_name}')

## Normalise the column labels: lower-case, with spaces replaced by underscores
data = data.rename(columns=lambda label: label.lower().replace(' ', '_'))

In [12]:
## Target: the column the model will predict
y = data['total_claim_amount']

## Features: drop the target plus the unused columns, then parse the date column to datetime64
X = (
    data
    .drop(columns=['total_claim_amount', 'customer', 'state', 'sales_channel'])
    .assign(effective_to_date=lambda frame: pd.to_datetime(frame['effective_to_date']))
)

In [13]:
## Hold out 33% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [14]:
## Partition the feature columns of each split by dtype.

## Numeric columns
numerical_train = X_train.select_dtypes(include='number')
numerical_test = X_test.select_dtypes(include='number')

## Datetime columns
datetime_train = X_train.select_dtypes(include='datetime')
datetime_test = X_test.select_dtypes(include='datetime')

## Everything that is neither numeric nor datetime is treated as categorical
categorical_train = X_train.select_dtypes(exclude=['number', 'datetime'])
categorical_test = X_test.select_dtypes(exclude=['number', 'datetime'])

In [15]:
## Derive calendar features from the datetime column(s).
## The vectorised `.dt` accessor is applied per column instead of the element-wise
## `DataFrame.applymap`, which is deprecated since pandas 2.1 and calls a Python
## lambda once per cell. Column names and resulting dtypes are unchanged.

## Day of the month as an integer -> will be added to the numerical features
day_train = datetime_train.apply(lambda col: col.dt.day).astype(int)
day_test  = datetime_test.apply(lambda col: col.dt.day).astype(int)

## Month as a string -> will be added to the categorical features
month_train = datetime_train.apply(lambda col: col.dt.month).astype(str)
month_test  = datetime_test.apply(lambda col: col.dt.month).astype(str)

## The year is not extracted: per the original author's note it is the same for
## every row, so it would carry no information.

In [16]:
## Append the derived date features column-wise.

## Day joins the numerical block
numerical_train = pd.concat(objs=[numerical_train, day_train], axis='columns')
numerical_test = pd.concat(objs=[numerical_test, day_test], axis='columns')

## Month joins the categorical block
categorical_train = pd.concat(objs=[categorical_train, month_train], axis='columns')
categorical_test = pd.concat(objs=[categorical_test, month_test], axis='columns')

In [17]:
## Standardise the numerical features.
## The scaler is fitted on the training split only and then reused on the test split.
scaler = StandardScaler()
numerical_train = scaler.fit_transform(numerical_train)
numerical_test = scaler.transform(numerical_test)

In [18]:
## One-hot encode the categorical features, dropping the first level of each column.
## The encoder is fitted on the training split only; with handle_unknown='error' a
## category that appears only in the test split raises instead of being ignored.
encoder = OneHotEncoder(drop='first', handle_unknown='error')
categorical_train = encoder.fit_transform(categorical_train)
categorical_test = encoder.transform(categorical_test)

In [19]:
## Assemble the final feature matrices: one-hot categorical columns followed by the
## scaled numerical columns.
## The encoder output is a sparse matrix, so it is densified before concatenation.
## `.toarray()` yields a plain ndarray, whereas `.todense()` yields a legacy `np.matrix`.
X_train = np.concatenate([categorical_train.toarray(), numerical_train], axis=1)

## Fix: build the test matrix from the *test* splits. It was previously a copy of the
## training data, so the exported X_test did not correspond to y_test.
X_test  = np.concatenate([categorical_test.toarray(), numerical_test], axis=1)

In [20]:
## Display the dimensions of the assembled training matrix
X_train.shape

(6119, 45)

In [23]:
## Persist the processed splits as CSV files in the working directory (row index omitted).

## Feature matrices are arrays, so they are wrapped in a DataFrame before writing
for csv_name, features in (('X_train.csv', X_train), ('X_test.csv', X_test)):
    pd.DataFrame(features).to_csv(csv_name, index=False)

## Targets are already pandas objects and can be written directly
for csv_name, target in (('y_train.csv', y_train), ('y_test.csv', y_test)):
    target.to_csv(csv_name, index=False)