# Data Preparation

In [1]:
import sys
from copy import deepcopy
import warnings
import numpy as np
from numpy import inf, nan
import pandas as pd
import joblib
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Add the directory two levels up to the import path so the project package
# imported below can be found (presumably the repository root - verify).
sys.path.append("../../")

# Must come after the sys.path change above, otherwise the package is not importable.
from mobile_handset_price_model.prediction.transformers import BooleanTransformer

# NOTE(review): this suppresses every warning for the rest of the notebook,
# including deprecation and convergence warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

## Load Data

In [2]:
# Location of the training set, given as a relative path.
train_csv_path = "../../data/train.csv"

# Load the training set into a DataFrame.
data = pd.read_csv(train_csv_path)

# Preview the first rows as a quick sanity check of the load.
data.head()

Unnamed: 0,battery_power,has_bluetooth,clock_speed,has_dual_sim,front_camera_megapixels,has_four_g,internal_memory,depth,weight,number_of_cores,primary_camera_megapixels,pixel_resolution_height,pixel_resolution_width,ram,screen_height,screen_width,talk_time,has_three_g,has_touch_screen,has_wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0,1


In [3]:
# Inspect the dtype of every column to decide how each one should be preprocessed.
data.dtypes

battery_power                  int64
has_bluetooth                  int64
clock_speed                  float64
has_dual_sim                   int64
front_camera_megapixels        int64
has_four_g                     int64
internal_memory                int64
depth                        float64
weight                         int64
number_of_cores                int64
primary_camera_megapixels      int64
pixel_resolution_height        int64
pixel_resolution_width         int64
ram                            int64
screen_height                  int64
screen_width                   int64
talk_time                      int64
has_three_g                    int64
has_touch_screen               int64
has_wifi                       int64
price_range                    int64
dtype: object

## Select Numerical and Categorical Features

In [4]:
# Feature groups; each group is handled by its own preprocessor further down.
# No column is treated as categorical in this notebook, so this list is empty.
categorical_cols = []

numerical_columns = [
    "battery_power",
    "clock_speed",
    "front_camera_megapixels",
    "internal_memory",
    "depth",
    "weight",
    "number_of_cores",
    "primary_camera_megapixels",
    "pixel_resolution_height",
    "pixel_resolution_width",
    "ram",
    "screen_height",
    "screen_width",
    "talk_time"
]

boolean_columns = [
    "has_bluetooth",
    "has_dual_sim",
    "has_four_g",
    "has_three_g",
    "has_touch_screen",
    "has_wifi",
]

# To make sure that we didn't miss any columns, check that the feature groups
# cover every column except the target. Comparing names rather than counts
# also catches misspelled or duplicated entries, which a count-only check
# lets through.
target_column = "price_range"
feature_columns = categorical_cols + numerical_columns + boolean_columns

assert target_column in data.columns, f"target column {target_column!r} not found in data"
assert len(feature_columns) == len(set(feature_columns)), \
    "a column is listed more than once across the feature groups"
assert set(feature_columns) == set(data.columns) - {target_column}, \
    "feature groups do not match the non-target columns of the data set"

## Create Preprocessor for Numerical Features

In [5]:
# Preprocessing for numerical features: mean-impute missing values first,
# then pass the imputed values to the scaler.
numerical_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]

numerical_transformer = Pipeline(steps=numerical_steps)

In [6]:
# Testing the numerical transformer: fit on complete data, then transform
# data with missing values and compare against hand-computed results.
test_df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                       columns=["a", "b", "c"])

# Fit a deep copy so the original transformer object stays unfitted.
numerical_transformer_copy = deepcopy(numerical_transformer)

numerical_transformer_copy.fit(test_df)

# Each column has exactly one missing value, which the imputer must fill
# with the column mean learned during fit (4, 5 and 6 respectively).
test_df = pd.DataFrame([[1, None, 3], [None, 5, 6], [7, 8, None]],
                       columns=["a", "b", "c"])

result = numerical_transformer_copy.transform(test_df)

# Imputed cells equal the fitted mean, so they scale to 0.0; the remaining
# values are +/- 3 / sqrt(6) = +/- 1.22474487.
expected = np.array([[-1.22474487, 0.0, -1.22474487],
                     [0.0, 0.0, 0.0],
                     [1.22474487, 1.22474487, 0.0]])

# Raise when the result does NOT match. The expected values are rounded to
# 8 decimals, so the comparison must be tolerance-based rather than exact.
if not np.allclose(result, expected, equal_nan=True):
    raise ValueError("Unexpected values found in array.")

## Create Preprocessor for Boolean Features

We'll create a transformer that converts the 0/1 integer values in the boolean columns to boolean values.

In [7]:
# Configure the project's BooleanTransformer with 1 as the "true" marker and
# 0 as the "false" marker, matching the 0/1 integer encoding of the has_* columns.
boolean_transformer = BooleanTransformer(true_value=1, false_value=0)

In [8]:
# Testing the boolean transformer with integer-encoded (1/0) input.
test_df = pd.DataFrame([[1], [0], [1]],
                       columns=["has_bluetooth"])

# Fit a deep copy so the original transformer object stays unfitted.
boolean_transformer_copy = deepcopy(boolean_transformer)

boolean_transformer_copy.fit(test_df)

result = boolean_transformer_copy.transform(test_df)

expected = np.array([[True], [False], [True]])

# Raise if ANY element (or the shape) differs from the expectation; checking
# that all elements differ would let partially wrong output pass silently.
if not np.array_equal(np.asarray(result), expected):
    raise ValueError("Unexpected values found in array.")

In [9]:
# Testing the boolean transformer with input that already holds boolean values.
test_df = pd.DataFrame([[True], [False], [True]],
                       columns=["has_bluetooth"])

# Fit a deep copy so the original transformer object stays unfitted.
boolean_transformer_copy = deepcopy(boolean_transformer)

boolean_transformer_copy.fit(test_df)

result = boolean_transformer_copy.transform(test_df)

expected = np.array([[True], [False], [True]])

# Raise if ANY element (or the shape) differs from the expectation; checking
# that all elements differ would let partially wrong output pass silently.
if not np.array_equal(np.asarray(result), expected):
    raise ValueError("Unexpected values found in array.")

## Create ColumnTransformer

Combining all of the preprocessors into one ColumnTransformer that can be used to preprocess the data.

In [10]:
# Pair each preprocessor with the feature group it is responsible for.
feature_transformers = [
    ("numerical", numerical_transformer, numerical_columns),
    ("boolean", boolean_transformer, boolean_columns),
]

# Columns not named in any group are passed through unchanged.
column_transformer = ColumnTransformer(
    transformers=feature_transformers,
    remainder="passthrough",
)

## Test the ColumnTransformer

In [11]:
# Work on a deep copy so that the original ColumnTransformer stays unfitted.
column_transformer_copy = deepcopy(column_transformer)

# Fit on the first rows only; this is a smoke test, not the real fit.
column_transformer_copy.fit(data.head())

# Transform a single row that was not among the rows used for fitting.
result = column_transformer_copy.transform(data.iloc[[6]])

# expecting 21 features to come out of the ColumnTransformer
output_width = len(result[0])
if output_width != 21:
    raise ValueError("Unexpected number of columns found in the dataframe.")

result

array([[ 1.86531968,  0.38203679,  0.16142686, -1.11613741,  0.68599434,
        -0.44990061,  2.7080128 ,  0.65327458, -1.15510633, -1.0575628 ,
         1.66767126,  0.21887026,  1.39707095,  1.3462912 ,  0.        ,
         0.        ,  1.        ,  1.        ,  0.        ,  1.        ,
         3.        ]])

## Save ColumnTransformer

NOTE: the ColumnTransformer object is saved in an UNFITTED state; it will be fitted to the data set later.

In [12]:
# Serialize the original (unfitted) transformer; only deep copies were fitted
# in the test cells. The file is written to the current working directory.
joblib.dump(column_transformer, "column_transformer.joblib")

['column_transformer.joblib']