In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style = "darkgrid")
# !pip install datawig
# import datawig # impute missing values 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# walk the Kaggle input directory and echo the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# location of the training CSV, relative to the notebook's working directory
path = "../input/porto-seguro-safe-driver-prediction/train.csv"
# load the whole training file into a DataFrame
data = pd.read_csv(filepath_or_buffer = path)

In [None]:
# preview the first ten rows
data.head(n = 10)

In [None]:
# remove the 'id' column before any further processing
data = data.drop(columns = ['id'])

In [None]:
# dataset dimensions; `rows` is reused later as the denominator for null percentages
rows, columns = data.shape
print("Data has {} rows, {} columns".format(rows, columns))

In [None]:
# print pandas' summary of the frame (after the 'id' column was dropped)
data.info()

# DATA PREPROCESSING

## HANDLE NULLS

In [None]:
# percentage of NaN entries per column, relative to the total row count
null_counts = data.isna().sum()
nulls = null_counts.div(rows).mul(100)
nulls

The dataset description states that a value of -1 represents a missing value, so every -1 is converted to NaN before the nulls are counted again.

In [None]:
# turn every -1 into NaN so it is counted as a missing value
data = data.replace(-1, np.nan)
# recompute the per-column percentage of missing values
nulls = data.isna().sum().div(rows).mul(100)
nulls

In [None]:
# maximum tolerated percentage of nulls in a column
null_threshold = 15

# columns above the threshold are dropped
above_threshold = nulls[nulls > null_threshold]
drop_nulls = list(above_threshold.index)
# columns with some nulls, but not more than the threshold, are kept and imputed later
retain_nulls = list(nulls[(nulls > 0) & ~(nulls > null_threshold)].index)

print("Columns with nulls more than threshold :\n")
for column_name in drop_nulls:
    print(column_name, nulls[column_name])


In [None]:
# discard the columns whose null percentage exceeded the threshold
data = data.drop(columns = drop_nulls)

## SPLIT INTO TRAIN & VALIDATION PARTS

In [None]:
from sklearn.model_selection import train_test_split

# features (independent variables): every column except the target
X = data.drop(columns = ['target'])
# label (dependent variable)
y = data['target']

# hold out 30% of the rows for validation; the fixed random_state keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 5,
)

# 

## HANDLE CATEGORICAL VARIABLES

In [None]:
# every feature whose name contains 'cat' is treated as categorical
cat_columns = [name for name in X_train.columns if 'cat' in name]

cat_columns

In [None]:
# categorical columns that are binary in nature and therefore left un-encoded
binary_cat_columns = ['ps_ind_04_cat', 'ps_car_02_cat', 'ps_car_07_cat', 'ps_car_08_cat']

# work on an array so the selection can be done with a boolean mask
cat_columns = np.array(cat_columns)
cat_columns = cat_columns[~np.isin(cat_columns, binary_cat_columns)]

In [None]:
# encode categorical variable using LeaveOneOutEncoding technique
# !pip install category_encoders
from category_encoders import LeaveOneOutEncoder

# build the leave-one-out encoder, restricted to the remaining categorical columns
encoder = LeaveOneOutEncoder(cols=cat_columns)

In [None]:
# Fit the encoder on the training features together with the training target and
# encode the training split in the same call; the validation split is then encoded
# with the already-fitted encoder, without passing its target.
# NOTE(review): fit_transform(X, y) presumably differs from fit(...).transform(X) for
# this encoder — keep the single call unless confirmed otherwise.
X_train = encoder.fit_transform(X_train, y_train)
X_val = encoder.transform(X_val)

In [None]:
# inputs for the imputer: every feature that had no missing values to begin with
imputer_columns = [name for name in X_train.columns if name not in retain_nulls]

In [None]:
!pip install datawig
import datawig

def ImputeNulls(train, val, imputer_columns, output_column):
    '''
    Fits a datawig SimpleImputer on the training data and runs its
    prediction on both the training and the validation data

    Args:
        train, val (DataFrame) : Training and validation datasets
        imputer_columns (list) : Columns handed to the imputer as input_columns
        output_column (string) : Column handed to the imputer as output_column

    Returns:
        tuple : (train, val) as returned by the imputer's predict()
    '''
    model = datawig.SimpleImputer(input_columns=imputer_columns, output_column=output_column)

    # the imputer only ever learns from the training split
    model.fit(train_df = train)

    # apply the fitted imputer to both splits
    train_predicted = model.predict(train)
    val_predicted = model.predict(val)
    return (train_predicted, val_predicted)

In [None]:
# training-set median for every retained column except the last one
# (the last one is handled separately by ImputeNulls)
fill_nulls = {name: X_train[name].median() for name in retain_nulls[:-1]}

# fill both splits with the medians learned on the training split
for name, median_value in fill_nulls.items():
    X_train[name] = X_train[name].fillna(value=median_value)
    X_val[name] = X_val[name].fillna(value=median_value)

In [None]:
# The last retained column is imputed with the model-based imputer instead of the median.
impute_target = retain_nulls[-1]
X_train, X_val = ImputeNulls(X_train, X_val, imputer_columns, impute_target)

# Drop the original (still incomplete) column by the same name that was imputed, rather than
# a hard-coded 'ps_car_14', so the two cannot drift apart if the null threshold changes.
# NOTE(review): presumably predict() leaves the imputed values in a separate column — confirm.
X_train = X_train.drop([impute_target], axis = 1)
X_val = X_val.drop([impute_target], axis = 1)

In [None]:
def handleOutliers(data, to_return = False):
    '''
    Removes every row that has an outlier in at least one column and reports the data loss

    A value is an outlier when it lies more than 1.5 * IQR below the first quartile
    or above the third quartile of its column. The input DataFrame is not modified.

    Args:
        data (DataFrame) : The DataFrame to remove outliers from
        to_return (bool) :  - Default value False
                            - Whether to return the DataFrame after removing outliers

    Returns:
        DataFrame : data free from outliers, with a fresh 0..n-1 index (only when to_return is True)
        None : when to_return is False (the data loss is only printed)
    '''
    total_rows = len(data)

    # an empty frame has nothing to lose; avoid dividing by zero below
    if total_rows == 0:
        print('Data loss is {}%'.format(0.0))
        return data.reset_index(drop = True) if to_return else None

    # per-column quartiles and inter quartile range
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR1 = Q3 - Q1

    # a row is dropped as soon as any of its values falls outside the fences
    outlier_rows = ((data < (Q1 - 1.5*IQR1)) | (data > (Q3 + 1.5*IQR1))).any(axis = 1)
    data_c = data[~outlier_rows]

    # report data loss
    print('Data loss is {}%'.format(((total_rows - len(data_c))/total_rows)*100))

    if to_return:
        return data_c.reset_index(drop = True)
    return None

In [None]:
# report-only call: to_return is left at its default (False), so only the data-loss
# percentage is printed and X_train is not reassigned here
handleOutliers(X_train)

In [None]:
def countOutliers(data, column):
    '''
    Computes how large a share of the rows holds an outlier in the given column

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or above the third quartile of the column.

    Args:
        data (DataFrame) : The dataset in form of Pandas DataFrame
        column (string) : The column to report the share of outliers for

    Returns:
        float : percentage of rows that are outliers in column
    '''
    values = data[column]

    # quartiles and the fences derived from them
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile
    low_fence = lower_quartile - 1.5*spread
    high_fence = upper_quartile + 1.5*spread

    # count the rows outside the fences and express them as a percentage of all rows
    outside = (values < low_fence) | (values > high_fence)
    outlier_count = int(outside.sum())
    return (outlier_count/len(data))*100

In [None]:
# map every training column to its percentage of outliers
outliers = {name: countOutliers(X_train, name) for name in X_train.columns}

In [None]:
# re-build the dict with the columns ordered from highest to lowest outlier percentage
ranked_columns = sorted(outliers, key=outliers.get, reverse=True)
outliers = {name: outliers[name] for name in ranked_columns}

In [None]:
def OutliersInfo(threshold_outliers, outliers):
    '''
    Finds the columns in data with more than threshold percentage of outliers

    The mapping does not need to be sorted: every entry is checked, and the
    result keeps the iteration order of the mapping. The 'target' column is
    never reported.

    Args:
        threshold_outliers (int) : maximum percentage of outliers acceptable in dataset
        outliers (dict) : map of columns to the percentage of outliers in each

    Returns:
        list : Columns with more than threshold percent of outliers
    '''
    # check every entry instead of stopping at the first small value, so the
    # result no longer depends on the caller having sorted the dict
    return [
        column
        for column, percentage in outliers.items()
        if percentage > threshold_outliers and column != 'target'
    ]

In [None]:
# candidate thresholds: every whole percentage from 0 to 20
thresholds_outliers = list(range(21))
# number of columns exceeding each candidate threshold
threshold_outliers_values = [
    len(OutliersInfo(candidate, outliers)) for candidate in thresholds_outliers
]

# line plot of the column count against the threshold
sns.lineplot(x=thresholds_outliers, y=threshold_outliers_values)
plt.xlabel("Thresholds")
plt.ylabel("Columns")
plt.show()

In [None]:
# chosen maximum percentage of outliers a column may have
threshold_outliers = 4

# columns exceeding that percentage
drop_outliers = OutliersInfo(threshold_outliers, outliers)

print(
    "Columns with more than {}% of values as Outliers are {}".format(
        threshold_outliers, len(drop_outliers)
    )
)

In [None]:
# remove the outlier-heavy columns from both splits
X_train = X_train.drop(columns = drop_outliers)
X_val = X_val.drop(columns = drop_outliers)

## Handle Constant Valued columns

In [None]:
# columns treated as constant-valued, removed from both splits
drop_constant_valued = [
    'ps_ind_02_cat',
    'ps_ind_10_bin',
    'ps_ind_11_bin',
    'ps_ind_12_bin',
    'ps_ind_13_bin',
    'ps_ind_14',
]

X_train = X_train.drop(columns = drop_constant_valued)
X_val = X_val.drop(columns = drop_constant_valued)

In [None]:
# Remove the rows that still contain an outlier (1.5 * IQR rule, any column).
# The same positional row mask is applied to y_train: filtering only X_train and
# resetting its index would leave the labels misaligned with the features.
assert len(X_train) == len(y_train), "X_train and y_train must have the same number of rows"

q1 = X_train.quantile(0.25)
q3 = X_train.quantile(0.75)
iqr = q3 - q1
outlier_rows = ((X_train < (q1 - 1.5*iqr)) | (X_train > (q3 + 1.5*iqr))).any(axis = 1).to_numpy()

# report data loss
print('Data loss is {}%'.format((outlier_rows.sum()/len(X_train))*100))

X_train = X_train[~outlier_rows].reset_index(drop = True)
y_train = y_train[~outlier_rows].reset_index(drop = True)

In [None]:
!pip install dataprep
from dataprep.eda import plot
plot(X_train)