In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization library
import matplotlib.pyplot as plt # data visualization library
import scipy.stats as stats # library of statistical functions
import warnings
warnings.filterwarnings("ignore") # warnings filter to never print matching warnings

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    paths_in_dir = [os.path.join(dirname, filename) for filename in filenames]
    for path in paths_in_dir:
        print(path)



Reading Data

In [None]:
# Load the train and test tables; pandas reads the zipped CSV files directly.
df_train = pd.read_csv("/kaggle/input/liberty-mutual-group-property-inspection-prediction/train.csv.zip")
df_test = pd.read_csv("/kaggle/input/liberty-mutual-group-property-inspection-prediction/test.csv.zip")

# Understanding the Data

In [None]:
# Column names, non-null counts, dtypes and memory usage of the training frame.
df_train.info()

In [None]:
# Same summary for the test frame.
df_test.info()

Getting Unique values of each column

In [None]:
# One line per column: upper-cased column name followed by its distinct values.
for col, values in df_train.items():
    print('{} :{} '.format(col.upper(), values.unique()))

Reducing data size

Converting Object data type to Category

In [None]:
# Object-typed columns are taken from the training frame; the same columns
# are converted to the category dtype in both frames.
objectCol = df_train.select_dtypes(include=['object']).columns.tolist()

for col in objectCol:
    for frame in (df_train, df_test):
        frame[col] = frame[col].astype("category")

Changing the numerical data types to reduce size further

In [None]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, verbose=False):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Integer columns (dtype name starting with ``int``) are cast to the first
    of int8/int16/int32/int64 whose range contains the column's min and max.
    Float columns (dtype name starting with ``float``) are cast to float16 or
    float32 when the min and max fit, otherwise to float64. All other columns
    (object, category, ...) are left untouched.

    Note that the float check only looks at the value *range*: casting to
    float16/float32 can lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default False
        When True, print the memory usage before and after the conversion.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    if verbose:
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        type_name = str(df[col].dtype)
        is_int = type_name[:3] == 'int'
        is_float = type_name[:5] == 'float'
        if not (is_int or is_float):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if is_int:
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN (comparisons are False).
                df[col] = df[col].astype(np.float64)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        if start_mem:
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Downcast the numeric columns of both frames, then show the resulting dtypes
# and memory usage of the training frame.
df_train = reduce_mem_usage(df_train)
df_test = reduce_mem_usage(df_test)
df_train.info()

Splitting Column names based on Data Types

In [None]:
# Keep the test Ids aside (they are needed for the output file), then drop the
# Id column from both frames.
df_test_Id = df_test['Id']
df_train = df_train.drop(columns=['Id'])
df_test = df_test.drop(columns=['Id'])

# Numerical columns: everything that is neither object nor category.
# Categorical columns: whatever is left.
df_numerical_cols = df_train.select_dtypes(exclude=['object', 'category']).columns.tolist()
df_categorical_cols = [name for name in df_train.columns if name not in df_numerical_cols]

In [None]:
# Display the list of numerical column names.
df_numerical_cols

In [None]:
# Display the list of categorical column names.
df_categorical_cols

Histogram

In [None]:
# One row per numerical column: histogram with KDE on the left,
# normal probability (Q-Q) plot on the right.
n_numeric = len(df_numerical_cols)
fig, axs = plt.subplots(n_numeric, 2)
fig.set_size_inches(10, 120)
for i, col in enumerate(df_numerical_cols):
    hist_ax, qq_ax = axs[i, 0], axs[i, 1]
    sns.histplot(x=df_train[col], ax=hist_ax, kde=True)
    stats.probplot(df_train[col], dist="norm", plot=qq_ax)

Correlation matrix

In [None]:
# Correlate the numeric columns only: df_train still holds category columns,
# and DataFrame.corr() raises on non-numeric data in recent pandas versions.
corrmat = df_train.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(15, 15))
# Boolean mask hiding the upper triangle (diagonal included). Building it from
# ones instead of the correlation values keeps an exact 0 correlation masked too.
matrix = np.triu(np.ones_like(corrmat, dtype=bool))
sns.heatmap(corrmat, square=True, annot=True, fmt='.1g',  cbar=False, mask=matrix, ax=ax);

No feature is highly correlated

Skewness along the index axis

In [None]:
# Skewness per numeric column. Category columns are excluded explicitly because
# DataFrame.skew() raises on non-numeric data in recent pandas versions.
df_train.select_dtypes(include='number').skew(axis = 0, skipna = True)

# Data Preprocessing

Reducing the skewness of columns with |skewness| > 1

Affected columns: **T2_V2, T2_V6, T2_V8, T2_V14, T2_V15**.
The skewness of T2_V8 could not be removed, so it is not transformed below.

In [None]:
col = 'T2_V2'
fig, axs = plt.subplots(3)
fig.set_size_inches(8, 15)
before_ax, after_ax, qq_ax = axs

# Distribution and skew before the transform.
sns.kdeplot(df_train[col], color='Purple', fill=True, ax=before_ax)
print("Old skew of %s: %.2f" % (col, df_train[col].skew(axis=0, skipna=True)))

# Natural log of strictly positive values; every other value maps to 0.
# The same mapping is applied to the train and test frames.
log_or_zero = lambda i: np.log(i) if i > 0 else 0
df_train[col] = df_train[col].map(log_or_zero)
df_test[col] = df_test[col].map(log_or_zero)

# Distribution, normal probability plot and skew after the transform.
sns.kdeplot(df_train[col], color='Orange', fill=True, ax=after_ax)
stats.probplot(df_train[col], dist="norm", plot=qq_ax)
print("New skew of %s: %.2f" % (col, df_train[col].skew(axis=0, skipna=True)))

In [None]:
col = 'T2_V6'
fig, axs = plt.subplots(3)
fig.set_size_inches(8, 15)
before_ax, after_ax, qq_ax = axs

# Distribution and skew before the transform.
sns.kdeplot(df_train[col], color='Purple', fill=True, ax=before_ax)
print("Old skew of %s: %.2f" % (col, df_train[col].skew(axis=0, skipna=True)))

# Natural log of strictly positive values; every other value maps to 0.
# The same mapping is applied to the train and test frames.
log_or_zero = lambda i: np.log(i) if i > 0 else 0
df_train[col] = df_train[col].map(log_or_zero)
df_test[col] = df_test[col].map(log_or_zero)

# Distribution, normal probability plot and skew after the transform.
sns.kdeplot(df_train[col], color='Orange', fill=True, ax=after_ax)
stats.probplot(df_train[col], dist="norm", plot=qq_ax)
print("New skew of %s: %.2f" % (col, df_train[col].skew(axis=0, skipna=True)))

In [None]:
col = 'T2_V14'
fig, axs = plt.subplots(3)
fig.set_size_inches(8, 15)
before_ax, after_ax, qq_ax = axs

# Distribution and skew before the transform.
sns.kdeplot(df_train[col], color='Purple', fill=True, ax=before_ax)
print("Old skew of %s: %.2f" % (col, df_train[col].skew(axis=0, skipna=True)))

# Natural log of strictly positive values; every other value maps to 0.
# The same mapping is applied to the train and test frames.
log_or_zero = lambda i: np.log(i) if i > 0 else 0
df_train[col] = df_train[col].map(log_or_zero)
df_test[col] = df_test[col].map(log_or_zero)

# Distribution, normal probability plot and skew after the transform.
sns.kdeplot(df_train[col], color='Orange', fill=True, ax=after_ax)
stats.probplot(df_train[col], dist="norm", plot=qq_ax)
print("New skew of %s: %.2f" % (col, df_train[col].skew(axis=0, skipna=True)))

In [None]:
col = 'T2_V15'
fig, axs = plt.subplots(3)
fig.set_size_inches(8, 15)
before_ax, after_ax, qq_ax = axs

# Distribution and skew before the transform.
sns.kdeplot(df_train[col], color='Purple', fill=True, ax=before_ax)
print("Old skew of %s: %.2f" % (col, df_train[col].skew(axis=0, skipna=True)))

# Natural log of strictly positive values; every other value maps to 0.
# The same mapping is applied to the train and test frames.
log_or_zero = lambda i: np.log(i) if i > 0 else 0
df_train[col] = df_train[col].map(log_or_zero)
df_test[col] = df_test[col].map(log_or_zero)

# Distribution, normal probability plot and skew after the transform.
sns.kdeplot(df_train[col], color='Orange', fill=True, ax=after_ax)
stats.probplot(df_train[col], dist="norm", plot=qq_ax)
print("New skew of %s: %.2f" % (col, df_train[col].skew(axis=0, skipna=True)))

Splitting columns into features and dependent columns

In [None]:
# NOTE: this is a second name for df_train, not a copy. The frames derived
# from it below are independent objects.
df1 = df_train

# Separate the features from the target column.
X = df1.drop(columns=['Hazard'])
# Explicit copies: y is assigned into later (target scaling), and X_df_test
# gets modified during encoding, which must not leak back into df_test.
y = df1[['Hazard']].copy()
X_df_test = df_test.copy()

In [None]:
# Hazard is the target, not a feature. Filtering (instead of list.remove)
# keeps this cell safe to re-run: remove() raises once the name is gone.
df_numerical_cols = [name for name in df_numerical_cols if name != 'Hazard']

Deleting unused variables and collecting memory

In [None]:
import gc
# df1 is not referenced again in the cells shown; drop the name and run a
# garbage-collection pass (gc.collect() returns the number of objects collected).
del df1
gc.collect()

Encoding categorical data

In [None]:
# If some categorical values occur in only one of the train/test sets,
# encoding the two sets separately would produce different dummy columns.
# Encoding the stacked frame guarantees the same columns in both.
n_train_rows = len(X)
combined = pd.concat([X, X_df_test])
df = pd.get_dummies(combined[df_categorical_cols])
df_dummies = pd.concat([combined[df_numerical_cols], df], axis=1)
# Split back by position: the first n_train_rows rows are the training rows.
# This avoids adding a marker column to the inputs, and the explicit copies
# make the results independent frames that later cells can assign into.
X = df_dummies.iloc[:n_train_rows].copy()
X_df_test = df_dummies.iloc[n_train_rows:].copy()
X.shape

# Modeling

We first check how well each model scores on a hold-out split of the training data; the selected model is then used to predict the outcome for the test data.

Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Work on independent copies so the column assignments below do not write
# into slices of X / y.
X_train, X_test = X_train.copy(), X_test.copy()
y_train, y_test = y_train.copy(), y_test.copy()

# Feature scaler: fitted once on the numerical columns of the training split
# (StandardScaler standardizes each column independently), then applied
# unchanged to the hold-out split.
sc_X = StandardScaler()
X_train[df_numerical_cols] = sc_X.fit_transform(X_train[df_numerical_cols])
X_test[df_numerical_cols] = sc_X.transform(X_test[df_numerical_cols])

# Target scaler: kept under the name `sc`, because later cells use `sc` to
# inverse-transform predictions back to the original Hazard scale.
sc = StandardScaler()
y_train['Hazard'] = sc.fit_transform(y_train[['Hazard']]).ravel()
y_test['Hazard'] = sc.transform(y_test[['Hazard']]).ravel()

Packages

In [None]:
# Base Models
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Ensembling and Boosting
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


The Gini functions below are the Python implementation from the [Gini coefficient discussion with code samples](https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703):

In [None]:
def gini(actual, pred):
    """Return the (unnormalized) Gini coefficient of ``pred`` against ``actual``.

    The actual values are ordered by decreasing prediction (ties broken by
    original position), and the cumulative share of the actual total is
    compared with the share expected from a random ordering.

    Parameters
    ----------
    actual : array-like
        Observed values; a 1-D sequence or a single-column 2-D object.
    pred : array-like
        Predicted values, same number of elements as ``actual``.

    Returns
    -------
    float
        The Gini coefficient. The result is not finite when ``actual`` sums
        to zero (division by the total).
    """
    # Builtin `float` replaces the `np.float` alias, which no longer exists
    # in current NumPy releases.
    actual = np.asarray(actual, dtype=float).ravel()
    pred = np.asarray(pred, dtype=float).ravel()
    assert len(actual) == len(pred)

    n_obs = len(actual)
    # lexsort uses the LAST key as primary: sort by -pred, then by position.
    order = np.lexsort((np.arange(n_obs), -pred))
    sorted_actual = actual[order]

    total_losses = sorted_actual.sum()
    gini_sum = sorted_actual.cumsum().sum() / total_losses
    gini_sum -= (n_obs + 1) / 2.
    return gini_sum / n_obs

def gini_normalized(actual, pred):
    """Gini of ``pred`` divided by the Gini of a perfect ordering.

    The denominator is the Gini obtained when the actual values are used as
    their own predictions. The higher the result, the better.
    """
    model_gini = gini(actual, pred)
    perfect_gini = gini(actual, actual)
    return model_gini / perfect_gini

Base Modeling

In [None]:
# (name, estimator) pairs to compare. Every estimator with a random component
# gets a fixed random_state so the comparison is reproducible.
models = [
    ('LinearRegression', LinearRegression()),
    ('GradientBoostingRegressor', GradientBoostingRegressor(random_state = 0)),
    ('DecisionTreeRegressor',DecisionTreeRegressor(random_state = 0)),
    ('RandomForestRegressor',RandomForestRegressor(n_estimators = 10, random_state = 0))
]

Gini score

In [None]:
# Bring the targets back to the original Hazard scale ONCE, outside the loop,
# and into new variables. Doing it in place inside the loop would un-scale
# y_train / y_test again on every iteration, so every model after the first
# would be fitted and scored against corrupted targets.
# StandardScaler expects 2-D input, hence the [['Hazard']] selection.
y_test_actual = sc.inverse_transform(y_test[['Hazard']].values).ravel()
y_train_actual = sc.inverse_transform(y_train[['Hazard']].values).ravel()
# 1-D scaled target for fitting, so every model returns 1-D predictions.
y_train_scaled = y_train['Hazard'].values

# The printed scores are normalized Gini values (see gini_normalized).
print("The accuracy scores of the models are :")
for model_name, model in models:
    model.fit(X_train, y_train_scaled)

    y_pred = np.reshape(model.predict(X_test), (-1, 1))
    y_pred = sc.inverse_transform(y_pred).ravel()
    print(model_name, " after rescale Test score: ", gini_normalized(y_test_actual, y_pred))

    y_pred = np.reshape(model.predict(X_train), (-1, 1))
    y_pred = sc.inverse_transform(y_pred).ravel()
    print(model_name, " after rescale Train score: ", gini_normalized(y_train_actual, y_pred))

GradientBoostingRegressor and LinearRegression performed the best, so we will use these two models. Now we will train on the whole training data and predict on the test data.

Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Work on independent copies so the column assignments below do not write
# into slices of other frames.
X, X_df_test, y = X.copy(), X_df_test.copy(), y.copy()

# Feature scaler: fitted once on the numerical columns of the full training
# set (each column is standardized independently), then applied to the test set.
sc_X = StandardScaler()
X[df_numerical_cols] = sc_X.fit_transform(X[df_numerical_cols])
X_df_test[df_numerical_cols] = sc_X.transform(X_df_test[df_numerical_cols])

# Target scaler: kept under the name `sc`, because the prediction cell uses
# `sc` to inverse-transform y_pred.
sc = StandardScaler()
y['Hazard'] = sc.fit_transform(y[['Hazard']]).ravel()

In [None]:
# Fit on the full (scaled) training set; random_state makes the run reproducible.
model = GradientBoostingRegressor(random_state=0)
# Pass the target as a 1-D array rather than a single-column frame.
model.fit(X, y['Hazard'].values)
y_pred = model.predict(X_df_test)
# StandardScaler needs 2-D input; flatten afterwards so the predictions can be
# used as a DataFrame column.
y_pred = sc.inverse_transform(y_pred.reshape(-1, 1)).ravel()
output = pd.DataFrame({'Id': df_test_Id, 'Hazard': y_pred})
output.to_csv('GradientBoostingRegressor10May.csv', index=False)