In [1]:
!ls

Hackathon-Jan'23.ipynb
Problem Statement.docx
data-visualization
sample_submission.csv
sample_submission_9zqft7i.zip
submission_Raj.csv
test_koRSKBP.csv
train_BRCpofr.csv
~$oblem Statement.docx


### 1. Data Loading & managing prerequisites

In [2]:
# General Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# sklearn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

#%matplotlib inline
warnings.filterwarnings('ignore')

In [3]:
train = pd.read_csv("train_BRCpofr.csv")
train.head()

Unnamed: 0,id,gender,area,qualification,income,marital_status,vintage,claim_amount,num_policies,policy,type_of_policy,cltv
0,1,Male,Urban,Bachelor,5L-10L,1,5,5790,More than 1,A,Platinum,64308
1,2,Male,Rural,High School,5L-10L,0,8,5080,More than 1,A,Platinum,515400
2,3,Male,Urban,Bachelor,5L-10L,1,8,2599,More than 1,A,Platinum,64212
3,4,Female,Rural,High School,5L-10L,0,7,0,More than 1,A,Platinum,97920
4,5,Male,Urban,High School,More than 10L,1,6,3508,More than 1,A,Gold,59736


## 2. Data Understanding (DS + EDA)

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89392 entries, 0 to 89391
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              89392 non-null  int64 
 1   gender          89392 non-null  object
 2   area            89392 non-null  object
 3   qualification   89392 non-null  object
 4   income          89392 non-null  object
 5   marital_status  89392 non-null  int64 
 6   vintage         89392 non-null  int64 
 7   claim_amount    89392 non-null  int64 
 8   num_policies    89392 non-null  object
 9   policy          89392 non-null  object
 10  type_of_policy  89392 non-null  object
 11  cltv            89392 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 8.2+ MB


In [5]:
train.describe()

Unnamed: 0,id,marital_status,vintage,claim_amount,cltv
count,89392.0,89392.0,89392.0,89392.0,89392.0
mean,44696.5,0.575488,4.595669,4351.502416,97952.828978
std,25805.391969,0.494272,2.290446,3262.359775,90613.814793
min,1.0,0.0,0.0,0.0,24828.0
25%,22348.75,0.0,3.0,2406.0,52836.0
50%,44696.5,1.0,5.0,4089.0,66396.0
75%,67044.25,1.0,6.0,6094.0,103440.0
max,89392.0,1.0,8.0,31894.0,724068.0


#### Understanding Categorical columns

In [6]:
num_cols = ["claim_amount","cltv"]
cat_cols = train.columns[~train.columns.isin(num_cols)]

In [7]:
print("############## Checking unique categories per column ############## \n")
for cat in cat_cols:
    print(f"Total unique categories in column {cat:15}: {train[cat].unique()}")
    print(f"Number for samples per category: \n {train[cat].value_counts()}")
    print("--------------------------")

############## Checking unique categories per column ############## 

Total unique categories in column id             : [    1     2     3 ... 89390 89391 89392]
Number for samples per category: 
 2049     1
77135    1
28007    1
25958    1
32101    1
        ..
27288    1
4759     1
6806     1
661      1
2047     1
Name: id, Length: 89392, dtype: int64
--------------------------
Total unique categories in column gender         : ['Male' 'Female']
Number for samples per category: 
 Male      50497
Female    38895
Name: gender, dtype: int64
--------------------------
Total unique categories in column area           : ['Urban' 'Rural']
Number for samples per category: 
 Urban    62455
Rural    26937
Name: area, dtype: int64
--------------------------
Total unique categories in column qualification  : ['Bachelor' 'High School' 'Others']
Number for samples per category: 
 High School    46247
Bachelor       39399
Others          3746
Name: qualification, dtype: int64
---------------------

## 3. Feature Engineering

In [8]:
train = train.drop('id', axis=1)

#### Missing values impution

In [9]:
train.isna().sum()

gender            0
area              0
qualification     0
income            0
marital_status    0
vintage           0
claim_amount      0
num_policies      0
policy            0
type_of_policy    0
cltv              0
dtype: int64

In [10]:
# Ordinal Categorical columns: categorical and ordered
ord_cat_col = ['vintage', 'income', 'type_of_policy']
# Nominal Categorical columns: categorical and not ordered
nom_cat_col = cat_cols[~cat_cols.isin(ord_cat_col)].to_list()

In [11]:
print(f"Numerical Columns: {num_cols}")
print(f"Categorical Columns: {cat_cols.tolist()}")
print(f"Ordinal Columns: {ord_cat_col}")
print(f"Nominal Columns: {nom_cat_col}")

Numerical Columns: ['claim_amount', 'cltv']
Categorical Columns: ['id', 'gender', 'area', 'qualification', 'income', 'marital_status', 'vintage', 'num_policies', 'policy', 'type_of_policy']
Ordinal Columns: ['vintage', 'income', 'type_of_policy']
Nominal Columns: ['id', 'gender', 'area', 'qualification', 'marital_status', 'num_policies', 'policy']


**Common encoders supported by sklearn:**
- Ordinal Encoder for ordered categorical columns
- One-hot Encoder for non-ordered categorical columns
- Label Encoder for encoding output labels

**Components of Pipeline:**

    [x]Imputation
    [x]Encoding
    [x]Scaling
    [x]Feature Selection
    [x]Model Training

In [12]:
y_train = train.pop('cltv')
x_train = train

### 3.1. Missing value imputation: SimpleImputer

In [13]:
tnfObj1 = ColumnTransformer([
    ('impute_numirical', SimpleImputer(strategy='mean'), [6]), # For Numerical columns
    ('impute_categorical', SimpleImputer(strategy='most_frequent'), [0,1,2,3,4,5,7,8,9]) # For categorical columns
],remainder='passthrough')

### 3.2. Encoding categorical columns: OrdinalEncoder | OneHotEncoder

In [14]:
tnfObj2 = ColumnTransformer([
    ('Ordinal_Encoder', OrdinalEncoder([['<=2L', '2L-5L', '5L-10L', 'More than 10L'], ['Silver', 'Gold', 'Platinum']]),[4,9]),
    ('OneHot_Encoder', OneHotEncoder(sparse=False, drop='if_binary'), [1,2,3,8,7])
],remainder='passthrough')

### 3.3. Scaling Regressors: MinMaxScaler 

In [15]:
tnfObj3 = ColumnTransformer([
    ('MinMax_Scaler',MinMaxScaler(),slice(0,14))], remainder='passthrough'
)

### 3.4. Feature Selecion

In [16]:
tnfObj4 = SelectKBest(score_func=chi2,k=10)

## 4. Model Selecion

In [17]:
model = input("Choose model: \n 1. Linear Regression \n 2. RandomForestRegressor \n")
if model == '1':
    tnfObj5 = LinearRegression()
elif model == '2':
    tnfObj5 = RandomForestRegressor()

Choose model: 
 1. Linear Regression 
 2. RandomForestRegressor 
2


## 5. Creating and training Pipeline

In [18]:
pipe = Pipeline([
    ('tnfObj1',tnfObj1),
    ('tnfObj2',tnfObj2),
    ('tnfObj3',tnfObj3),
    ('tnfObj4',tnfObj4),
    ('tnfObj5',tnfObj5)
])

In [19]:
from sklearn import set_config
set_config(display="diagram")

In [20]:
pipe.fit(x_train,y_train)

In [21]:
pipe.named_steps['tnfObj1']

## 6. Performing prediction on test dataset

In [62]:
test = pd.read_csv("test_koRSKBP.csv")
test.head()

Unnamed: 0,id,gender,area,qualification,income,marital_status,vintage,claim_amount,num_policies,policy,type_of_policy
0,89393,Female,Rural,High School,5L-10L,0,6,2134,More than 1,B,Silver
1,89394,Female,Urban,High School,2L-5L,0,4,4102,More than 1,A,Platinum
2,89395,Male,Rural,High School,5L-10L,1,7,2925,More than 1,B,Gold
3,89396,Female,Rural,Bachelor,More than 10L,1,2,0,More than 1,B,Silver
4,89397,Female,Urban,High School,2L-5L,0,5,14059,More than 1,B,Silver


In [63]:
# Extracting test sample labels
test_id = test.pop('id')
test_id = pd.DataFrame(test_id)
x_test = test

In [65]:
x_test.head()

Unnamed: 0,gender,area,qualification,income,marital_status,vintage,claim_amount,num_policies,policy,type_of_policy
0,Female,Rural,High School,5L-10L,0,6,2134,More than 1,B,Silver
1,Female,Urban,High School,2L-5L,0,4,4102,More than 1,A,Platinum
2,Male,Rural,High School,5L-10L,1,7,2925,More than 1,B,Gold
3,Female,Rural,Bachelor,More than 10L,1,2,0,More than 1,B,Silver
4,Female,Urban,High School,2L-5L,0,5,14059,More than 1,B,Silver


In [66]:
y_pred = pipe.predict(x_test)
y_pred = pd.DataFrame(y_pred,columns=["y_pred"])

In [67]:
result = pd.concat([test_id,y_pred], axis=1)
result.head()

Unnamed: 0,id,y_pred
0,89393,100088.020031
1,89394,132983.969211
2,89395,88050.889541
3,89396,95072.40871
4,89397,119215.914603


In [68]:
result.to_csv('submission_Raj.csv', header=['id','cltv'], index=False)

##### Complete!!