In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.inspection import permutation_importance

from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

import tensorflow as tf
from tensorflow import keras


from catboost import CatBoostClassifier

import xgboost


2024-07-01 10:35:35.793047: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-01 10:35:35.793218: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-01 10:35:35.942520: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
SOURCE_DIR = '/kaggle/input/playground-series-s4e7'

In [3]:
df_train = pd.read_csv(os.path.join(SOURCE_DIR, "train.csv"))
df_test = pd.read_csv(os.path.join(SOURCE_DIR, "test.csv"))
df_sub = pd.read_csv(os.path.join(SOURCE_DIR, "sample_submission.csv"))
df_train.drop(columns=['id'], inplace=True)
df_test.drop(columns=['id'], inplace=True)
print(df_train.shape)
df_train.head(5)

(11504798, 11)


Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


In [4]:
print(df_test.shape)
df_test.head(5)

(7669866, 10)


Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,Female,20,1,47.0,0,< 1 Year,No,2630.0,160.0,228
1,Male,47,1,28.0,0,1-2 Year,Yes,37483.0,124.0,123
2,Male,47,1,43.0,0,1-2 Year,Yes,2630.0,26.0,271
3,Female,22,1,47.0,1,< 1 Year,No,24502.0,152.0,115
4,Male,51,1,19.0,0,1-2 Year,No,34115.0,124.0,148


In [5]:
print(df_sub.shape)
df_sub.head(5)

(7669866, 2)


Unnamed: 0,id,Response
0,11504798,0.5
1,11504799,0.5
2,11504800,0.5
3,11504801,0.5
4,11504802,0.5


In [6]:
df_train.isna().sum()

Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64

In [7]:
df_test.isna().sum()

Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
dtype: int64

In [8]:
df_train.dtypes

Gender                   object
Age                       int64
Driving_License           int64
Region_Code             float64
Previously_Insured        int64
Vehicle_Age              object
Vehicle_Damage           object
Annual_Premium          float64
Policy_Sales_Channel    float64
Vintage                   int64
Response                  int64
dtype: object

In [9]:
df = pd.concat([df_train.drop(columns=['Response']), df_test])

In [10]:
df.head(2)

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187
1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288


In [11]:
feature_cols = df.columns.tolist()
feature_cols

['Gender',
 'Age',
 'Driving_License',
 'Region_Code',
 'Previously_Insured',
 'Vehicle_Age',
 'Vehicle_Damage',
 'Annual_Premium',
 'Policy_Sales_Channel',
 'Vintage']

In [12]:
enc_gender = LabelEncoder()
enc_gender.fit(df['Gender'])

enc_vehicle_age = LabelEncoder()
enc_vehicle_age.fit(df['Vehicle_Age'])

enc_vehicle_damage = LabelEncoder()
enc_vehicle_damage.fit(df['Vehicle_Damage'])

In [13]:
df['Gender'] = enc_gender.transform(df['Gender'])
df['Vehicle_Age'] = enc_vehicle_age.transform(df['Vehicle_Age'])
df['Vehicle_Damage'] = enc_vehicle_damage.transform(df['Vehicle_Damage'])

df_train['Gender'] = enc_gender.transform(df_train['Gender'])
df_train['Vehicle_Age'] = enc_vehicle_age.transform(df_train['Vehicle_Age'])
df_train['Vehicle_Damage'] = enc_vehicle_damage.transform(df_train['Vehicle_Damage'])

df_test['Gender'] = enc_gender.transform(df_test['Gender'])
df_test['Vehicle_Age'] = enc_vehicle_age.transform(df_test['Vehicle_Age'])
df_test['Vehicle_Damage'] = enc_vehicle_damage.transform(df_test['Vehicle_Damage'])

In [14]:
X_train, X_val, y_train, y_val = train_test_split(df_train.drop(columns=['Response']), df_train['Response'].tolist(), test_size=0.25, stratify=df_train['Response'].tolist(),  random_state=42)

In [15]:
scaler = StandardScaler()
scaler.fit(df)

X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [16]:
enc = LabelEncoder()
enc.fit(y_train)

y_train = enc.transform(y_train)
y_val = enc.transform(y_val)

In [17]:
def get_features_importance(m,X,y,cols):
    # Calculate permutation feature importance
    result = permutation_importance(
        m, X, y, scoring='neg_log_loss', n_repeats=10, random_state=42
    )

    # Get the feature importances and sort them in descending order
    feature_importances = pd.Series(result.importances_mean, index=cols).sort_values(ascending=False)

    # Print the feature importances
    print(feature_importances)

In [18]:
lrc = LogisticRegression()
param_grid={ "penalty":["l1","l2"]}

rf_RandomGrid = RandomizedSearchCV(estimator = lrc, param_distributions = param_grid, cv = 5)
rf_RandomGrid.fit(X_train, y_train)

5 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/conda/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



In [19]:
rf_RandomGrid.best_params_, rf_RandomGrid.best_score_

({'penalty': 'l2'}, 0.8769984416938261)

In [20]:
lrc = LogisticRegression(**rf_RandomGrid.best_params_)
lrc.fit(X_train, y_train)

In [21]:
print (f'Train Accuracy - : {lrc.score(X_train,y_train):.3f}')
print (f'Test Accuracy - : {lrc.score(X_val,y_val):.3f}')

Train Accuracy - : 0.877
Test Accuracy - : 0.877


In [22]:
scores = cross_val_score(lrc,X_train, y_train,cv=10)
print(scores)
print(np.std(scores))
print(np.mean(scores))

[0.87699627 0.8770009  0.87699627 0.87699975 0.8770009  0.87699279
 0.87700206 0.87699743 0.87699728 0.87700076]
2.7579041391778047e-06
0.8769984416936752


In [23]:
get_features_importance(lrc,X_train,y_train,feature_cols)

Previously_Insured      0.189420
Vehicle_Damage          0.080745
Policy_Sales_Channel    0.004595
Vehicle_Age             0.001212
Age                     0.001007
Gender                  0.000380
Driving_License         0.000242
Annual_Premium          0.000127
Vintage                 0.000066
Region_Code             0.000004
dtype: float64


In [24]:
Xt = scaler.transform(df_test)

res = lrc.predict_proba(Xt)

In [25]:
df_sub['Response'] = res[:, 1]
df_sub.to_csv("submission.csv", index=False)