In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Data

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
    )
)

In [None]:
# Keep independent snapshots of the freshly loaded frames. Later cells modify
# train_df/test_df, while the snapshots are read again for the 'claim' and 'id' columns.
train_df1, test_df1 = train_df.copy(deep=True), test_df.copy(deep=True)

In [None]:
# Dimensions of the training frame as (rows, columns).
train_df.shape

In [None]:
# Dimensions of the test frame as (rows, columns).
test_df.shape

In [None]:
# Display the training frame.
train_df

In [None]:
# Display the test frame.
test_df

### Checking for null values

In [None]:
# Number of missing values in each column of the training data.
train_df.isna().sum()

In [None]:
# Missing values per training column, as a percentage of the total row count.
train_df.isna().sum().div(train_df.shape[0]).mul(100)

In [None]:
# Number of missing values in each column of the test data.
test_df.isna().sum()

In [None]:
# Missing values per test column, as a percentage of the total row count.
test_df.isna().sum().div(test_df.shape[0]).mul(100)

In [None]:
# Impute missing values with per-column means learned from the training data only.
# The same training means are applied to the test set so both frames go through an
# identical transformation; imputing the test set with its own means would give the
# two frames different fill values for the same feature.
# Means for columns that do not exist in the target frame (e.g. 'claim' for the test
# set) are simply ignored by fillna.
train_means = train_df.mean()
train_df.fillna(value=train_means, inplace=True)
test_df.fillna(value=train_means, inplace=True)

In [None]:
# Re-count missing values per training column after imputation.
train_df.isna().sum()

In [None]:
# Re-count missing values per test column after imputation.
test_df.isna().sum()

In [None]:
# Remove the 'id' column from the training frame (modifies train_df in place), then show it.
train_df.drop(columns='id', inplace=True)
train_df

In [None]:
# Remove the 'id' column from the test frame (modifies test_df in place), then show it.
test_df.drop(columns='id', inplace=True)
test_df

# Feature Selection.

In [None]:
from sklearn.feature_selection import SelectKBest

In [None]:
# Separate the predictors from the 'claim' target for feature scoring.
Y = train_df['claim']
X = train_df.drop(columns='claim')

In [None]:
# Fit a SelectKBest selector with library-default settings on the predictors and target.
# Later cells read its per-feature scores_ attribute.
imp_features = SelectKBest().fit(X, Y)
imp_features

In [None]:
# Shape of the score array produced by the fitted selector.
imp_features.scores_.shape

In [None]:
# Tabulate the selector's score for every predictor, indexed by column name.
features = pd.DataFrame({'feature_score': imp_features.scores_}, index=X.columns)
features

In [None]:
# Order the table from the highest-scoring feature to the lowest.
features = features.sort_values('feature_score', ascending=False)
features

In [None]:
# Retain only the features whose score is strictly above 15.
top_features = features.loc[features['feature_score'].gt(15)]
top_features

In [None]:
# How many features passed the score threshold.
top_features.shape[0]

In [None]:
# Collect the names of the retained features (the index of top_features) and print them.
list_features=top_features.index
print(list_features)

In [None]:
# Number of retained feature names.
list_features.size

In [None]:
# Keep only the selected feature columns, then re-attach the 'claim' target taken from
# the untouched snapshot of the training data.
train_df = pd.concat(
    (train_df.loc[:, list_features], train_df1['claim']),
    axis='columns',
)
train_df

In [None]:
# Dimensions of the training frame after feature selection (selected features + 'claim').
train_df.shape

In [None]:
# Restrict the test frame to the same selected feature columns.
test_df = test_df.loc[:, list_features]
test_df

In [None]:
# Dimensions of the test frame after feature selection.
test_df.shape

In [None]:
# Snapshot of the selected test features before test_df is replaced by its scaled version.
test1_df = test_df.copy(deep=True)

In [None]:
# Snapshot of the selected training data before train_df is replaced by its scaled version.
# The tree-based models below are trained from this copy.
train1_df = train_df.copy(deep=True)

# Standardization

In [None]:
from sklearn.preprocessing import StandardScaler
# Single scaler instance: later cells fit it on the training features and reuse it for the test set.
ss=StandardScaler()

In [None]:
# Fit the scaler on the training predictors (target excluded) and standardize them in one step.
scaled_train = ss.fit_transform(train_df.drop(columns='claim'))
scaled_train

In [None]:
# Wrap the scaled training matrix in a DataFrame. The column labels are taken from the
# training frame itself (everything except 'claim'), which is exactly what was passed to
# the scaler. Borrowing test_df.columns only works while both frames happen to share
# the same column order.
train_df_s=pd.DataFrame(scaled_train,columns=train_df.columns.drop('claim'))
train_df_s

In [None]:
# Put the 'claim' target (from the untouched snapshot) next to the scaled predictors.
train_df = pd.concat((train_df_s, train_df1['claim']), axis='columns')
train_df

In [None]:
# Scale the test features with the scaler already fitted on the training data
# (transform only, no refit), then show the resulting array.
scaled_test=ss.transform(test_df)
scaled_test

In [None]:
# Wrap the scaled array back into a DataFrame, restoring the feature column names.
# From here on test_df holds standardized values; the unscaled copy is test1_df.
test_df=pd.DataFrame(scaled_test,columns=test_df.columns)
test_df

# Model Building

### Logistic Regression

In [None]:
# Predictors and target for the logistic regression, taken from the scaled training frame.
Y = train_df['claim']
X = train_df.drop(columns='claim')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Train a logistic regression with default hyperparameters on the training split.
lr = LogisticRegression().fit(X_train, Y_train)
lr

In [None]:
# Hard class predictions of the logistic regression on the held-out split.
y_pred_lr=lr.predict(X_test)

In [None]:
# Report hold-out metrics for the logistic regression.
# Accuracy and F1 are computed on the hard class predictions. ROC AUC is a ranking
# metric, so it is scored on the predicted probability of the positive class (column 1);
# feeding it 0/1 labels would reduce the ROC curve to a single operating point.
y_proba_lr=lr.predict_proba(X_test)[:,1]
print('accuracy_score:{}'.format(accuracy_score(Y_test,y_pred_lr)))
print('-'*75)
print('f1_score:{}'.format(f1_score(Y_test,y_pred_lr)))
print('-'*75)
print('roc_auc_score:{}'.format(roc_auc_score(Y_test,y_proba_lr)))

### Decision Tree Classifier

In [None]:
# Predictors and target for the tree-based models, taken from the unscaled snapshot.
y = train1_df['claim']
x = train1_df.drop(columns='claim')

In [None]:
# Hold out 20% of the unscaled rows for validation, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Baseline decision tree with default hyperparameters.
# random_state is fixed because the tree permutes features at every split, so an
# unseeded fit can differ between runs. The same estimator is later handed to
# RandomizedSearchCV, whose cloned candidates inherit this seed.
dt=DecisionTreeClassifier(random_state=42)
dt.fit(x_train,y_train)

In [None]:
# Hard class predictions of the baseline decision tree on the held-out split.
y_pred_dt=dt.predict(x_test)

In [None]:
# Report hold-out metrics for the baseline decision tree.
# Accuracy and F1 use the hard class predictions; ROC AUC is scored on the predicted
# probability of the positive class (column 1) rather than on 0/1 labels.
y_proba_dt=dt.predict_proba(x_test)[:,1]
print('accuracy_score:{}'.format(accuracy_score(y_test,y_pred_dt)))
print('-'*75)
print('f1_score:{}'.format(f1_score(y_test,y_pred_dt)))
print('-'*75)
print('roc_auc_score:{}'.format(roc_auc_score(y_test,y_proba_dt)))

## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomized decision-tree tuning.

# Number of features considered at each split. 'auto' is deliberately not listed:
# for a tree classifier it was an alias of 'sqrt', and it is deprecated in
# scikit-learn 1.1 and removed in 1.3, where any candidate using it fails to fit.
max_features=['sqrt','log2']

# Ten evenly spaced maximum depths between 10 and 1000.
max_depth = [int(depth) for depth in np.linspace(10, 1000,10)]

# Minimum number of samples required to split an internal node.
min_samples_split = [2, 5, 10,14]

# Minimum number of samples required at a leaf.
min_samples_leaf = [1, 2, 4,6,8]

random_grid={'max_features':max_features,
            'max_depth':max_depth,
            'min_samples_split':min_samples_split,
            'min_samples_leaf':min_samples_leaf,
            'criterion':['gini','entropy']}

In [None]:
# Print the search space for inspection.
print(random_grid)

In [None]:
# Randomized search over the decision-tree grid: 100 sampled settings, 3-fold
# cross-validation, all CPU cores, and a fixed seed for the parameter sampling.
dt_randomized = RandomizedSearchCV(
    estimator=dt,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    random_state=100,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the randomized search on the training split (100 candidates x 3 folds).
dt_randomized.fit(x_train,y_train)

In [None]:
# Hyperparameter combination selected by the search.
dt_randomized.best_params_

In [None]:
# Take the best estimator found by the search as the tuned decision tree.
dt_final=dt_randomized.best_estimator_

In [None]:
# Hard class predictions of the tuned decision tree on the held-out split.
y_pred_dt_tuned=dt_final.predict(x_test)

In [None]:
# Report hold-out metrics for the tuned decision tree.
# Accuracy and F1 use the hard class predictions; ROC AUC is scored on the predicted
# probability of the positive class (column 1) rather than on 0/1 labels.
y_proba_dt_tuned=dt_final.predict_proba(x_test)[:,1]
print('accuracy_score:{}'.format(accuracy_score(y_test,y_pred_dt_tuned)))
print('-'*75)
print('f1_score:{}'.format(f1_score(y_test,y_pred_dt_tuned)))
print('-'*75)
print('roc_auc_score:{}'.format(roc_auc_score(y_test,y_proba_dt_tuned)))

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyperparameters, trained on the unscaled training split.
# random_state makes the bootstrap samples and feature draws repeatable between runs;
# n_jobs=-1 builds the trees on all CPU cores and does not change the fitted model.
rf=RandomForestClassifier(random_state=42,n_jobs=-1)
rf.fit(x_train,y_train)

In [None]:
# Hard class predictions of the random forest on the held-out split.
y_pred_rf=rf.predict(x_test)


In [None]:
# Report hold-out metrics for the random forest.
# Accuracy and F1 use the hard class predictions; ROC AUC is scored on the predicted
# probability of the positive class (column 1) rather than on 0/1 labels.
y_proba_rf=rf.predict_proba(x_test)[:,1]
print('accuracy_score:{}'.format(accuracy_score(y_test,y_pred_rf)))
print('-'*75)
print('f1_score:{}'.format(f1_score(y_test,y_pred_rf)))
print('-'*75)
print('roc_auc_score:{}'.format(roc_auc_score(y_test,y_proba_rf)))

In [None]:
# The forest was fitted on unscaled features (x_train is split from train1_df), so it
# must score the matching unscaled test snapshot. At this point test_df holds
# standardized values, which the trees' split thresholds were never trained on.
# NOTE(review): predictions are hard 0/1 labels; if the competition is scored on ROC AUC,
# rf.predict_proba(test1_df)[:, 1] would be the better submission - confirm.
predictions=rf.predict(test1_df)

In [None]:
# Build the submission table: the original test ids (from the untouched test snapshot)
# next to the predicted claim values, written to CSV without the index column.
output = pd.DataFrame({'id': test_df1['id'], 'claim': predictions})
output.to_csv('submission_tabular_sept.csv', index=False)
print("Your submission was successfully saved!")