<a href="https://colab.research.google.com/github/gokulakrishnanbalaji/ProCode-Kaggle/blob/main/Copy_of_ProCode_ML_starter_template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Competition website: [Kaggle competition link](https://www.kaggle.com/competitions/playground-series-s4e1/)

In [None]:
# Download the train, test and sample-submission CSVs from the
# gokulakrishnanbalaji/ProCode-Kaggle GitHub repository.
# `-O <name>` saves each file under that name in the current working directory.
! wget -O train.csv https://raw.githubusercontent.com/gokulakrishnanbalaji/ProCode-Kaggle/main/train.csv
! wget -O test.csv https://raw.githubusercontent.com/gokulakrishnanbalaji/ProCode-Kaggle/main/test.csv
! wget -O sample_submission.csv https://raw.githubusercontent.com/gokulakrishnanbalaji/ProCode-Kaggle/main/sample_submission.csv

# Importing Libraries


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data Exploration and Analysis

In [None]:
# Load the dataset.
# Relative paths match where the download cell saves the files (the current
# working directory), so the notebook also runs outside Colab, where the
# working directory is not /content.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [None]:
# Overview of the training frame.
# info() prints the column dtypes and non-null counts; describe() is the
# cell's last expression, so its summary-statistics table is what gets displayed.
# (Use train_data.head() instead to preview the first few rows.)
train_data.info()
train_data.describe()

In [None]:
# Shape of the training frame as (number of rows, number of columns)
train_data.shape

In [None]:
# Check for null values: one boolean per column, True when that column
# contains at least one missing value
train_data.isna().any()

In [None]:
# Remove the identifier-style columns from both train and test.
# The test ids are stashed in y_id first, because the submission cell needs them.
unwanted_cols = ['id', 'CustomerId', 'Surname']
y_id = test_data['id']
train_data = train_data.drop(columns=unwanted_cols)
test_data = test_data.drop(columns=unwanted_cols)

In [None]:
# Count duplicated rows, i.e. rows that repeat an earlier row in every column
train_data.duplicated().sum()


In [None]:
# Drop the duplicated rows (pandas keeps the first occurrence by default),
# then re-count to confirm that none remain.
train_data=train_data.drop_duplicates()
train_data.duplicated().sum()

In [None]:
# Split the training columns by dtype: numeric columns on one side,
# everything else (treated as categorical) on the other.
num_train_data = train_data.select_dtypes(include='number')
cat_train_data = train_data.select_dtypes(exclude='number')

In [None]:
# Pairwise correlation matrix of the numeric columns
# (DataFrame.corr() uses Pearson correlation by default)
corr = num_train_data.corr()
corr


# Data Preprocessing

In [None]:
# Integer-encode every categorical column in train and test.
# The encoder is re-fitted for each column on the training values only, and
# the resulting mapping is then applied to the matching test column.

from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
for cat_col in cat_train_data.columns:
    train_data[cat_col] = labelencoder.fit_transform(train_data[cat_col])
    test_data[cat_col] = labelencoder.transform(test_data[cat_col])
train_data.head()

# Feature engineering

In [None]:
# Create a new column: balance_per_salary = Balance / EstimatedSalary.
# Each frame is divided by its OWN EstimatedSalary. Dividing the test balances
# by the training salaries would align the two Series on index labels, pairing
# unrelated customers and producing NaN wherever a test label is absent from
# train_data (e.g. rows removed as duplicates).
train_data['balance_per_salary'] = train_data['Balance'] / train_data['EstimatedSalary']
test_data['balance_per_salary'] = test_data['Balance'] / test_data['EstimatedSalary']
# Balance and EstimatedSalary are kept as features alongside the new ratio.
test_data.head()

In [None]:
# Min-max scale every feature column (all columns except the target) in both
# train and test. The scaler is fitted on the training column only; the test
# column is transformed with that same fit.

from sklearn.preprocessing import MinMaxScaler
maxminscaler = MinMaxScaler()

feature_cols = [name for name in train_data.columns if name != 'Exited']
for feature in feature_cols:
    train_data[feature] = maxminscaler.fit_transform(train_data[[feature]])
    test_data[feature] = maxminscaler.transform(test_data[[feature]])

In [None]:
# Preview the first rows of the test frame
# (swap in train_data.head() to inspect the training frame instead)
test_data.head()

# Model Selection

In [None]:
# Hold out 33% of the labelled data for evaluation (fixed seed for repeatability)

from sklearn.model_selection import train_test_split
x = train_data.drop('Exited', axis=1)
y = train_data['Exited']
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.33, random_state=72
)
print(train_x.shape)
test_x.shape

In [None]:
# We will train two models: logistic regression and XGBoost

In [None]:
# Baseline: logistic regression with scikit-learn's default settings,
# fitted on the training split and used to predict the held-out split.
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression().fit(train_x, train_y)
logreg_ypred = logreg.predict(test_x)



In [None]:
# XGBoost classifier with default hyperparameters, fitted on the training
# split and used to predict the held-out split.

from xgboost import XGBClassifier
xgb = XGBClassifier().fit(train_x, train_y)
xgb_ypred = xgb.predict(test_x)


# Model Evaluation

In [None]:
# F1 score of the logistic regression predictions on the held-out split

from sklearn.metrics import f1_score
f1_score(test_y,logreg_ypred)


In [None]:
# F1 score of the XGBoost predictions on the held-out split

# f1_score is already in scope from the logistic-regression evaluation cell
f1_score(test_y,xgb_ypred)

# Hyperparameter Tuning

In [None]:
# Tune n_estimators, max_depth and learning_rate for the XGBoost model with an
# exhaustive grid search and 5-fold cross-validation. No `scoring` is passed,
# so candidates are ranked by the estimator's own score method.

from sklearn.model_selection import GridSearchCV
paramdict = {
    'n_estimators': [50, 100],
    'max_depth': [2, 5],
    'learning_rate': [0.1, 0.01],
}
gridsearch = GridSearchCV(xgb, paramdict, cv=5)
gridsearch.fit(train_x, train_y)
xgb_gsv_ypred = gridsearch.predict(test_x)


In [None]:
# F1 score of the grid-searched model's predictions on the held-out split
f1_score(test_y,xgb_gsv_ypred)


# Performance Report

In [None]:
# Confusion matrix of the grid-searched model on the held-out split.
# Rows are the true classes and columns the predicted classes; the matrix is
# returned as an array (displayed as the cell output), not plotted.

from sklearn.metrics import confusion_matrix
confusion_matrix(test_y,xgb_gsv_ypred)


In [None]:
 # ROC curve of the grid-searched model on the held-out split

from sklearn.metrics import roc_curve, auc

# roc_curve sweeps a decision threshold over continuous scores. Hard 0/1
# predictions leave only one operating point, so the "curve" and its AUC are
# misleading. Use the predicted probability of the positive class instead
# (column 1 of predict_proba, assuming the labels are 0/1 with 1 = Exited).
xgb_gsv_yscore = gridsearch.predict_proba(test_x)[:, 1]
fpr, tpr, thresholds = roc_curve(test_y, xgb_gsv_yscore)

# Compute Area Under the ROC Curve (AUC)
roc_auc = auc(fpr, tpr)

# Plot ROC curve against the chance diagonal
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Submission

In [None]:
# Make predictions on the competition test set with the grid-searched model.
# The competition is scored on ROC AUC over predicted probabilities, so keep
# P(Exited = 1) (column 1 of predict_proba, assuming 0/1 labels) rather than
# thresholded 0/1 labels, which discard the ranking information AUC relies on.
y_pred = gridsearch.predict_proba(test_data)[:, 1]
y_pred

In [None]:
# Build the submission frame: the predictions go in `Exited`, and the test ids
# saved before the id column was dropped are attached as `id`.
y_pred = pd.DataFrame(y_pred, columns=['Exited']).assign(id=y_id)
y_pred

In [None]:
# Write the submission frame to submission.csv in the working directory,
# leaving out the DataFrame index so only the data columns are saved
y_pred.to_csv('submission.csv',index=False)
