In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# To impute missing Values
from sklearn.impute import SimpleImputer

In [None]:
# Read the training and test tables from the relative ../input directory.
train = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/train.csv'
)
test = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/test.csv'
)

In [None]:
# Modeling inputs: the feature frame (identifier and target removed), the
# target as a flat array, and the test features (identifier removed).
X = train.drop(['id', 'claim'], axis=1).copy()
y = train['claim'].to_numpy().flatten()

test_data = test.drop('id', axis=1).copy()

In [None]:
# Show the dimensions of the target array as the cell output.
y.shape

In [None]:
# feature Engineering
def get_stats_per_row(data):
    """Append per-row summary statistics to ``data`` and return it.

    Three columns are added (or overwritten if they already exist):

    * ``mv_row``  - number of missing values in the row
    * ``min_row`` - smallest value in the row
    * ``std_row`` - standard deviation of the row

    All three are computed from the feature columns only. Any ``mv_row``,
    ``min_row`` or ``std_row`` column already present is excluded from the
    calculation, so the statistics never feed into each other and calling the
    function again on its own output yields the same values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame of features. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the three statistic columns set.
    """
    stat_names = ('mv_row', 'min_row', 'std_row')
    feature_cols = [col for col in data.columns if col not in stat_names]
    # Only take a column subset when stat columns are already present; the
    # common first-call case then avoids copying the whole frame.
    base = data if len(feature_cols) == data.shape[1] else data[feature_cols]

    # Compute everything before assigning anything, so a freshly added
    # statistic can never leak into the next one.
    missing_per_row = base.isna().sum(axis=1)
    min_per_row = base.min(axis=1)
    std_per_row = base.std(axis=1)

    data['mv_row'] = missing_per_row
    data['min_row'] = min_per_row
    data['std_row'] = std_per_row
    return data
# Add the row-level statistic columns to both the training and the test features.
X = get_stats_per_row(X)
test_data = get_stats_per_row(test_data)

In [None]:
# create preprocessing pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Two-step preprocessing: median imputation followed by standardization.
pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

# Fit on the training features and transform them, then apply the already
# fitted pipeline to the test features. Both results are wrapped back into
# DataFrames that carry the original column labels.
X = pd.DataFrame(pipeline.fit_transform(X), columns=X.columns)
test_data = pd.DataFrame(pipeline.transform(test_data), columns=test_data.columns)

In [None]:
#feature selection

from sklearn.feature_selection import SelectFromModel

# Keep the column labels so the selector's boolean mask can later be mapped back to names.
col_names = X.columns

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier with default arguments; passed to SelectFromModel as its estimator below.
model = LGBMClassifier()

In [None]:
# Fit the selector on the full training frame and target, then reduce the
# frame to the columns the selector keeps.
sfm = SelectFromModel(estimator=model)
X_transformed = sfm.fit(X, y).transform(X)

X_transformed

In [None]:
# Boolean mask over the input columns; True marks a column the selector kept.
support = sfm.get_support()
# Names of the kept columns. The loop variables are named so they do not echo
# the target array `y`, and the mask value is used directly as the condition.
features_selected = [name for name, keep in zip(col_names, support) if keep]

features_selected

In [None]:
# Restrict the training frame to the chosen feature columns.
# NOTE(review): this list looks like a pasted copy of `features_selected` - confirm it is still current.
X = X.loc[:, [
    'f1', 'f2', 'f3', 'f5', 'f6', 'f7', 'f8', 'f9',
    'f12', 'f16', 'f21', 'f24', 'f25', 'f28',
    'f31', 'f32', 'f34', 'f35', 'f36', 'f38',
    'f40', 'f45', 'f47', 'f48',
    'f50', 'f52', 'f57',
    'f61', 'f62', 'f65',
    'f70', 'f71', 'f77', 'f78', 'f79',
    'f86',
    'f92', 'f95', 'f96', 'f99',
    'f102', 'f106', 'f107',
    'mv_row',
]]

In [None]:
# Restrict the preprocessed test features to the same chosen columns.
# Note that this rebinds `test`, which previously held the raw test table.
test = test_data.loc[:, [
    'f1', 'f2', 'f3', 'f5', 'f6', 'f7', 'f8', 'f9',
    'f12', 'f16', 'f21', 'f24', 'f25', 'f28',
    'f31', 'f32', 'f34', 'f35', 'f36', 'f38',
    'f40', 'f45', 'f47', 'f48',
    'f50', 'f52', 'f57',
    'f61', 'f62', 'f65',
    'f70', 'f71', 'f77', 'f78', 'f79',
    'f86',
    'f92', 'f95', 'f96', 'f99',
    'f102', 'f106', 'f107',
    'mv_row',
]]

In [None]:
# Set the default figure size and switch to the 'fivethirtyeight' style for the charts below.
plt.rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')

# for Interactive Shells
from IPython.display import display

In [None]:
# Checking for outliers: one box per feature, spread over four stacked panels.
fig, ax = plt.subplots(4, 1, figsize=(25, 25))
# The first panel starts at column 0 so the first feature is not left out.
panel_columns = [slice(0, 10), slice(10, 20), slice(20, 30), slice(30, 45)]
for panel, columns in zip(ax, panel_columns):
    sns.boxplot(data=X.iloc[:, columns], ax=panel)


In [None]:
# Handling outliers: cap every feature at its own 10th and 90th percentiles.
# np.percentile without `axis` pools all columns into one flattened array,
# which would apply a single pair of bounds to every feature; quantile()
# instead returns one bound per column.
tenth_percentile = X.quantile(0.10)
ninetieth_percentile = X.quantile(0.90)
# axis=1 aligns the per-column bounds with the frame's column labels.
b = X.clip(lower=tenth_percentile, upper=ninetieth_percentile, axis=1).to_numpy()
print("New array:", b)

In [None]:
# Wrap the clipped array `b` back into a DataFrame, reusing X's column labels.
X = pd.DataFrame(data = b, columns= X.columns)

In [None]:
# Same four-panel box plot after clipping, to confirm the outliers are gone.
fig, ax = plt.subplots(4, 1, figsize=(25, 25))
# The first panel starts at column 0 so the first feature is not left out.
panel_columns = [slice(0, 10), slice(10, 20), slice(20, 30), slice(30, 45)]
for panel, columns in zip(ax, panel_columns):
    sns.boxplot(data=X.iloc[:, columns], ax=panel)

In [None]:
# Same outlier check for the test features: one box per feature in four panels.
fig, ax = plt.subplots(4, 1, figsize=(25, 25))
# The first panel starts at column 0 so the first feature is not left out.
panel_columns = [slice(0, 10), slice(10, 20), slice(20, 30), slice(30, 45)]
for panel, columns in zip(ax, panel_columns):
    sns.boxplot(data=test.iloc[:, columns], ax=panel)

In [None]:
# Cap the test features with the bounds that were computed from the training
# features (`tenth_percentile` / `ninetieth_percentile` from the training
# outlier cell) instead of re-estimating them on the test data, so both sets
# go through the same transformation. clip() accepts the bounds either as
# scalars or as per-column Series; axis=1 aligns a Series with the columns.
c = test.clip(lower=tenth_percentile, upper=ninetieth_percentile, axis=1).to_numpy()
print("New array:", c)

In [None]:
# Wrap the clipped array `c` back into a DataFrame, reusing the test frame's column labels.
test = pd.DataFrame(data = c, columns= test.columns)

In [None]:
# Four-panel box plot of the test features after clipping.
fig, ax = plt.subplots(4, 1, figsize=(25, 25))
# The first panel starts at column 0 so the first feature is not left out.
panel_columns = [slice(0, 10), slice(10, 20), slice(20, 30), slice(30, 45)]
for panel, columns in zip(ax, panel_columns):
    sns.boxplot(data=test.iloc[:, columns], ax=panel)

In [None]:
# Distribution of each feature in X: one 50-bin histogram per column.

ncols = 4
n_features = X.shape[1]
# Ceiling division: enough rows to give every column its own panel.
nrows = max(1, -(-n_features // ncols))
# Height scales with the row count (75 for the 11 rows that 44 columns need).
# squeeze=False keeps `ax` two-dimensional even when there is a single row.
fig, ax = plt.subplots(nrows, ncols, figsize=(25, 75 * nrows / 11), squeeze=False)
for i, panel in enumerate(ax.flat):
    if i < n_features:
        sns.histplot(data=X.iloc[:, i], bins=50, ax=panel, palette='Set1').set(ylabel='')
    else:
        # Hide the unused panels at the end of the last row.
        panel.set_visible(False)

In [None]:
######################################################################
#### SPLITTING  TWICE! Here I will create TRAIN | VALIDATION | TEST  #########
####################################################################
from sklearn.model_selection import train_test_split

# First cut: hold back 30% of the rows, leaving 70% for fitting.
X_train, X_OTHER, y_train, y_OTHER = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

# Second cut: divide the held-back rows evenly into an evaluation set and a
# test set, each 15% of the original data.
X_eval, X_test, y_eval, y_test = train_test_split(
    X_OTHER,
    y_OTHER,
    test_size=0.5,
    random_state=101,
)

In [None]:
from sklearn.metrics import roc_curve, auc
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Fresh LightGBM classifier with default arguments, fitted on the training split.
model = LGBMClassifier()
model.fit(X_train, y_train)

In [None]:
# Predicted class probabilities for the evaluation split; column 1 is used as the score below.
y_eval_pred = model.predict_proba(X_eval)

In [None]:
# Display the predicted probabilities as the cell output.
y_eval_pred

In [None]:
#calculation roc_auc_score
from sklearn.metrics import roc_auc_score

In [None]:
# ROC AUC on the evaluation split, scored with column 1 of the predicted probabilities.
print(roc_auc_score(y_eval,y_eval_pred[:,1]))

In [None]:
#calculating auc score
from sklearn.metrics import auc, roc_curve

# ROC curve points for the evaluation split, using column 1 of the predicted
# probabilities and label 1 as the positive class; then the area under that curve.
fpr, tpr, thresholds = roc_curve(y_eval, y_eval_pred[:,1], pos_label = 1)
auc(fpr, tpr)

In [None]:
# Score the second hold-out split (X_test), which the model was not fitted on.
# NOTE(review): these rows were still part of X when the imputer/scaler and the
# feature selector were fitted earlier, so they are held out from the model fit
# only, not from preprocessing or feature selection.
y_final_test_pred = model.predict_proba(X_test)

In [None]:
# ROC AUC on the second hold-out split, scored with column 1 of the predicted probabilities.
print(roc_auc_score(y_test,y_final_test_pred[:,1]))

In [None]:
# ROC curve built from `fpr` / `tpr`, which were computed on the evaluation split.
roc_fig, roc_ax = plt.subplots(1, figsize=(10, 10))
roc_ax.set_title('Receiver Operating Characteristic - LGBMClassifier')
roc_ax.plot(fpr, tpr)
# Reference lines, written with explicit x and y coordinates:
roc_ax.plot([0, 1], [0, 1], ls="--")   # chance diagonal
roc_ax.plot([0, 0], [1, 0], c=".7")    # left edge (FPR = 0)
roc_ax.plot([0, 1], [1, 1], c=".7")    # top edge (TPR = 1)
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_xlabel('False Positive Rate')
plt.show()

In [None]:
# Only the identifier column is needed from this re-read (it labels the
# submission rows), so skip parsing the feature columns of the test file.
test11 = pd.read_csv(
    '../input/tabular-playground-series-sep-2021/test.csv',
    usecols=['id'],
)

In [None]:
# Features to score for the submission: the column-selected, clipped test frame.
test_feat = test

# Row identifiers taken from the re-read test file.
test_id = test11['id']

In [None]:
# Predicted class probabilities for the submission rows; column 1 is written to the file below.
pred = model.predict_proba(test_feat) 

In [None]:
# Assemble the submission table (row id plus the column-1 probability),
# write it without the index column, and preview the first rows.
submission = pd.DataFrame(
    {
        'id': test_id,
        'claim': pred[:, 1],
    }
)
submission.to_csv('submission.csv', index=False)
submission.head()