In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# To impute missing Values
from sklearn.impute import SimpleImputer

In [None]:
# Load the competition's train and test tables from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
    )
)

In [None]:
# Separate the `claim` target from the training table; the `id` column is
# dropped from both feature tables. Copies keep `train` / `test` untouched.
y = train['claim'].copy()
X = train.drop(['id', 'claim'], axis=1).copy()

test_data = test.drop('id', axis=1).copy()

In [None]:
# feature Engineering
def get_stats_per_row(data):
    """Add per-row summary columns computed from the original columns only.

    Three columns are written into ``data`` (the frame is modified in place
    and also returned):

    * ``mv_row``  - number of missing values in the row
    * ``min_row`` - smallest non-missing value in the row
    * ``std_row`` - sample standard deviation of the row's non-missing values
      (NaN when the row has fewer than two non-missing values)

    All three statistics are taken from a snapshot of the input columns that
    excludes ``mv_row`` / ``min_row`` / ``std_row`` themselves. Computing them
    directly on ``data`` while adding columns would let ``mv_row`` leak into
    the minimum, and both into the standard deviation. Excluding the derived
    names also makes the function safe to call again on its own output.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric table; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the three columns added or refreshed.
    """
    derived = ('mv_row', 'min_row', 'std_row')
    # List-based selection yields a separate frame, so adding columns to
    # `data` below cannot change what the statistics are computed from.
    source = data[[col for col in data.columns if col not in derived]]
    data['mv_row'] = source.isna().sum(axis=1)
    data['min_row'] = source.min(axis=1)
    data['std_row'] = source.std(axis=1)
    return data
# Add the row-level statistic columns to the train and test feature tables.
X, test_data = (get_stats_per_row(table) for table in (X, test_data))

In [None]:
from tqdm import tqdm

# Names of the columns of X whose first character is "f".
features = [
    column_name
    for column_name in X.columns.values
    if column_name[0] == "f"
]

In [None]:
# Imputation strategy per feature column: 'Mean', 'Median' or 'Mode'.
# The loop that follows looks each column up here to decide which statistic
# of the training column replaces its missing values.
# NOTE(review): the basis for these per-column choices is not shown in this
# notebook — confirm where they came from before changing them.
fill_value_dict = {
    'f1': 'Mean', 
    'f2': 'Mode', 
    'f3': 'Median', 
    'f4': 'Median', 
    'f5': 'Mode', 
    'f6': 'Mean', 
    'f7': 'Median', 
    'f8': 'Median', 
    'f9': 'Median', 
    'f10': 'Median', 
    'f11': 'Mode', 
    'f12': 'Median', 
    'f13': 'Mode', 
    'f14': 'Median', 
    'f15': 'Mean', 
    'f16': 'Median', 
    'f17': 'Mean', 
    'f18': 'Median', 
    'f19': 'Median', 
    'f20': 'Median', 
    'f21': 'Median', 
    'f22': 'Mean', 
    'f23': 'Mode', 
    'f24': 'Median', 
    'f25': 'Median', 
    'f26': 'Median', 
    'f27': 'Median', 
    'f28': 'Median', 
    'f29': 'Mode', 
    'f30': 'Median', 
    'f31': 'Mode', 
    'f32': 'Median', 
    'f33': 'Median', 
    'f34': 'Mean', 
    'f35': 'Median', 
    'f36': 'Mean', 
    'f37': 'Median', 
    'f38': 'Median', 
    'f39': 'Median', 
    'f40': 'Mode', 
    'f41': 'Median', 
    'f42': 'Mode', 
    'f43': 'Mean', 
    'f44': 'Median', 
    'f45': 'Median', 
    'f46': 'Mean', 
    'f47': 'Mode', 
    'f48': 'Mean', 
    'f49': 'Mode', 
    'f50': 'Mode', 
    'f51': 'Median', 
    'f52': 'Median', 
    'f53': 'Median', 
    'f54': 'Mean', 
    'f55': 'Median', 
    'f56': 'Median', 
    'f57': 'Mean', 
    'f58': 'Mode', 
    'f59': 'Median', 
    'f60': 'Median', 
    'f61': 'Mode', 
    'f62': 'Median', 
    'f63': 'Median', 
    'f64': 'Median', 
    'f65': 'Mode', 
    'f66': 'Mode', 
    'f67': 'Median', 
    'f68': 'Median', 
    'f69': 'Median', 
    'f70': 'Mode', 
    'f71': 'Median', 
    'f72': 'Median', 
    'f73': 'Median', 
    'f74': 'Mode', 
    'f75': 'Mode', 
    'f76': 'Mean', 
    'f77': 'Mode', 
    'f78': 'Median', 
    'f79': 'Mean', 
    'f80': 'Mode', 
    'f81': 'Mode', 
    'f82': 'Median', 
    'f83': 'Mode', 
    'f84': 'Mode', 
    'f85': 'Median', 
    'f86': 'Median', 
    'f87': 'Median', 
    'f88': 'Median', 
    'f89': 'Median', 
    'f90': 'Median', 
    'f91': 'Mode', 
    'f92': 'Median', 
    'f93': 'Median', 
    'f94': 'Mode', 
    'f95': 'Median', 
    'f96': 'Median', 
    'f97': 'Mean', 
    'f98': 'Median', 
    'f99': 'Median', 
    'f100': 'Mode', 
    'f101': 'Median', 
    'f102': 'Median', 
    'f103': 'Mode', 
    'f104': 'Median', 
    'f105': 'Median', 
    'f106': 'Median', 
    'f107': 'Median', 
    'f108': 'Median', 
    'f109': 'Mode', 
    'f110': 'Median', 
    'f111': 'Median', 
    'f112': 'Mode', 
    'f113': 'Median', 
    'f114': 'Median', 
    'f115': 'Median', 
    'f116': 'Mode', 
    'f117': 'Median', 
    'f118': 'Median'
}

# Impute missing values one feature column at a time. The fill value is always
# computed from the training column and then reused for the test column, so
# both tables are filled with the same statistic.
for col in tqdm(features):
    strategy = fill_value_dict.get(col)
    if strategy == 'Mean':
        fill_value = X[col].mean()
    elif strategy == 'Median':
        fill_value = X[col].median()
    elif strategy == 'Mode':
        # mode() can return several values; take the first one it lists.
        fill_value = X[col].mode().iloc[0]
    else:
        # Without this branch a column missing from fill_value_dict would be
        # filled with the previous column's value (or hit a NameError on the
        # first iteration).
        raise ValueError(
            f"No imputation strategy configured for column {col!r} "
            f"(got {strategy!r})"
        )

    # Assign the result back instead of `X[col].fillna(..., inplace=True)`:
    # the in-place call on a selected column acts on a temporary under pandas
    # Copy-on-Write and leaves the frame unfilled. The warning for it is
    # hidden by the global warnings filter at the top of this notebook.
    X[col] = X[col].fillna(fill_value)
    test_data[col] = test_data[col].fillna(fill_value)


In [None]:
from sklearn.feature_selection import SelectFromModel

# Keep the column labels of X so the feature-selection mask computed later
# can be mapped back to column names.
col_names = X.columns

In [None]:
from lightgbm import LGBMClassifier

# Default LightGBM classifier used as the importance source for selection.
model = LGBMClassifier()

# Fit the selector on the full feature table, then reduce X to the columns
# it keeps.
sfm = SelectFromModel(model)
sfm.fit(X, y)
X_transformed = sfm.transform(X)

In [None]:
# Boolean mask from the fitted selector, one entry per column of X.
support = sfm.get_support()

# Index the saved column labels with the mask. This replaces a comprehension
# whose loop variable was named `y` (the same name as the target Series) and
# which compared each mask entry with `== True`.
features_selected = col_names[support].tolist()

In [None]:
# Display the selected column names.
# NOTE(review): the hard-coded column list in the next cell presumably mirrors
# this output — confirm they still agree after any upstream change.
features_selected

In [None]:
# Columns kept for modelling, listed once so the train and test tables cannot
# drift apart (the list used to be written out twice).
# NOTE(review): presumably copied from the `features_selected` output above —
# confirm before editing.
selected_columns = [
    'f1', 'f2', 'f3', 'f5', 'f6', 'f7', 'f8', 'f9', 'f16', 'f21', 'f24', 'f25', 'f27', 'f28', 'f31',
    'f32', 'f34', 'f35', 'f36', 'f38', 'f40', 'f45', 'f47', 'f48', 'f50', 'f52', 'f53', 'f57', 'f61',
    'f62', 'f65', 'f70', 'f71', 'f75', 'f77', 'f79', 'f86', 'f92', 'f95', 'f96', 'f102', 'f103', 'f106',
    'f107', 'mv_row',
]

X_feat = X[selected_columns]

# Note: this rebinds `test`, so the raw test table read from CSV (including
# its `id` column) is no longer reachable under that name.
test = test_data[selected_columns]

X = X_feat

In [None]:
#handling outliers
def clip_to_bounds(frame, lower, upper):
    """Cap every column of ``frame`` to its own lower/upper bound.

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric table to cap.
    lower, upper : pandas.Series
        Per-column bounds, indexed by column name; they are aligned with the
        columns of ``frame`` (``axis=1``).

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``frame``; values below ``lower``
        are raised to it and values above ``upper`` are lowered to it.
    """
    return frame.clip(lower=lower, upper=upper, axis=1).to_numpy(dtype=float)


# Per-column 10th / 90th percentiles of the training features.
# np.percentile(X, 10) on the whole frame flattens every column into a single
# pool, giving one global pair of bounds that was then applied to all columns
# regardless of their scale.
tenth_percentile = X.quantile(0.10)
ninetieth_percentile = X.quantile(0.90)

b = clip_to_bounds(X, tenth_percentile, ninetieth_percentile)
X = pd.DataFrame(data=b, columns=X.columns)

# The test table is capped with the bounds learned on the training table, so
# both tables go through the same transformation.
c = clip_to_bounds(test, tenth_percentile, ninetieth_percentile)
test = pd.DataFrame(data=c, columns=test.columns)

In [None]:
######################################################################
#### SPLITTING  TWICE! Here I will create TRAIN | VALIDATION | TEST  #########
####################################################################
from sklearn.metrics import roc_curve, auc
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split

# Stage 1: hold out 30% of the rows; the remaining 70% is the training split.
X_train, X_OTHER, y_train, y_OTHER = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

# Stage 2: halve the held-out rows into an evaluation split and a test split,
# i.e. roughly 15% of the original rows each.
X_eval, X_test, y_eval, y_test = train_test_split(
    X_OTHER,
    y_OTHER,
    test_size=0.5,
    random_state=101,
)

In [None]:
# Fit a default LightGBM classifier on the training split.
model = LGBMClassifier().fit(X_train, y_train)

# Class-probability matrix for the evaluation split; the ROC-AUC cells that
# follow read column 1 of it.
y_eval_pred = model.predict_proba(X_eval)

y_eval_pred

In [None]:
#calculation roc_auc_score
from sklearn.metrics import roc_auc_score
#calculating auc score
from sklearn.metrics import auc, roc_curve

# ROC-AUC on the evaluation split, scored from column 1 of the probabilities.
eval_auc = roc_auc_score(y_eval, y_eval_pred[:, 1])
print(eval_auc)

In [None]:
#calculating auc score
from sklearn.metrics import auc, roc_curve

# ROC curve points for the evaluation split, with label 1 as the positive
# class; the area under them is displayed as the cell output.
roc_points = roc_curve(y_eval, y_eval_pred[:, 1], pos_label=1)
fpr, tpr, thresholds = roc_points
auc(fpr, tpr)

In [None]:
# Score the second held-out split (X_test / y_test), which was used neither
# to fit the model nor for the evaluation-split AUC.
y_final_test_pred = model.predict_proba(X_test)

final_test_auc = roc_auc_score(y_test, y_final_test_pred[:, 1])
print(final_test_auc)

In [None]:
# ROC curve for the evaluation split (fpr / tpr come from the roc_curve call
# on y_eval), drawn through the explicit Axes interface.
fig, ax = plt.subplots(1, figsize=(10,10))
ax.set_title('Receiver Operating Characteristic - LGBClassifier')
ax.plot(fpr, tpr)
# Dashed diagonal from (0, 0) to (1, 1).
ax.plot([0, 1], ls="--")
# Grey guide lines: the left edge (x = 0) and the top edge (y = 1).
ax.plot([0, 0], [1, 0], c=".7")
ax.plot([1, 1], c=".7")
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

In [None]:
# Re-read the test CSV to recover the `id` column: `test_data` was built
# without it and the name `test` has since been rebound to the model features.
# Only `id` is used from this frame, so only that column is parsed.
test11 = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv', usecols=['id'])

In [None]:
# Model-ready test features, and the matching ids taken from the re-read CSV.
test_feat, test_id = test, test11['id']

In [None]:
# Class-probability matrix for the competition test rows, from the current
# `model`; column 1 is what the submission cell writes out.
pred = model.predict_proba(test_feat) 

In [None]:
# Two-column submission table: the test id and column 1 of the predicted
# probabilities as `claim`.
submission_columns = {'id': test_id, 'claim': pred[:, 1]}
submission = pd.DataFrame(submission_columns)

# Write the file without the DataFrame index, then preview the first rows.
submission.to_csv('submission.csv', index=False)
submission.head()