In [None]:
import numpy as np
import pandas as pd

In [None]:
import random
import gc
from sklearn.preprocessing import RobustScaler,StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve,RocCurveDisplay,ConfusionMatrixDisplay,confusion_matrix,roc_auc_score,accuracy_score

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn import preprocessing
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the competition files; the 'id' column is discarded from both feature
# tables as soon as they are read.
train = pd.read_csv('/kaggle/input/tabular-playground-series-nov-2021/train.csv').drop(columns=['id'])
test = pd.read_csv('/kaggle/input/tabular-playground-series-nov-2021/test.csv').drop(columns=['id'])
sub = pd.read_csv('/kaggle/input/tabular-playground-series-nov-2021/sample_submission.csv')

# Label vector for modelling. Only 'id' was dropped, so `train` itself still
# carries the 'target' column.
y = train['target']


In [None]:
# Preview the first five training rows.
train.head(n=5)

In [None]:
# Preview the first five test rows.
test.head(n=5)

In [None]:
# Names of the 100 feature columns: 'f0', 'f1', ..., 'f99'.
cols = [f'f{idx}' for idx in range(100)]

In [None]:
# plot the first 24 features 
#i = 1
#plt.figure()
#fig, ax = plt.subplots(6, 4,figsize=(20, 22))
#for feature in cols[:24]:
#    plt.subplot(6, 4,i)
#    sns.histplot(train[feature],color="blue", kde=True,bins=100, label='train_'+feature)
#    sns.histplot(test[feature],color="olive", kde=True,bins=100, label='test_'+feature)
#    plt.xlabel(feature, fontsize=9); plt.legend()
#    i += 1
#plt.show()

In [None]:
#sns.catplot(x="target", kind="count", palette="ch:.25", data=train)

In [None]:
## Target distribution
#pie, ax = plt.subplots(figsize=[18,8])
#train.groupby('target').size().plot(kind='pie',autopct='%.1f',ax=ax,title='Target distribution')

# Modeling

In [None]:
# Standardise the feature columns: the scaler learns its statistics from the
# training rows and the same fitted scaler is then applied to train and test.
# NOTE(review): the fit uses every row of `train` before the cross-validation
# split, so validation folds contribute to the scaling statistics - confirm
# this is acceptable.
scaler = StandardScaler()
scaler.fit(train[cols])
train[cols] = scaler.transform(train[cols])
test[cols] = scaler.transform(test[cols])

In [None]:
# 10-fold stratified cross-validation of an L1-penalised logistic regression.
# Every fold's model also scores the test set; those probabilities are averaged
# over the folds into `preds`. After the loop, `model`, `X_val` and `y_val`
# refer to the LAST fold only.
SEED = 4042  # shared by the fold splitter and the stochastic 'saga' solver

preds = np.zeros(test.shape[0])
kf = StratifiedKFold(n_splits=10, random_state=SEED, shuffle=True)

# Per-fold validation metrics, appended in fold order.
auc = []
acc = []

# Slice the feature columns once instead of re-slicing them on every fold.
X_all = train[cols]
X_test = test[cols]

for fold, (trn_idx, val_idx) in enumerate(kf.split(X_all, y), start=1):
    X_tr, X_val = X_all.iloc[trn_idx], X_all.iloc[val_idx]
    y_tr, y_val = y.iloc[trn_idx], y.iloc[val_idx]

    # 'saga' shuffles the data internally; fixing random_state makes the fold
    # metrics and the final submission repeatable from run to run.
    model = LogisticRegression(solver='saga', penalty='l1', max_iter=200, C=0.5,
                               random_state=SEED)
    model.fit(X_tr, y_tr)

    # Each fold contributes an equal share of the averaged test prediction.
    preds += model.predict_proba(X_test)[:, 1] / kf.n_splits

    auc.append(roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))
    acc.append(accuracy_score(y_val, model.predict(X_val)))

    print(f"fold: {fold} , accuracy: {round(acc[-1]*100,3)} , auc: {round(auc[-1]*100,3)}")

In [None]:
# Summarise cross-validation: mean of the per-fold scores, shown as percentages.
mean_auc = round(np.mean(auc)*100, 2)
mean_acc = round(np.mean(acc)*100, 2)
print(f"the mean AUC is : {mean_auc} while the mean Accuracy is : {mean_acc} ")

In [None]:
# Relative importance of each feature, taken from the absolute coefficients of
# `model` (the estimator left over from the last cross-validation fold) and
# rescaled so that the largest coefficient maps to 100.
feature_importance = np.abs(model.coef_[0])
max_importance = feature_importance.max()
if max_importance > 0:
    # Skip the rescale when every coefficient is zero (possible under an L1
    # penalty) to avoid a 0/0 division that would fill the plot with NaN.
    feature_importance = 100.0 * (feature_importance / max_importance)

# argsort is ascending, so the LAST 30 indices are the 30 largest importances.
# barh draws from the bottom up, which places the most important feature on top.
sorted_idx = np.argsort(feature_importance)[-30:]
pos = np.arange(sorted_idx.shape[0]) + .5

featfig = plt.figure()
featax = featfig.add_subplot(1, 1, 1)
featax.barh(pos, feature_importance[sorted_idx], align='center')
featax.set_yticks(pos)
# Label with `cols`, the exact column list the model was fitted on, so the
# coefficient positions and the names are guaranteed to line up.
featax.set_yticklabels(np.array(cols)[sorted_idx], fontsize=14)
featax.set_xlabel('Relative Feature Importance')

In [None]:
# Plot of confusion matrix for the last fold
# confusion_matrix takes (y_true, y_pred): rows are the true labels and columns
# the predicted labels, which is how ConfusionMatrixDisplay labels its axes.
cm = confusion_matrix(y_val, model.predict(X_val))
cm_display = ConfusionMatrixDisplay(cm).plot()

In [None]:
# ROC curve for the last cross-validation fold (`model`, `X_val` and `y_val`
# are the objects left over from the final loop iteration).
y_pred_proba = model.predict_proba(X_val)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_val, y_pred_proba)

# Stored under its own name so the per-fold `auc` list built during
# cross-validation is not overwritten by a single scalar.
fold_auc = metrics.roc_auc_score(y_val, y_pred_proba)

# The last fold's number equals the number of splits.
plt.plot(fpr, tpr, label=f"data, auc for fold {kf.n_splits}={round(fold_auc*100,2)}")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc=4)
plt.show()

In [None]:
# Put the fold-averaged test probabilities into the sample submission's
# 'target' column and write the result to disk without the index.
sub = sub.assign(target=preds)
sub.to_csv('submission.csv', index=False)