# Packages

In [None]:
!pip install optbinning

In [None]:
# Data wrangling
import pandas as pd 
import numpy as np 

# Preprocessing 
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.neighbors import LocalOutlierFactor
from optbinning import OptimalBinning

# Model 
from sklearn.linear_model import LogisticRegression

# Evaluation metrics
from sklearn.metrics import roc_auc_score 

# Import Data

In [None]:
# Load the training CSV into a DataFrame and preview its first rows.
train_csv_path = "../input/tabular-playground-series-sep-2021/train.csv"
df_train = pd.read_csv(train_csv_path)
df_train.head()

In [None]:
# Inspect data: display pandas' descriptive summary of the training frame.
df_train.describe()

# Data Preprocessing

In [None]:
# Split the frame into the feature matrix and the label column.
# Both "claim" (the label) and "id" are excluded from the features.
X = df_train.drop(["claim", "id"], axis=1)
y = df_train["claim"]

# Hold out 30% of the rows for evaluation; the seed is fixed so the split
# is reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split_parts

## Weight of Evidence and Information Value


In [None]:
# Encode every feature as Weight of Evidence (WoE).
# The optimal bins are fitted on the training split only and then applied to
# both the training and the hold-out split.
X_cols = X_train.columns.to_list()

# Collect the encoded columns in dicts and build each DataFrame once after the
# loop. Calling pd.concat inside the loop re-copies every previously encoded
# column on each iteration, which gets expensive with many columns and rows.
train_encoded = {}
holdout_encoded = {}

for col in X_cols:
    # Configuring optimal bins
    opt_bin = OptimalBinning(name=col, dtype="numerical")
    opt_bin.fit(X_train[col].values, y_train)

    # Optbin transforming
    train_encoded[col] = opt_bin.transform(X_train[col], metric="woe")
    holdout_encoded[col] = opt_bin.transform(X_test[col], metric="woe")

# `columns=X_cols` pins the column order to the order of X_train's columns.
X_train_woe = pd.DataFrame(train_encoded, columns=X_cols)
X_test_woe = pd.DataFrame(holdout_encoded, columns=X_cols)
X_train_woe.head()

In [None]:
# Preview the first rows of the WoE-encoded hold-out features.
X_test_woe.head()

# Model Development

In [None]:
# Initial Model
model = LogisticRegression(random_state=0, solver="liblinear")
model.fit(X_train_woe, y_train)

# ROC AUC measures how well the model ranks samples, so it must be computed
# on continuous scores. Thresholded 0/1 labels from `predict` collapse the
# ROC curve to a single operating point and misstate the score.
# Column 1 of predict_proba is the probability of model.classes_[1]
# (assumed to be claim == 1 - confirm the label encoding).
predictions = model.predict_proba(X_test_woe)[:, 1]
roc_score = roc_auc_score(y_test, predictions)
print("ROC Score: {0:0.3f}".format(roc_score))

In [None]:
# 1st Model Improvement: L1-penalised logistic regression
model_1 = LogisticRegression(penalty="l1", solver="liblinear", random_state=0)
model_1.fit(X_train_woe, y_train)

# Score ROC AUC on predicted probabilities rather than on hard 0/1 labels;
# AUC is a ranking metric and needs continuous scores.
# Column 1 of predict_proba is the probability of model_1.classes_[1]
# (assumed to be claim == 1 - confirm the label encoding).
predictions_1 = model_1.predict_proba(X_test_woe)[:, 1]
roc_score_1 = roc_auc_score(y_test, predictions_1)
print("ROC Score: {0:0.3f}".format(roc_score_1))

In [None]:
# 2nd model improvement: elastic-net logistic regression, sweeping the L1
# mixing ratio over 0.1, 0.2, ..., 1.0.
for step in range(1, 11):
    # Derive the ratio from an integer step instead of reassigning the loop
    # variable.
    l1_ratio = step / 10
    model_2 = LogisticRegression(penalty="elasticnet", solver="saga", l1_ratio=l1_ratio, random_state=0)
    model_2.fit(X_train_woe, y_train)

    # Score ROC AUC on predicted probabilities rather than on hard 0/1
    # labels; AUC is a ranking metric and needs continuous scores.
    # Column 1 of predict_proba is the probability of model_2.classes_[1]
    # (assumed to be claim == 1 - confirm the label encoding).
    predictions_2 = model_2.predict_proba(X_test_woe)[:, 1]
    roc_score_2 = roc_auc_score(y_test, predictions_2)
    print("L1: {0}, ROC Score: {1:0.3f}".format(l1_ratio, roc_score_2))

## Submission

### Preprocessing

In [None]:
# Load the test CSV used for the submission and preview its first rows.
test_csv_path = "../input/tabular-playground-series-sep-2021/test.csv"
test_data = pd.read_csv(test_csv_path)
test_data.head()

#### Weight of Evidence and Information Value

In [None]:
# WoE-encode the submission features with bins fitted on the training split.
# "id" is moved from the columns into the index before encoding.
test = test_data.set_index("id")
X_cols = X_train.columns.to_list()

# Collect the encoded columns in a dict and build the DataFrame once after the
# loop. Calling pd.concat inside the loop re-copies every previously encoded
# column on each iteration.
submission_encoded = {}

for col in X_cols:
    # Configuring optimal bins (fitted on the training split, not on `test`)
    opt_bin = OptimalBinning(name=col, dtype="numerical")
    opt_bin.fit(X_train[col].values, y_train)

    # Optbin transforming
    submission_encoded[col] = opt_bin.transform(test[col], metric="woe")

# `columns=X_cols` pins the column order to the order of X_train's columns,
# which is the order the models are fitted on.
test_woe = pd.DataFrame(submission_encoded, columns=X_cols)

#### Model Development 

In [None]:
# Final model for the submission: L1-penalised logistic regression fitted on
# the WoE-encoded training split.
LR_model = LogisticRegression(penalty="l1", solver="liblinear", random_state=0)
LR_model.fit(X_train_woe, y_train)

# Emit the predicted probability instead of a hard 0/1 label. The notebook
# evaluates its models with ROC AUC, which ranks continuous scores, and
# thresholded labels discard that ranking information.
# Column 1 of predict_proba is the probability of LR_model.classes_[1]
# (assumed to be claim == 1 - confirm the label encoding).
LR_predictions = LR_model.predict_proba(test_woe)[:, 1]

In [None]:
# Export predictions: pair every test "id" with its predicted "claim" value
# and write the result to the submission file without the row index.
output = test_data[["id"]].assign(claim=LR_predictions)
output.to_csv("submission.csv", index=False)