# Packages

In [None]:
# Data wrangling
import pandas as pd 
import numpy as np 

# Preprocessing 
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.neighbors import LocalOutlierFactor

# Model 
from sklearn.tree import DecisionTreeClassifier

# Evaluation metrics
from sklearn.metrics import roc_auc_score 

# Import data

In [None]:
# Load the training set from the `../input` directory.
df_train = pd.read_csv(
    filepath_or_buffer="../input/tabular-playground-series-sep-2021/train.csv"
)
# Preview the first rows (rendered as the cell output).
df_train.head()

In [None]:
# Summary statistics of the training frame's columns, rendered as the cell output
# (useful for spotting scale differences and, via `count`, missing values).
df_train.describe()

# Preprocessing

In [None]:
# Explanatory variables: every column except the target ("claim") and the row
# identifier ("id").
X = df_train.drop(["claim", "id"], axis="columns")
# Target variable.
y = df_train.loc[:, "claim"]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Fill missing values with SimpleImputer's default strategy. The imputer is
# fitted on the training split only, so no statistics leak from the hold-out
# split; the same fitted imputer is then applied to both splits.
imputer = SimpleImputer().fit(X_train)
# `transform` returns a bare array, so re-wrap it with the original labels.
X_train = pd.DataFrame(
    imputer.transform(X_train),
    index=X_train.index,
    columns=X.columns,
)
X_test = pd.DataFrame(
    imputer.transform(X_test),
    index=X_test.index,
    columns=X.columns,
)

In [None]:
# Flag outliers in the (imputed) training features with Local Outlier Factor.
# `fit_predict` labels each row: -1 marks an outlier.
lof = LocalOutlierFactor(n_neighbors=5)
yhat = pd.DataFrame(
    {"outliers_d": lof.fit_predict(X_train)},
    index=X_train.index,
)
# Collect the index labels of the rows flagged as outliers.
outliers_index = yhat.index[yhat["outliers_d"].eq(-1).to_numpy()]
# Display the flagged labels as the cell output.
outliers_index

In [None]:
# Removal of outliers
# Drop the rows flagged by LOF from both the features and the target so the two
# stay aligned. The frames are rebound instead of mutated in place, and
# `errors="ignore"` makes the cell safe to re-run: labels that were already
# removed are skipped instead of raising a KeyError.
X_train = X_train.drop(index=outliers_index, errors="ignore")
y_train = y_train.drop(index=outliers_index, errors="ignore")

# Model Development

In [None]:
# Initial model: an unconstrained decision tree used as the baseline.
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)
# ROC AUC measures how well the scores *rank* the samples, so it needs a
# continuous score rather than hard class labels. Column 1 of `predict_proba`
# is the probability of the second entry of `model.classes_` (the positive
# class, assuming a 0/1 target -- TODO confirm).
predictions = model.predict_proba(X_test)[:, 1]
score = roc_auc_score(y_test, predictions)
print(f"Score: {score:0.5f}")

In [None]:
# 1st model improvement: sweep the tree depth from 1 to 10 and report the
# hold-out ROC AUC for each value.
# NOTE(review): the sweep is scored on X_test, the same split used to report
# the model's performance, so the chosen depth is tuned to that split. Consider
# a separate validation split or cross-validation.
for max_depth in range(1, 11):
    model_1 = DecisionTreeClassifier(max_depth=max_depth, random_state=0)
    model_1.fit(X_train, y_train)
    # Score with the positive-class probability (column 1 of `predict_proba`):
    # ROC AUC on hard labels collapses the curve to a single operating point.
    predictions_1 = model_1.predict_proba(X_test)[:, 1]
    score_1 = roc_auc_score(y_test, predictions_1)
    print(f"max_depth: {max_depth}, score: {score_1:0.5f}")

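Result: `max_depth = 10` gives the best score among the depths tried (1–10). Because 10 is the upper end of the sweep, deeper trees were not evaluated.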

In [None]:
# 2nd model improvement: with the depth fixed at 10, sweep the minimum number
# of samples per leaf from 1 to 10 and report the hold-out ROC AUC for each.
for min_samples_leaf in range(1, 11):
    model_2 = DecisionTreeClassifier(
        min_samples_leaf=min_samples_leaf,
        max_depth=10,
        random_state=0,
    )
    model_2.fit(X_train, y_train)
    # Score with the positive-class probability (column 1 of `predict_proba`):
    # ROC AUC on hard labels collapses the curve to a single operating point.
    predictions_2 = model_2.predict_proba(X_test)[:, 1]
    score_2 = roc_auc_score(y_test, predictions_2)
    print(f"min_samples_leaf: {min_samples_leaf}, score: {score_2:0.5f}")

Result: `min_samples_leaf = 5` gives the best score among the values tried (1–10), with `max_depth` fixed at 10.

# Submission

## Preprocessing

In [None]:
# Load the competition test set (the rows to predict for the submission).
test_data = pd.read_csv(
    filepath_or_buffer="../input/tabular-playground-series-sep-2021/test.csv"
)
# Preview the first rows (rendered as the cell output).
test_data.head()

In [None]:
# Build the test feature matrix: drop the row identifier, then fill missing
# values with the imputer that was already fitted on the training split
# (transform only -- the imputer is not refitted on the test data).
test = test_data.drop("id", axis="columns")
test = pd.DataFrame(
    imputer.transform(test),
    index=test.index,
    columns=test.columns,
)

## Predictions

In [None]:
# Model development: final tree with the hyperparameters selected above,
# trained on the cleaned training split.
DTC_model = DecisionTreeClassifier(min_samples_leaf=5, max_depth=10, random_state=0)
DTC_model.fit(X_train, y_train)
# The notebook's evaluation metric is ROC AUC, which ranks samples by score, so
# predict the positive-class probability (column 1 of `predict_proba`, i.e. the
# second entry of `DTC_model.classes_`) instead of hard 0/1 labels, which would
# discard the ranking information.
DTC_predictions = DTC_model.predict_proba(test)[:, 1]

In [None]:
# Export results: pair each test row's id with its prediction and write the
# two-column frame to disk without the pandas index.
output = test_data[["id"]].assign(claim=DTC_predictions)
output.to_csv("submission.csv", index=False)