In [None]:
import numpy as np
import optuna
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Raw competition files; both paths are relative to the notebook's working directory.
TRAIN_CSV = '../input/tabular-playground-series-apr-2021/train.csv'
TEST_CSV = '../input/tabular-playground-series-apr-2021/test.csv'

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [None]:
def label_encoder(c):
    """Encode the values of ``c`` as integer class labels.

    A fresh ``LabelEncoder`` is fitted on ``c`` every time, so the mapping from
    value to code is specific to the data passed in on that call.

    Parameters
    ----------
    c : array-like
        One column of values, e.g. a ``pandas.Series``.

    Returns
    -------
    array-like
        The result of ``LabelEncoder.fit_transform(c)``: one code per element.
    """
    lc = LabelEncoder()
    # Without this return the function only built the encoder and yielded None.
    return lc.fit_transform(c)
def age_range(age):
    """Bucket a numeric age into a named age group.

    The buckets are half-open so that fractional ages (e.g. 5.5 or 29.5) fall
    into the group of their whole-year age instead of slipping between two
    inclusive integer bounds.

    Parameters
    ----------
    age : int or float
        Age in years.

    Returns
    -------
    str
        ``"baby"`` for [0, 6), ``"children"`` for [6, 17), ``"young_adult"``
        for [17, 30), ``"middle_aged"`` for [30, 50) and ``"senior"`` for
        everything else. "Everything else" includes ages of 50 and above, and
        also negative or NaN input, because every comparison is false for those.
    """
    if 0 <= age < 6:
        return "baby"
    elif 6 <= age < 17:
        return "children"
    elif 17 <= age < 30:
        return "young_adult"
    elif 30 <= age < 50:
        return "middle_aged"
    else:
        return "senior"
def preprocess(df):
    age_map = df[['Age', 'Pclass']].dropna().groupby('Pclass').mean().to_dict()
    df.Age = df.Age.fillna(df.Pclass.map(age_map['Age']))
    df.Cabin = df.Cabin.fillna('X').map(lambda x: x[0].strip())
    df.Ticket = df.Ticket.fillna('X').map(lambda x:str(x).split()[0] if len(str(x).split()) > 1 else 'X')
    df.Fare = df.Fare.fillna(df.Fare.mean())
    df.Fare = df.Fare.map(lambda i: np.log(i) if i > 0 else 0)
    df.Embarked = df.Embarked.fillna('X')
    df.Name = df.Name.map(lambda x: x.split(',')[0])
    df["age_range"] = df["Age"].apply(age_range)
    df["Family"] = df["SibSp"] + df["Parch"] +1   
    df["Alone"] =  df["Family"].map(lambda x : "Yes" if x == 1 else "No")
    label_cols = ['Name', 'Ticket',"age_range","Alone"]
    onehot_cols = ['Pclass', 'Sex', 'Cabin', 'Embarked']
    numerical_cols = ['SibSp', 'Parch', 'Fare','Survived',"Family"]  
    onehot_encoded_df = pd.get_dummies(df[onehot_cols])
    label_encoded_df = df[label_cols].apply(label_encoder)
    numerical_df = df[numerical_cols]
    return pd.concat([numerical_df,onehot_encoded_df], axis=1)

def test_preprocess(df):
    age_map = df[['Age', 'Pclass']].dropna().groupby('Pclass').mean().to_dict()
    df.Age = df.Age.fillna(df.Pclass.map(age_map['Age']))
    df.Cabin = df.Cabin.fillna('X').map(lambda x: x[0].strip())
    df.Ticket = df.Ticket.fillna('X').map(lambda x:str(x).split()[0] if len(str(x).split()) > 1 else 'X')
    df.Fare = df.Fare.fillna(df.Fare.mean())
    df.Fare = df.Fare.map(lambda i: np.log(i) if i > 0 else 0)
    df.Embarked = df.Embarked.fillna('X')
    df.Name = df.Name.map(lambda x: x.split(',')[0])
    df["age_range"] = df["Age"].apply(age_range)
    df["Family"] = df["SibSp"] + df["Parch"] +1   
    df["Alone"] =  df["Family"].map(lambda x : "Yes" if x == 1 else "No")
    label_cols = ['Name', 'Ticket',"age_range","Alone"]
    onehot_cols = ['Pclass', 'Sex', 'Cabin', 'Embarked']
    numerical_cols = ['SibSp', 'Parch', 'Fare',"Family"]  
    onehot_encoded_df = pd.get_dummies(df[onehot_cols])
    label_encoded_df = df[label_cols].apply(label_encoder)
    numerical_df = df[numerical_cols]
    return pd.concat([numerical_df,onehot_encoded_df], axis=1)

In [None]:
# Quick look at the engineered training features.
# A copy is passed so the raw train_df stays exactly as loaded: a later cell
# runs preprocess on train_df again and needs the untransformed values.
train = preprocess(df=train_df.copy())
train.head()

In [None]:
# Table whose "Survived" column is later attached to the test features as labels.
# NOTE(review): "persudo" presumably means "pseudo" (pseudo-labels from an earlier
# submission) - confirm the provenance of this file.
PERSUDO_CSV = "../input/apr-play-ground-persudo/persudosubmit.csv"
persudo = pd.read_csv(PERSUDO_CSV)

In [None]:
# Feature frames for the labelled training rows and the unlabelled test rows.
train = preprocess(df=train_df)
test = test_preprocess(df=test_df)

# Attach the externally supplied labels to a copy of the test features so both
# frames can be stacked into one training set (assignment aligns on the index).
persudo_test = test.copy()
persudo_test['Survived'] = persudo["Survived"]

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement and,
# like append, aligns the two frames by column name.
# NOTE(review): train and test are one-hot encoded separately, so a category
# present in only one of them would show up here as a column containing NaN -
# confirm both frames end up with the same feature columns.
full_df = pd.concat([train, persudo_test])

y = full_df.pop("Survived")
x = full_df.values


In [None]:
def objective(trial):
    """Optuna objective: mean cross-validated ROC AUC of a decision tree.

    Samples ``DecisionTreeClassifier`` hyper-parameters from ``trial`` and
    scores them with stratified 5-fold cross-validation on the notebook-level
    ``x`` / ``y`` arrays.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the hyper-parameter values.

    Returns
    -------
    float
        Mean ROC AUC over the 5 folds; the study is expected to maximise it.
    """
    param = {
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        "splitter": trial.suggest_categorical("splitter", ["best", "random"]),
        "max_depth": trial.suggest_int("max_depth", 1, 2000),
        # scikit-learn rejects an integer min_samples_split below 2 and a
        # max_leaf_nodes below 2, so both ranges start at 2 to keep every
        # sampled configuration fittable.
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 2000),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 2000),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 2, 2000),
    }

    sk_fold = StratifiedKFold(5)
    # Fixed seed: tree construction is randomised (notably splitter="random"),
    # and an unseeded tree makes the score of a given configuration vary.
    clf = DecisionTreeClassifier(**param, random_state=42)
    return cross_val_score(clf, x, y, n_jobs=-1, cv=sk_fold, scoring='roc_auc').mean()
    

In [None]:
# Number of hyper-parameter configurations to evaluate.
N_TRIALS = 500

# TPE is Optuna's default sampler; creating it explicitly only adds a seed so
# the search proposes the same configurations when the objective values repeat.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=N_TRIALS)

In [None]:
# Best objective value reached by the study and the parameters that produced it.
best_score = study.best_value
best_params = study.best_params
best_score, best_params

In [None]:
# Hyper-parameter importance plot for the finished study.
param_importance_fig = optuna.visualization.plot_param_importances(study)
param_importance_fig

In [None]:
# Objective value per trial over the course of the study.
history_fig = optuna.visualization.plot_optimization_history(study)
history_fig

In [None]:
# Parallel-coordinate view of the sampled hyper-parameters against the objective.
parallel_fig = optuna.visualization.plot_parallel_coordinate(study)
parallel_fig

In [None]:
# Hold out 20% of the rows for a final check of the tuned model.
# random_state makes the split repeatable; stratify keeps the class balance of
# y in both parts, matching the stratified folds used during tuning.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.20, random_state=42, stratify=y
)

In [None]:
# Refit a decision tree with the best hyper-parameters found by the study and
# report per-class precision/recall/F1 on the held-out rows.
# random_state pins the tree's internal randomness so the report is repeatable.
model = DecisionTreeClassifier(**study.best_params, random_state=42)
model_trained = model.fit(x_train, y_train)
# classification_report comes from sklearn.metrics; it must be imported in the
# notebook's import cell for this line to run.
print(classification_report(y_val, model_trained.predict(x_val)))