In [None]:
import pandas as pd
import numpy as np
import random
# Seeds Python's built-in `random` module only; the estimators below receive
# their own `random_state` arguments where one is set.
random.seed(100)

In [None]:
# Load the competition files: labelled training rows, unlabelled test rows and
# the submission template whose "Survived" column is filled in at the end.
train=pd.read_csv("../input/tabular-playground-series-apr-2021/train.csv")
test=pd.read_csv("../input/tabular-playground-series-apr-2021/test.csv")
sample=pd.read_csv("../input/tabular-playground-series-apr-2021/sample_submission.csv")

In [None]:
# Preview the first rows of the raw training frame.
train.head()

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

In [None]:
# These two columns are removed from both frames before any modelling.
unused_columns = ["PassengerId", "Name"]
train.drop(columns=unused_columns, inplace=True)
test.drop(columns=unused_columns, inplace=True)

In [None]:
# No-op: the column list is empty, so nothing is removed from `train`.
train.drop(columns=[],inplace=True)

In [None]:
# Impute missing Age/Fare values with the median of the same frame.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form is
# deprecated in pandas 2.x and no longer updates the parent frame once
# Copy-on-Write is enabled.
train["Age"] = train["Age"].fillna(train["Age"].median())
train["Fare"] = train["Fare"].fillna(train["Fare"].median())
test["Age"] = test["Age"].fillna(test["Age"].median())
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

In [None]:
# Display the training frame after the Age/Fare imputation cell.
train

In [None]:
# Column dtypes, to see which columns are still non-numeric.
train.dtypes

In [None]:
## Extract ticket category
def get_category(x):
    """Return the leading non-numeric token of a ticket value, else NaN.

    Parameters
    ----------
    x : object
        A raw ticket value. Anything that is not a string (for example a
        float NaN for a missing ticket) is treated as having no category.

    Returns
    -------
    str or float
        The first whitespace-separated token when it cannot be parsed as an
        integer; ``np.nan`` when the value is not a string, is empty or
        whitespace-only, or starts with a purely numeric token.
    """
    if not isinstance(x, str):
        return np.nan

    tokens = x.split()
    if not tokens:
        return np.nan

    prefix = tokens[0]
    try:
        int(prefix)
    except ValueError:
        # Not a number, so this token is the ticket's category prefix.
        return prefix
    # Purely numeric first token (including "0"): no category.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    return np.nan

In [None]:
# Replace every raw ticket value with the result of get_category, in both frames.
for frame in (train, test):
    frame["Ticket"] = frame["Ticket"].apply(get_category)

In [None]:
def cabin_class(x):
    """Return the first element of a cabin value, else NaN.

    Parameters
    ----------
    x : object
        A raw cabin value; for a string this yields its first character.

    Returns
    -------
    object or float
        ``x[0]`` when ``x`` can be indexed; ``np.nan`` when it cannot (for
        example a float NaN for a missing cabin) or when it is empty.
    """
    try:
        return x[0]
    except (TypeError, IndexError, KeyError):
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        return np.nan

In [None]:
# Replace every raw cabin value with the result of cabin_class, in both frames.
for frame in (train, test):
    frame["Cabin"] = frame["Cabin"].apply(cabin_class)

In [None]:
# Percentage of missing values per training column.
train.isnull().mean()*100

In [None]:
# Give missing Cabin/Ticket values their own explicit "Missing" category.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form is
# deprecated in pandas 2.x and no longer updates the parent frame once
# Copy-on-Write is enabled.
train["Cabin"] = train["Cabin"].fillna("Missing")
test["Cabin"] = test["Cabin"].fillna("Missing")
train["Ticket"] = train["Ticket"].fillna("Missing")
test["Ticket"] = test["Ticket"].fillna("Missing")

In [None]:
# Fill missing Embarked values with the most frequent value of the same frame.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form is
# deprecated in pandas 2.x and no longer updates the parent frame once
# Copy-on-Write is enabled.
train["Embarked"] = train["Embarked"].fillna(train["Embarked"].mode()[0])
test["Embarked"] = test["Embarked"].fillna(test["Embarked"].mode()[0])

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Learn the integer codes for "Sex" from the training frame, then apply the
# same mapping to both frames.
le = LabelEncoder().fit(train["Sex"])
train["Sex"] = le.transform(train["Sex"])
test["Sex"] = le.transform(test["Sex"])

In [None]:
# Fit the encoder on the labels of both frames: LabelEncoder.transform raises
# a ValueError for labels it has not seen, so a ticket prefix that occurs only
# in the test frame would otherwise abort the notebook. When the test labels
# are a subset of the training labels the resulting codes are unchanged.
le = LabelEncoder()
le.fit(pd.concat([train["Ticket"], test["Ticket"]], ignore_index=True))
train["Ticket"] = le.transform(train["Ticket"])
test["Ticket"] = le.transform(test["Ticket"])

In [None]:
# Fit the encoder on the labels of both frames: LabelEncoder.transform raises
# a ValueError for labels it has not seen, so a cabin letter that occurs only
# in the test frame would otherwise abort the notebook. When the test labels
# are a subset of the training labels the resulting codes are unchanged.
le = LabelEncoder()
le.fit(pd.concat([train["Cabin"], test["Cabin"]], ignore_index=True))
train["Cabin"] = le.transform(train["Cabin"])
test["Cabin"] = le.transform(test["Cabin"])

In [None]:
# Learn the integer codes for "Embarked" from the training frame, then apply
# the same mapping to both frames.
le = LabelEncoder().fit(train["Embarked"])
train["Embarked"] = le.transform(train["Embarked"])
test["Embarked"] = le.transform(test["Embarked"])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation, with a fixed seed.
features = train.drop(columns=["Survived"])
target = train["Survived"]
train_X, test_X, train_y, test_y = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=100,
)

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Baseline model: no hyperparameters are set apart from the random seed.
clf=CatBoostClassifier(random_state=100)
clf.fit(train_X,train_y)

In [None]:
# Accuracy of the baseline model on the held-out 20% split.
pred=clf.predict(test_X)
print(accuracy_score(test_y,pred))

In [None]:
## Let's install scikit-learn 0.22.0 as it is compatible with skopt
!pip install   scikit-learn==0.22.0

In [None]:

# Skopt functions
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper
from skopt.space import Real, Categorical, Integer

In [None]:
# Untuned estimator handed to the hyperparameter search; the searched
# parameters are supplied separately through `search_spaces`.
clf = CatBoostClassifier(
    loss_function='Logloss',
    od_type='Iter',
    thread_count=2,
    verbose=False,
)

In [None]:
# Ranges explored by the Bayesian search, keyed by CatBoost parameter name.
search_spaces = dict(
    iterations=Integer(10, 1000),
    depth=Integer(1, 8),
    learning_rate=Real(0.01, 1.0, prior='log-uniform'),
    random_strength=Real(1e-9, 10, prior='log-uniform'),
    bagging_temperature=Real(0.0, 1.0),
    border_count=Integer(1, 255),
    l2_leaf_reg=Integer(2, 30),
    scale_pos_weight=Real(0.01, 1.0, prior='uniform'),
)

In [None]:
# Bayesian hyperparameter search over `search_spaces`, wrapped around `clf`.
# Only the search object is built here; nothing is fitted in this cell.
opt = BayesSearchCV(clf,
                    search_spaces,
                    verbose=2,
                    scoring="accuracy",  # same metric as the hold-out checks
                    cv=5,  # 5-fold cross-validation per candidate
                    n_iter=100,  # number of parameter settings sampled
                    n_jobs=-1,  # NOTE(review): runs alongside thread_count=2 in clf; confirm this does not oversubscribe CPUs
                    return_train_score=False,
                    refit=True,  # refit the best configuration once the search ends
                    optimizer_kwargs={'base_estimator':'GP'},  # Gaussian-process surrogate
                    random_state=42)

In [None]:
# Search left disabled (n_iter=100 with cv=5 means many model fits); uncomment to re-run it.
# opt.fit(train_X,train_y)

In [None]:
# Uncomment after running the search to inspect the selected hyperparameters.
# opt.best_params_

In [None]:
## Best model
# Fixed hyperparameter values, hard-coded so the search does not have to be
# re-run; the last four entries repeat the settings of the search estimator.
best_params = dict(
    bagging_temperature=0.20401679944445256,
    border_count=215,
    depth=4,
    iterations=936,
    l2_leaf_reg=9,
    learning_rate=0.013872642181616959,
    random_strength=6.418759045093663e-05,
    scale_pos_weight=0.9276818238164333,
    thread_count=2,
    loss_function='Logloss',
    od_type='Iter',
    verbose=False,
)
tuned_clf = CatBoostClassifier(**best_params)

In [None]:
# Train the tuned model on the 80% training split.
tuned_clf.fit(train_X,train_y)

In [None]:
# Accuracy of the tuned model on the held-out 20% split.
pred=tuned_clf.predict(test_X)
print(accuracy_score(test_y,pred))

In [None]:
# Horizontal bar chart of the tuned model's feature importances, sorted ascending.
feature_names = train.drop(columns=["Survived"]).columns
importances = pd.DataFrame(tuned_clf.feature_importances_, index=feature_names)
importances.sort_values(0).plot(kind="barh", figsize=(20, 10))

In [None]:
# Predict on the competition test frame and store the labels in the submission template.
# NOTE(review): assumes `test` has the same feature columns, in the same order, as train_X — confirm.
sample["Survived"]=tuned_clf.predict(test)

In [None]:
# Write the submission file without the DataFrame index column.
sample.to_csv("testing_model_best_new.csv", index=False)