In [None]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import warnings
warnings.simplefilter('ignore')

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.feature_selection import mutual_info_classif

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from hyperopt import hp,fmin,tpe,Trials
from hyperopt.pyll.stochastic import sample
from hyperopt.pyll.base import scope
from functools import partial

In [None]:
# some generally useful functions

def plotter(data,columns,fig_size):
    """Draw a kernel-density plot for each requested column on a 4-wide grid.

    Parameters
    ----------
    data : DataFrame-like object indexable by column name.
    columns : iterable of column names to plot.
    fig_size : (width, height) tuple forwarded to ``plt.subplots``.

    Returns
    -------
    None. The figure is drawn on the current matplotlib backend.
    """
    columns = list(columns)
    n_cols = 4
    # At least one row so an empty `columns` yields an empty grid, not an error.
    n_rows = max(1, math.ceil(len(columns) / n_cols))

    sns.set_style('darkgrid')
    fig, axes = plt.subplots(n_rows, n_cols, figsize=fig_size, squeeze=False)
    panels = axes.ravel()

    for ax, col in zip(panels, columns):
        # `fill` is the current name of the deprecated `shade` option.
        sns.kdeplot(data[col], fill=True, ax=ax)

    # Hide the grid cells that have no column assigned to them.
    for ax in panels[len(columns):]:
        ax.set_visible(False)

    # Layout must be computed after drawing so labels are taken into account.
    fig.tight_layout()
        
# learning curve
class Learning_curve:
    """Computes and plots loss curves of a classifier over growing training subsets.

    The instance stores a training set, a validation set and an estimator.
    The estimator passed in is fitted repeatedly (it is modified in place).
    """
    def __init__(self,train_x,train_y,val_x,val_y,model):
        self.train_x, self.train_y = train_x, train_y
        self.val_x, self.val_y = val_x, val_y
        self.model = model

    def learning_curve(self):
        """Return two lists of losses (1 - accuracy), one entry per subset size.

        50 subset sizes are spread evenly between 10 and the full training
        length. For each size the first list gets 1 - mean 3-fold
        cross-validated accuracy on the leading rows of the training data, and
        the second gets 1 - accuracy on the validation data of the model fitted
        on those same rows.
        """
        subset_sizes = [math.ceil(p) for p in np.linspace(10, len(self.train_x), 50)]
        cv_losses, holdout_losses = [], []
        for size in subset_sizes:
            x_part, y_part = self.train_x[:size], self.train_y[:size]
            cv_accuracy = cross_val_score(
                self.model, x_part, y_part, cv=3, scoring='accuracy').mean()
            self.model.fit(x_part, y_part)
            holdout_accuracy = accuracy_score(
                self.val_y, self.model.predict(self.val_x))
            cv_losses.append(1 - cv_accuracy)
            holdout_losses.append(1 - holdout_accuracy)
        return cv_losses, holdout_losses

    def plot(self):
        """Plot both loss curves against the subset counter (0..49)."""
        for losses in self.learning_curve():
            plt.plot(list(range(len(losses))), losses)
        plt.ylim((0,0.3))
        plt.xlabel('Iterations')
        plt.ylabel('Loss')
        plt.title('Learning Curve')
        # First curve is the cross-validated loss, second the validation loss.
        plt.legend(['train','test'])


        
def data_info(data):
    """Collect basic diagnostics about a DataFrame.

    Returns a dict with:
      - 'info': the text report produced by ``DataFrame.info`` (as a string),
      - 'description': ``data.describe()``,
      - 'null': the per-column count of missing values.
    """
    import io

    # DataFrame.info() writes its report to a stream and returns None, so the
    # report has to be captured through `buf` to be usable as a value.
    report = io.StringIO()
    data.info(buf=report)
    return {
        'info': report.getvalue(),
        'description': data.describe(),
        'null': data.isna().sum(),
    }


def fetch_submission(predictions,
                     test_path='/kaggle/input/spaceship-titanic/test.csv',
                     output_path='submission.csv'):
    """Write a submission CSV from raw model predictions.

    Parameters
    ----------
    predictions : array-like of 0/1 (or boolean) predictions, one per row of
        the test file, in the same row order.
    test_path : CSV file providing the ``PassengerId`` column.
    output_path : destination of the submission file.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of test rows.
    """
    test_original = pd.read_csv(test_path)
    # np.asarray lets lists and Series be passed as well as arrays, and makes
    # the assignment positional rather than index-aligned.
    transported = np.asarray(predictions).astype(bool)
    if len(transported) != len(test_original):
        raise ValueError(
            f'expected {len(test_original)} predictions, got {len(transported)}')

    submission = test_original[['PassengerId']].copy()
    submission['Transported'] = transported
    submission.to_csv(output_path, index=False)
    
def plotmi(mi):
    """Draw a horizontal bar chart of mutual-information scores.

    `mi` is expected to be a DataFrame with a 'mi_score' column whose index
    holds the feature names (the shape returned by `mutual_information`).
    """
    # x/y are passed by keyword: recent seaborn releases no longer accept
    # them positionally.
    sns.barplot(x=mi['mi_score'], y=mi.index)
    plt.xlabel('Score')
    plt.title('Mutual Information')

In [None]:
#preprocessing 
# problem specific functions for preprocessing

def data_load(train_path, test_path, test_size=0.2, random_state=42):
    """Read the train and test CSV files and hold out a validation split.

    Parameters
    ----------
    train_path, test_path : paths of the CSV files to read.
    test_size : fraction of the training rows moved to the validation split.
    random_state : seed for the split, so it is reproducible.

    Returns
    -------
    (train, test, val) -- note the order: the validation frame comes last.
    """
    full_train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    train, val = train_test_split(
        full_train, test_size=test_size, random_state=random_state)
    return train, test, val


def column_transformer1(data):
    """Lower-case the column names and expand the cabin and passenger id.

    The frame is modified in place and also returned:
      - 'deck' is the first '/'-separated part of 'cabin',
      - 'group' and 'passenger' are the two '_'-separated integer parts of
        'passengerid',
      - 'cabin', 'name' and 'passengerid' are dropped.
    """
    data.columns = [col.lower() for col in data.columns]

    # Only the deck is kept from the cabin string; taking the first part
    # directly avoids creating the cabin number and side columns just to drop
    # them, and does not require every split to yield exactly three parts.
    data['deck'] = data['cabin'].str.split('/', expand=True)[0]
    data.drop(['cabin', 'name'], axis=1, inplace=True)

    data[['group', 'passenger']] = data['passengerid'].str.split(
        '_', expand=True).astype(int)
    data.drop('passengerid', axis=1, inplace=True)
    return data


def transformation(data):
    """Map the categorical columns to numeric codes (in place) and return the frame.

    'homeplanet', 'destination' and 'deck' are replaced by fixed integer
    codes (values missing from the mapping become NaN); 'cryosleep' and 'vip'
    are cast to float. 'transported' is cast to float when the column is
    present; frames without it are left as they are.
    """
    planet = {'Earth': 1, 'Europa': 2, 'Mars': 3}
    destination = {'TRAPPIST-1e': 1, '55 Cancri e': 2, 'PSO J318.5-22': 3}
    deck = {'F': 1, 'C': 2, 'G': 3, 'B': 4, 'E': 5, 'D': 6, 'A': 7, 'T': 8}

    data['homeplanet'] = data['homeplanet'].map(planet)
    data['destination'] = data['destination'].map(destination)
    data[['cryosleep', 'vip']] = data[['cryosleep', 'vip']].astype(float)
    data['deck'] = data['deck'].map(deck)

    # An explicit membership test replaces the former bare `except`, which
    # also hid genuine conversion errors in the target column.
    if 'transported' in data.columns:
        data['transported'] = data['transported'].astype(float)
    return data


def impute_split(data):
    """Partition the rows of `data` by how many values each row is missing.

    Returns three frames, in this order: rows with no missing value, rows
    with exactly one missing value, and rows with more than one.
    """
    missing_per_row = data.isna().sum(axis=1)
    complete_rows = data[missing_per_row == 0]
    single_gap_rows = data[missing_per_row == 1]
    multi_gap_rows = data[missing_per_row > 1]
    return complete_rows, single_gap_rows, multi_gap_rows


class Knn_imputation:
    """Fills single missing values with k-nearest-neighbour predictions.

    Categorical columns are predicted with a classifier and numerical columns
    with a regressor. Both estimators are class attributes, so they are
    shared by (and re-fitted for) every instance.
    """
    knn = KNeighborsClassifier(n_neighbors=100)
    knn_r = KNeighborsRegressor(n_neighbors=100)

    def __init__(self, training_data, transforming_data, rest_of_data, categorical, numerical):
        """Store the three row partitions and the column groupings.

        training_data     : rows used to fit the estimators.
        transforming_data : rows whose missing values get imputed; this frame
                            is modified in place.
        rest_of_data      : rows passed through unchanged by `knn_implement`.
        categorical       : names of the columns imputed by classification.
        numerical         : names of the columns imputed by regression.
        """
        self.training_data = training_data
        self.transforming_data = transforming_data
        self.rest_of_data = rest_of_data
        self.categorical = categorical
        self.numerical = numerical

    def trans_cols(self):
        """Return (categorical, numerical) column names that still hold NaN
        in `transforming_data`."""
        has_missing = self.transforming_data.isna().sum().gt(0)
        incomplete = self.transforming_data.columns[has_missing]
        cat = [col for col in incomplete if col in self.categorical]
        num = [col for col in incomplete if col in self.numerical]
        return cat, num

    def _impute_columns(self, columns, estimator):
        """Fit `estimator` per column on `training_data` and fill that
        column's missing entries in `transforming_data`."""
        for col in columns:
            missing_rows = self.transforming_data[col].isna()
            train_x = self.training_data.drop(col, axis=1)
            train_y = self.training_data[col]
            # The remaining columns of these rows are the predictors, so they
            # must not contain NaN themselves.
            test_x = self.transforming_data.loc[missing_rows].drop(col, axis=1)
            estimator.fit(train_x, train_y)
            self.transforming_data.loc[missing_rows, col] = estimator.predict(test_x)
        return self.transforming_data

    def knn_impute_cat(self):
        """Impute the incomplete categorical columns with the classifier.

        Intended for rows that have exactly one missing value, so that every
        other column of the row is available as a predictor. Returns
        `transforming_data` after it has been updated in place.
        """
        cat_columns, _ = self.trans_cols()
        return self._impute_columns(cat_columns, self.knn)

    def knn_impute_num(self):
        """Impute the incomplete numerical columns with the regressor.

        Same contract as `knn_impute_cat`, applied to the numerical columns.
        """
        _, num_columns = self.trans_cols()
        return self._impute_columns(num_columns, self.knn_r)

    def knn_implement(self):
        """Impute categorical then numerical columns and reassemble the data.

        Returns the three partitions concatenated and sorted by index.
        """
        self.knn_impute_cat()
        data_transformable_im = self.knn_impute_num()
        result = pd.concat([self.training_data, data_transformable_im,
                            self.rest_of_data]).sort_index(ascending=True)
        return result


def simple_im(train, test, val, categorical, numerical):
    """Fill the remaining gaps with simple column statistics.

    The `categorical` columns get their most frequent value and the
    `numerical` columns their median. Both imputers are fitted on `train`
    only and then applied to `val` and `test`. All three frames are modified
    in place and returned as (train, test, val).
    """
    imputers = [
        (categorical, SimpleImputer(strategy='most_frequent')),
        (numerical, SimpleImputer(strategy='median')),
    ]

    for cols, imputer in imputers:
        train[cols] = imputer.fit_transform(train[cols])

    for cols, imputer in imputers:
        val[cols] = imputer.transform(val[cols])

    # A temporary 'transported' column lets the same column lists be used on
    # the test frame; it is removed again once imputation is done.
    test['transported'] = np.zeros([test.shape[0]])
    for cols, imputer in imputers:
        test[cols] = imputer.transform(test[cols])
    test.drop('transported', axis=1, inplace=True)

    return train, test, val

In [None]:
# features
# useful functions generally for feature engineering

def mutual_information(x,y,mask=None,random_state=None):
    """Score features by mutual information with a classification target.

    Parameters
    ----------
    x : DataFrame of features.
    y : target labels.
    mask : when given, only the first `mask` columns of `x` are scored;
        None scores every column.
    random_state : forwarded to ``mutual_info_classif``; pass an int to get
        repeatable scores.

    Returns
    -------
    DataFrame with a single 'mi_score' column, indexed by feature name and
    sorted in descending order of score.
    """
    # Slicing up to None keeps every column, so one code path covers both
    # the masked and the unmasked case.
    features = x.iloc[:, :mask]
    scores = mutual_info_classif(features, y, random_state=random_state)
    mi = pd.DataFrame(scores, columns=['mi_score'], index=features.columns)
    return mi.sort_values('mi_score', ascending=False)


def pca_ing(x,standardize=True):
    """Run PCA on `x`, optionally standardizing the columns first.

    Parameters
    ----------
    x : DataFrame of numeric features.
    standardize : when True the columns are scaled with ``StandardScaler``
        before the decomposition.

    Returns
    -------
    (x_pca, loadings)
        x_pca    : DataFrame of component scores with a fresh integer index.
        loadings : DataFrame of component loadings, indexed by the original
                   column names.

    The i-th component is labelled 'pca_<name of the i-th column>'. The label
    only reuses the column name by position; it does not mean the component
    is derived from that column alone.
    """
    if standardize:
        scaled = StandardScaler().fit_transform(x)
        x = pd.DataFrame(scaled, columns=x.columns)

    pca = PCA()
    scores = pca.fit_transform(x)

    # PCA yields fewer components than columns when there are fewer rows than
    # columns, so the labels are cut to the number of components produced.
    components = [f'pca_{name}' for name in x.columns.values[:scores.shape[1]]]
    x_pca = pd.DataFrame(scores, columns=components)
    loadings = pd.DataFrame(pca.components_.T, columns=components, index=x.columns)
    return x_pca, loadings

def auto_best_features(x,y,other_data,n_features,standardize_on_pca=True):
    """Select the `n_features` highest-scoring features among the columns of
    `x` and its principal components.

    The scaler (when `standardize_on_pca` is True) and the PCA are fitted on
    `x` only; every frame in `other_data` is projected with those same fitted
    transforms, so a given 'pca_*' column means the same thing in every frame.

    Parameters
    ----------
    x : DataFrame of training features.
    y : training target, used to score the features.
    other_data : list of DataFrames holding the same feature columns as `x`.
    n_features : number of top-scoring columns to keep.
    standardize_on_pca : standardize the features before the PCA.

    Returns
    -------
    (selected_x, selected_others)
        selected_x      : the chosen columns for `x`.
        selected_others : list with the same columns for each other frame.
    All returned frames carry a fresh integer index; the inputs themselves
    are left untouched.
    """
    x = x.reset_index(drop=True)
    feature_cols = list(x.columns)

    scaler = StandardScaler().fit(x) if standardize_on_pca else None
    pca = PCA()

    def _pca_input(frame):
        # Same columns in the same order as at fit time.
        frame = frame[feature_cols]
        return scaler.transform(frame) if scaler is not None else frame.to_numpy()

    x_scores = pca.fit_transform(_pca_input(x))
    pca_cols = [f'pca_{name}' for name in feature_cols[:x_scores.shape[1]]]

    def _with_components(frame, scores):
        return frame.join(pd.DataFrame(scores, columns=pca_cols))

    all_features = _with_components(x, x_scores)
    mutual_info = mutual_information(all_features, y)
    selected_cols = mutual_info.index.values[:n_features]

    other_data_selected = []
    for frame in other_data:
        frame = frame.reset_index(drop=True)
        augmented = _with_components(frame, pca.transform(_pca_input(frame)))
        other_data_selected.append(augmented[selected_cols])
    return all_features[selected_cols], other_data_selected
        


# new features
# problem specific
# This function is not used in the pipeline below. Better features could be built
# after a proper EDA, but these are some potentially useful ones.
def create_features(data):
    """Add spending-based and group-based features to `data` (in place).

    The five spending columns are replaced by their log1p values. New
    columns are then added: per (deck, homeplanet) group means and counts,
    and row-wise spending totals. The same frame is returned.
    """
    spend_cols = ['roomservice','foodcourt','shoppingmall','spa','vrdeck']
    group_keys = ['deck','homeplanet']

    # log1p compresses the spending columns
    data[spend_cols] = data[spend_cols].apply(np.log1p)

    def per_group(column, how):
        """Broadcast a (deck, homeplanet) group statistic back to every row."""
        return data.groupby(group_keys)[column].transform(how)

    # group-level aggregates
    data['mall_avrg'] = per_group('shoppingmall', 'mean')
    data['food_avrg'] = per_group('foodcourt', 'mean')
    data['cnt_deckplanet'] = per_group('homeplanet', 'count')

    # non-null cryosleep entries per group
    data['cnt_cryodeckplnt'] = per_group('cryosleep', 'count')

    # row-wise spending totals
    spend_groups = (
        ('total_spend', spend_cols),
        ('spend_sub1', ['foodcourt','shoppingmall']),
        ('spend_sub2', ['roomservice','spa','vrdeck']),
    )
    for new_col, parts in spend_groups:
        data[new_col] = data[parts].sum(axis=1)
    return data

In [None]:
# model

model=RandomForestClassifier()

def results(trainx,trainy,valx,valy,test,params,model=model):
    """Configure, cross-validate, fit and evaluate a classifier.

    `params` are applied to `model` with ``set_params``; the estimator that is
    passed in (by default the module-level `model`) is therefore reconfigured
    and fitted in place.

    Returns a tuple of:
      - the mean accuracy over 5 unshuffled K-fold splits of the training data,
      - the accuracy on the validation data after fitting on all training data,
      - the predictions for `test`.
    """
    estimator = model.set_params(**params)
    # An explicit splitter object fixes how the folds are formed.
    splitter = KFold(n_splits=5)
    cv_scores = cross_val_score(
        estimator, trainx, trainy, cv=splitter, scoring='accuracy')

    estimator.fit(trainx, trainy)
    val_predictions = estimator.predict(valx)
    # Arguments are given as (predicted, true); accuracy is symmetric in its
    # two label arguments, so the value is unaffected.
    return (
        cv_scores.mean(),
        accuracy_score(val_predictions, valy),
        estimator.predict(test),
    )

In [None]:
# optimization
# bayesian search implementation
# Search space for the random forest hyperparameters.
p_space = dict(
    # integer-valued settings: drawn on a grid of step 1, then cast to int
    n_estimators=scope.int(hp.quniform('n_estimators', 10, 100, 1)),
    max_depth=scope.int(hp.quniform('max_depth', 1, 50, 1)),
    min_samples_split=scope.int(hp.quniform('min_samples_split', 2, 20, 1)),
    min_samples_leaf=scope.int(hp.quniform('min_samples_leaf', 2, 20, 1)),
    # fraction of features considered per split, in steps of 0.1
    max_features=hp.quniform('max_features', 0.1, 1, 0.1),
    # categorical settings
    bootstrap=hp.choice('bootstrap', [True, False]),
    criterion=hp.choice('criterion', ['gini', 'entropy']),
)

# Alias of the module-level estimator; used as the default model of `optimizer`.
rf = model
def optimizer(param_space,trainx,trainy,valx,valy,model=rf):
    """Objective function for the hyperparameter search.

    Applies `param_space` (one sampled set of hyperparameters) to `model`,
    fits it on the training data and returns the negated validation accuracy,
    so that minimising the objective maximises accuracy. The estimator that is
    passed in is reconfigured and fitted in place.
    """
    # Use the estimator that was passed in; the global `rf` is only the default.
    model = model.set_params(**param_space)
    model.fit(trainx, trainy)
    accuracy = accuracy_score(valy, model.predict(valx))
    return -accuracy

def bayesian_search(trainx,trainy,valx,valy,param_space=p_space,max_evals=100):
    """Search `param_space` with TPE and return the best hyperparameters.

    Parameters
    ----------
    trainx, trainy : data each candidate model is fitted on.
    valx, valy : data each candidate is scored on (accuracy).
    param_space : hyperopt search space.
    max_evals : number of candidates to evaluate.

    Returns
    -------
    dict mapping each hyperparameter name to its actual value, ready to be
    passed to ``set_params`` or an estimator constructor.
    """
    from hyperopt import space_eval

    trials = Trials()
    objective = partial(optimizer, trainx=trainx, trainy=trainy, valx=valx, valy=valy)
    best = fmin(
        fn=objective,
        space=param_space,
        algo=tpe.suggest,
        trials=trials,
        max_evals=max_evals,
    )
    # fmin reports raw draws: floats for quniform and the option *index* for
    # hp.choice. space_eval maps them back through the search space, so integer
    # parameters are ints and choices (e.g. bootstrap, criterion) are the real
    # option values rather than their positions.
    return dict(space_eval(param_space, best))


def combining_models(preds1,preds2,preds3):
    """Combine three sets of 0/1 predictions by majority vote.

    Each argument is expected to be a numeric array (or Series) of 0/1
    predictions of the same length. An entry of the result is 1 when at least
    two of the three inputs predict 1, and 0 otherwise. The inputs are not
    modified; the result has the dtype of their sum.
    """
    votes = preds1 + preds2 + preds3
    # A single vote is not a majority. The previous in-place masking never
    # reset those entries (it compared instead of assigning), so one vote was
    # enough to yield 1.
    return (votes >= 2).astype(votes.dtype)


In [None]:
# loading
train_path = '../input/spaceship-titanic/train.csv'
test_path = '../input/spaceship-titanic/test.csv'
train, test, val = data_load(train_path, test_path)


# transformation
train = column_transformer1(train)
val = column_transformer1(val)
test = column_transformer1(test)

# Column groupings are taken from the training frame before the categorical
# columns are mapped to numbers below. Boolean columns are appended to the
# categorical group.
# `include` must be given as a list: a second positional argument to
# select_dtypes is `exclude`, not a second dtype to include.
categorical = train.select_dtypes(include=['object', 'category'])
bool_columns = train.select_dtypes('bool')
categorical = categorical.join(bool_columns).columns
numerical = train.select_dtypes('number').columns

# snapshots of the frames before the numeric encoding
train_copy = train.copy()
val_copy = val.copy()
test_copy = test.copy()


# more transformation
train = transformation(train)
val = transformation(val)
test = transformation(test)
train_good, train_transformable, train_non_transformable = impute_split(train)
val_good, val_transformable, val_non_transformable = impute_split(val)
test_good, test_transformable, test_non_transformable = impute_split(test)


# imputing using knn
# Each frame is imputed from its own complete rows; rows with more than one
# missing value are passed through and handled by simple_im afterwards.
train_imputer = Knn_imputation(
    train_good, train_transformable, train_non_transformable, categorical, numerical)
train = train_imputer.knn_implement()
val_imputer = Knn_imputation(
    val_good, val_transformable, val_non_transformable, categorical, numerical)
val = val_imputer.knn_implement()
test_imputer = Knn_imputation(
    test_good, test_transformable, test_non_transformable, categorical, numerical)
test = test_imputer.knn_implement()
train, test, val = simple_im(train, test, val, categorical, numerical)


# preparing data
trainx = train.drop(['transported'], axis=1)
trainy = train['transported'].astype(int)
valx = val.drop(['transported'], axis=1)
valy = val['transported'].astype(int)

# selecting best features
trainx, other = auto_best_features(trainx,trainy,[valx, test], n_features=15, standardize_on_pca=True)
valx, test = other[0], other[1]

In [None]:
# features density plot: one density panel per selected feature column
plotter(trainx,trainx.columns,(10,10))

In [None]:
# features mutual info: score the selected features against the target and plot the scores
mi=mutual_information(trainx,trainy)
plotmi(mi)

In [None]:
# evaluating
# Tune the hyperparameters (candidates are scored on the validation split),
# then cross-validate, refit and predict with the best set found.
best_params = bayesian_search(trainx,trainy,valx,valy)
train_score, val_score,test_preds = results(trainx,trainy,valx,valy,test,best_params)
# NOTE: 'train score' is the mean cross-validated accuracy returned by
# results(), not the accuracy of the fitted model on its own training data.
print('train score:', train_score)
print('val score:', val_score)
fetch_submission(test_preds)

In [None]:
# learning curve of a fresh random forest configured with the tuned hyperparameters
lr=Learning_curve(trainx,trainy,valx,valy,RandomForestClassifier(**best_params))
lr.plot()