In [55]:
import pandas as pd
import numpy as np
import xgboost
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV

In [5]:
#pip install xgboost

In [137]:
def process_new(data):
    print(data.shape)
    new_df = data.copy()
    data[['GroupId','GrpPassId']] = data['PassengerId'].str.split('_',expand=True)
    data[['Deck','Decknum','DeckSide']] = data['Cabin'].str.split('/',expand=True)
    data.loc[(data.Destination == 'TRAPPIST-1e' )  & (data.HomePlanet.isnull()), 'HomePlanet'] ='Earth'
    data.loc[(data.Destination == '55 Cancri e' )  & (data.HomePlanet.isnull()), 'HomePlanet'] = 'Europa'
    data.loc[(data.Destination == 'PSO J318.5-22' ) & (data.HomePlanet.isnull()) , 'HomePlanet'] = 'Earth'
    data['HomePlanet'].fillna(value='Earth',inplace=True)
    #data.dropna(how = 'all',inplace=True)
    #data['Expenditure'] = data['RoomService'] + data['FoodCourt'] + data['ShoppingMall'] + data['Spa'] +data['VRDeck']
    #child_train_index = data[data['Age'] <= 12].index
    #data['Adult'] = 1
    #data['Adult'].loc[child_train_index] = 0
    #plt.figure(figsize=(15,8))
    #sns.countplot(x='Destination',hue='Deck',data=data)
    #plt.figure(figsize=(15,8))
    #sns.countplot(x='HomePlanet',hue='Deck',data=data)
    #plt.figure(figsize=(15,8))
    #sns.countplot(x='Destination',hue='Deck',data=data)
    #Create a new category and assign it to Destination
    data['Destination'].fillna('NullDestPlanet',inplace=True)
    data[['Fname','Lname']] = data['Name'].str.split(' ',expand=True)
    
    data['Age'] = data['Age'].fillna(data.groupby(['HomePlanet','Destination','Lname'])['Age'].transform('mean'))
    data['Age'] = data['Age'].fillna(data.groupby(['HomePlanet'])['Age'].transform('mean'))
    
    #Fill na of expenditure columns
    data['RoomService'] = data['RoomService'].fillna(data.groupby(['HomePlanet','Destination','Decknum','Lname'])['RoomService'].transform('mean'))
    data['RoomService'] = data['RoomService'].fillna(data.groupby(['HomePlanet'])['RoomService'].transform('mean'))
    data['FoodCourt'] = data['FoodCourt'].fillna(data.groupby(['HomePlanet','Destination','Decknum','Lname'])['FoodCourt'].transform('mean'))
    data['FoodCourt'] = data['FoodCourt'].fillna(data.groupby(['HomePlanet'])['FoodCourt'].transform('mean'))
    data['Spa'] = data['Spa'].fillna(data.groupby(['HomePlanet','Destination','Decknum','Lname'])['Spa'].transform('mean'))
    data['Spa'] = data['Spa'].fillna(data.groupby(['HomePlanet'])['Spa'].transform('mean'))
    data['VRDeck'] = data['VRDeck'].fillna(data.groupby(['HomePlanet','Destination','Decknum','Lname'])['VRDeck'].transform('mean'))
    data['VRDeck'] = data['VRDeck'].fillna(data.groupby(['HomePlanet'])['VRDeck'].transform('mean'))
    data['ShoppingMall'] = data['ShoppingMall'].fillna(data.groupby(['HomePlanet','Destination','Decknum','Lname'])['ShoppingMall'].transform('mean'))
    data['ShoppingMall'] = data['ShoppingMall'].fillna(data.groupby(['HomePlanet'])['ShoppingMall'].transform('mean'))

    #Recalculate Expenditure
    data['Expenditure'] = data['RoomService'] + data['FoodCourt'] + data['ShoppingMall'] + data['Spa'] +data['VRDeck']
    
    data['VIP'] = data['VIP'].fillna(data.groupby(['HomePlanet','Destination','Decknum','Lname'])['VIP'].transform('max'))
    data['VIP'] = data['VIP'].fillna(data.groupby(['HomePlanet','Expenditure'])['VIP'].transform('max'))
    data['VIP'] = data['VIP'].fillna(data.groupby(['GroupId'])['VIP'].transform('max'))
    data['VIP'] = data['VIP'].fillna(data.groupby('Expenditure')['VIP'].transform('max'))
    data['VIP'] = data['VIP'].fillna(False)
    
    data['Lname'] = data.groupby(['HomePlanet','Destination'])['Lname'].apply(lambda x : x.fillna(x.value_counts().index[0]))
    
    cols_to_drop = ['Cabin','Name','Expenditure','Fname']
    train_df.drop(columns=cols_to_drop,axis=1,inplace=True)
    
    cat_cols = list(data.select_dtypes('object').columns.values)
    cat_cols.remove('PassengerId')
    
    tf = {True:1,False:0}
    home_planet = {'Earth':3,'Europa':3,'Mars':1}
    dest_planet = {'TRAPPIST-1e':3,'Cancri e':2,'PSO J318.5-22':1,'NullDestPlanet':4}
    deck = {'F':7,'E':6,'D':5,'C':4,'B':3,'A':2,'T':1}
    port = {'S':0,'P':1}
    
    data['CryoSleep'] = data['CryoSleep'].map(tf)
    data['Destination'] = data['Destination'].map(dest_planet)
    data['HomePlanet'] = data['HomePlanet'].map(home_planet)
    data['VIP'] = data['VIP'].map(tf)
    data['Deck'] = data['Deck'].map(deck)
    data['DeckSide'] = data['DeckSide'].map(port)
    #data['Transported'] = data['Transported'].map(tf)
    data['Decknum'].fillna(0,inplace=True)
    data['Decknum'] = data['Decknum'].astype(int)
    data['GroupId'].fillna(0,inplace=True)
    data['GroupId'] = data['GroupId'].astype(int)
    data['GrpPassId'].fillna(0,inplace=True)
    data['GrpPassId'] = data['GroupId'].astype(int)
    
    """data_enc = train_df['Lname'].values.reshape(-1,1)
    ord_enc = OrdinalEncoder()
    data['Lname'] = ord_enc.fit_transform(data_enc)"""
    
    

    
    return data

In [138]:
train_df = pd.read_csv('data/train.csv')

In [139]:
cleaned_train_data = process_new(train_df)

(8693, 14)


In [140]:
cleaned_train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   int64  
 2   CryoSleep     8476 non-null   float64
 3   Destination   6893 non-null   float64
 4   Age           8693 non-null   float64
 5   VIP           8693 non-null   int64  
 6   RoomService   8693 non-null   float64
 7   FoodCourt     8693 non-null   float64
 8   ShoppingMall  8693 non-null   float64
 9   Spa           8693 non-null   float64
 10  VRDeck        8693 non-null   float64
 11  Transported   8693 non-null   bool   
 12  GroupId       8693 non-null   int64  
 13  GrpPassId     8693 non-null   int64  
 14  Deck          5935 non-null   float64
 15  Decknum       8693 non-null   int64  
 16  DeckSide      8494 non-null   float64
 17  Lname         8693 non-null   object 
dtypes: bool(1), float64(10), int

In [48]:
cleaned_train_data.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,GroupId,GrpPassId,Deck,Decknum,DeckSide,Lname
0,0001_01,3,0.0,3.0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,1,1,3.0,0,1.0,1431.0
1,0002_01,3,0.0,3.0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,2,2,7.0,0,0.0,2109.0
2,0003_01,3,0.0,3.0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,3,3,2.0,0,0.0,1990.0


In [142]:
X1 =cleaned_train_data.copy()

In [143]:
y1 = X1['Transported']
X1.drop(labels=['Transported','PassengerId','Lname'],axis=1,inplace=True)

In [144]:
y1.head(3)

0    False
1     True
2    False
Name: Transported, dtype: bool

In [145]:
X1.head(3)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,GroupId,GrpPassId,Deck,Decknum,DeckSide
0,3,0.0,3.0,39.0,0,0.0,0.0,0.0,0.0,0.0,1,1,3.0,0,1.0
1,3,0.0,3.0,24.0,0,109.0,9.0,25.0,549.0,44.0,2,2,7.0,0,0.0
2,3,0.0,3.0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,3,3,2.0,0,0.0


In [146]:
X1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8693 non-null   int64  
 1   CryoSleep     8476 non-null   float64
 2   Destination   6893 non-null   float64
 3   Age           8693 non-null   float64
 4   VIP           8693 non-null   int64  
 5   RoomService   8693 non-null   float64
 6   FoodCourt     8693 non-null   float64
 7   ShoppingMall  8693 non-null   float64
 8   Spa           8693 non-null   float64
 9   VRDeck        8693 non-null   float64
 10  GroupId       8693 non-null   int64  
 11  GrpPassId     8693 non-null   int64  
 12  Deck          5935 non-null   float64
 13  Decknum       8693 non-null   int64  
 14  DeckSide      8494 non-null   float64
dtypes: float64(10), int64(5)
memory usage: 1018.8 KB


In [147]:
def train_xgb_model(parameters):
    model = xgboost.XGBClassifier(num_boost_round=40)
    model.set_params(**parameters)
#    model = xgboost.XGBClassifier(n_estimators = parameters['n_estimator'],objective = parameters['objective'], colsample_bytree = parameters['colsample_bytree'], learning_rate = parameters['learning_rate'],
#                max_depth = parameters['max_depth'], alpha = parameters['alpha'])
    return model
def fit_model(model,X_train,y_train):
    model.fit(X_train,y_train)
    
def predict_results(model,X):
    predicted_values = model.predict(X)
    return predicted_values
    
def calculate_acc(model,y_actual,y_predicted):
    rmse = accuracy_score(y_actual,y_predicted)
    return rmse
    
def retune_model(model,hyperparameter_grid,X_train,y_train):
        
        grid_search_cv = RandomizedSearchCV(
        estimator=model,
        param_distributions=hyperparameter_grid,
        cv=10,n_iter=100,
        scoring = 'neg_mean_absolute_error',
        n_jobs=4,
        verbose=5,
        return_train_score=True,
        random_state=42
        )
        grid_search_cv.fit(X_train,y_train)
        return grid_search_cv
    

In [148]:
x1_train,x1_test,y1_train,y1_test = train_test_split(X1,y1,test_size=0.3)
parameters = {'objective' :'reg:squarederror', 'colsample_bytree' : 0.3, 'learning_rate' : 0.1,
                'max_depth' : 5, 'alpha' : 10, 'n_estimator' : 10}
model_1 = train_xgb_model(parameters)
fit_model(model_1,x1_train,y1_train)
model_1.fit(x1_train,y1_train)
y1_pred = predict_results(model_1,x1_test)
acc = calculate_acc(model_1,y1_test,y1_pred)
acc*100



Parameters: { "n_estimator", "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "n_estimator", "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




80.67484662576688

In [151]:
y1_train.head(3)

1653     True
1878    False
770     False
Name: Transported, dtype: bool

In [153]:
y1_train = y1_train.astype(int)

In [154]:
#Retune Model_1
booster = ['gbtree','gblinear']
n_estimator = [10,50,100,200,]
max_depth = [2,3,5,10,15,20,50]
learning_rate = [0.01,0.05,0.1,0.2,0.3,0.5,0.7]
min_child_weight = [1,5,10,15,20]
base_score=[0.25,0.5,0.75,1]
aplha = [5,10,15,50,100]
objective = ['reg:squarederror']

hyperparameter_grid = {
    'n_estimator' : n_estimator,
    'objective' : objective,
    'max_depth' : max_depth,
    'learning_rate' : learning_rate,
    'min_child_weight' : min_child_weight,
    'booster' : booster,
    'alpha' : aplha
}

grid_search_cv = RandomizedSearchCV(
        estimator=model_1,
        param_distributions=hyperparameter_grid,
        cv=5,n_iter=100,
        scoring = 'neg_mean_absolute_error',
        n_jobs=5,
        verbose=5,
        return_train_score=True,
        random_state=42,
        
)
grid_search_cv.fit(x1_train,y1_train)
grid_search_cv.best_params_


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:    9.6s
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed:   36.5s
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:  1.6min
[Parallel(n_jobs=5)]: Done 278 tasks      | elapsed:  2.7min
[Parallel(n_jobs=5)]: Done 440 tasks      | elapsed:  4.0min
[Parallel(n_jobs=5)]: Done 500 out of 500 | elapsed:  4.5min finished


Parameters: { "n_estimator", "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




{'objective': 'reg:squarederror',
 'n_estimator': 100,
 'min_child_weight': 10,
 'max_depth': 10,
 'learning_rate': 0.3,
 'booster': 'gbtree',
 'alpha': 100}

In [155]:
parameters = {'objective' :'reg:squarederror', 'colsample_bytree' : 0.4, 'learning_rate' : 0.3,
                'max_depth' : 10, 'alpha' : 10, 'n_estimator' : 10,'min_child_weight': 20,'booster': 'gbtree'}
parameters = {'objective' :'reg:squarederror', 'colsample_bytree' : 0.4, 'learning_rate' : 0.3,
                'max_depth' : 10, 'alpha' : 10, 'n_estimator' : 100,'min_child_weight': 10,'booster': 'gbtree'}

model_2_tuned = train_xgb_model(parameters)
fit_model(model_2_tuned,x1_train,y1_train)
#model_1_tuned.fit(x1_train,y1_train)
y1_pred_tuned = predict_results(model_2_tuned,x1_test)
acc1_tuned = calculate_acc(model_2_tuned,y1_test,y1_pred_tuned)
acc1_tuned*100

Parameters: { "n_estimator", "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




81.63343558282209

In [160]:
test_df = pd.read_csv('data/test.csv')

In [161]:
train_df = pd.read_csv('data/test.csv')
cleaned_test_data = process_new(train_df)
cleaned_test_data.drop(labels=['PassengerId','Lname'],axis=1,inplace=True)


(4277, 13)


In [162]:
cleaned_test_data.head(3)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,GroupId,GrpPassId,Deck,Decknum,DeckSide
0,3,1.0,3.0,27.0,0,0.0,0.0,0.0,0.0,0.0,13,13,,3,0.0
1,3,0.0,3.0,19.0,0,0.0,9.0,0.0,2823.0,0.0,18,18,7.0,4,0.0
2,3,1.0,,31.0,0,0.0,0.0,0.0,0.0,0.0,19,19,4.0,0,0.0


In [163]:
y1_pred_tuned_test = predict_results(model_2_tuned,cleaned_test_data)


In [164]:
recon = {0:False,1:True}

data_pred1 = {'PassengerId': test_df['PassengerId'], 'Transported': y1_pred_tuned_test}
data_pred1 =pd.DataFrame(data_pred1)
data_pred1['Transported'] = data_pred1['Transported'].map(recon)

In [165]:
data_pred1.head(3)

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True


In [166]:
data_pred1.to_csv('data/submission_08_04_22_7_06.csv',index=False)

In [135]:
#Predict Columns Deck 

In [177]:
def process_new(data):
    print(data.shape)
    new_df = data.copy()
    data[['GroupId','GrpPassId']] = data['PassengerId'].str.split('_',expand=True)
    data[['Deck','Decknum','DeckSide']] = data['Cabin'].str.split('/',expand=True)
    data.loc[(data.Destination == 'TRAPPIST-1e' )  & (data.HomePlanet.isnull()), 'HomePlanet'] ='Earth'
    data.loc[(data.Destination == '55 Cancri e' )  & (data.HomePlanet.isnull()), 'HomePlanet'] = 'Europa'
    data.loc[(data.Destination == 'PSO J318.5-22' ) & (data.HomePlanet.isnull()) , 'HomePlanet'] = 'Earth'
    data['HomePlanet'].fillna(value='Earth',inplace=True)
    #data.dropna(how = 'all',inplace=True)
    #data['Expenditure'] = data['RoomService'] + data['FoodCourt'] + data['ShoppingMall'] + data['Spa'] +data['VRDeck']
    #child_train_index = data[data['Age'] <= 12].index
    #data['Adult'] = 1
    #data['Adult'].loc[child_train_index] = 0
    #plt.figure(figsize=(15,8))
    #sns.countplot(x='Destination',hue='Deck',data=data)
    #plt.figure(figsize=(15,8))
    #sns.countplot(x='HomePlanet',hue='Deck',data=data)
    #plt.figure(figsize=(15,8))
    #sns.countplot(x='Destination',hue='Deck',data=data)
    #Create a new category and assign it to Destination
    data['Destination'].fillna('NullDestPlanet',inplace=True)
    data[['Fname','Lname']] = data['Name'].str.split(' ',expand=True)
    
    data['Age'] = data['Age'].fillna(data.groupby(['HomePlanet','Destination','Lname'])['Age'].transform('mean'))
    data['Age'] = data['Age'].fillna(data.groupby(['HomePlanet'])['Age'].transform('mean'))
    
    #Fill na of expenditure columns
    data['RoomService'] = data['RoomService'].fillna(data.groupby(['HomePlanet','Destination','Decknum','Lname'])['RoomService'].transform('mean'))
    data['RoomService'] = data['RoomService'].fillna(data.groupby(['HomePlanet'])['RoomService'].transform('mean'))
    data['FoodCourt'] = data['FoodCourt'].fillna(data.groupby(['HomePlanet','Destination','Decknum','Lname'])['FoodCourt'].transform('mean'))
    data['FoodCourt'] = data['FoodCourt'].fillna(data.groupby(['HomePlanet'])['FoodCourt'].transform('mean'))
    data['Spa'] = data['Spa'].fillna(data.groupby(['HomePlanet','Destination','Decknum','Lname'])['Spa'].transform('mean'))
    data['Spa'] = data['Spa'].fillna(data.groupby(['HomePlanet'])['Spa'].transform('mean'))
    data['VRDeck'] = data['VRDeck'].fillna(data.groupby(['HomePlanet','Destination','Decknum','Lname'])['VRDeck'].transform('mean'))
    data['VRDeck'] = data['VRDeck'].fillna(data.groupby(['HomePlanet'])['VRDeck'].transform('mean'))
    data['ShoppingMall'] = data['ShoppingMall'].fillna(data.groupby(['HomePlanet','Destination','Decknum','Lname'])['ShoppingMall'].transform('mean'))
    data['ShoppingMall'] = data['ShoppingMall'].fillna(data.groupby(['HomePlanet'])['ShoppingMall'].transform('mean'))

    #Recalculate Expenditure
    data['Expenditure'] = data['RoomService'] + data['FoodCourt'] + data['ShoppingMall'] + data['Spa'] +data['VRDeck']
    
    data['VIP'] = data['VIP'].fillna(data.groupby(['HomePlanet','Destination','Decknum','Lname'])['VIP'].transform('max'))
    data['VIP'] = data['VIP'].fillna(data.groupby(['HomePlanet','Expenditure'])['VIP'].transform('max'))
    data['VIP'] = data['VIP'].fillna(data.groupby(['GroupId'])['VIP'].transform('max'))
    data['VIP'] = data['VIP'].fillna(data.groupby('Expenditure')['VIP'].transform('max'))
    data['VIP'] = data['VIP'].fillna(False)
    
    data['Lname'] = data.groupby(['HomePlanet','Destination'])['Lname'].apply(lambda x : x.fillna(x.value_counts().index[0]))
    
    cols_to_drop = ['Cabin','Name','Expenditure','Fname']
    train_df.drop(columns=cols_to_drop,axis=1,inplace=True)
    
    cat_cols = list(data.select_dtypes('object').columns.values)
    cat_cols.remove('PassengerId')
    
    tf = {True:1,False:0}
    home_planet = {'Earth':3,'Europa':3,'Mars':1}
    dest_planet = {'TRAPPIST-1e':3,'Cancri e':2,'PSO J318.5-22':1,'NullDestPlanet':4}
    deck = {'F':7,'E':6,'D':5,'C':4,'B':3,'A':2,'T':1}
    port = {'S':0,'P':1}
    
    data['CryoSleep'] = data['CryoSleep'].map(tf)
    data['Destination'] = data['Destination'].map(dest_planet)
    data['HomePlanet'] = data['HomePlanet'].map(home_planet)
    data['VIP'] = data['VIP'].map(tf)
    """data['Deck'] = data['Deck'].map(deck)
    data['DeckSide'] = data['DeckSide'].map(port)
    #data['Transported'] = data['Transported'].map(tf)
    data['Decknum'].fillna(0,inplace=True)
    data['Decknum'] = data['Decknum'].astype(int)
    data['GroupId'].fillna(0,inplace=True)
    data['GroupId'] = data['GroupId'].astype(int)
    data['GrpPassId'].fillna(0,inplace=True)
    data['GrpPassId'] = data['GroupId'].astype(int)"""
    
    data_enc = train_df['Lname'].values.reshape(-1,1)
    ord_enc = OrdinalEncoder()
    data['Lname'] = ord_enc.fit_transform(data_enc)
    return data

In [178]:
train_df = pd.read_csv('data/train.csv')

In [179]:
train_process_data = process_new(train_df)

(8693, 14)


In [180]:
train_process_data.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,GroupId,GrpPassId,Deck,Decknum,DeckSide,Lname
0,0001_01,3,0.0,3.0,39.0,0,0.0,0.0,0.0,0.0,0.0,False,1,1,B,0,P,1431.0
1,0002_01,3,0.0,3.0,24.0,0,109.0,9.0,25.0,549.0,44.0,True,2,1,F,0,S,2109.0
2,0003_01,3,0.0,3.0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,False,3,1,A,0,S,1990.0


In [181]:
X1_Deck = train_process_data.copy()

In [182]:
X1_Deck_test = X1_Deck[X1_Deck['Deck'].isna()]

In [185]:
X1_Deck_test.shape

(199, 18)

In [187]:
y1_Deck = X1_Deck['Deck']
X1_Deck_test.drop(labels=['Transported','PassengerId','Decknum','DeckSide'],axis=1,inplace=True)
X1_Deck.drop(labels=['Transported','PassengerId','Decknum','DeckSide'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [188]:
X1_Deck_test.head(3)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,GroupId,GrpPassId,Deck,Lname
15,3,0.0,3.0,31.0,0,32.0,0.0,876.0,0.0,0.0,12,1,,1573.0
93,1,1.0,3.0,31.0,0,0.0,0.0,0.0,0.0,0.0,101,1,,2043.0
103,3,0.0,3.0,32.0,0,0.0,410.0,6.0,3929.0,764.0,110,1,,26.0


In [189]:
X1_Deck.head(3)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,GroupId,GrpPassId,Deck,Lname
0,3,0.0,3.0,39.0,0,0.0,0.0,0.0,0.0,0.0,1,1,B,1431.0
1,3,0.0,3.0,24.0,0,109.0,9.0,25.0,549.0,44.0,2,1,F,2109.0
2,3,0.0,3.0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,3,1,A,1990.0


In [190]:
deck = {'F':7,'E':6,'D':5,'C':4,'B':3,'A':2,'T':1}
X1_Deck['Deck'] = X1_Deck['Deck'].map(deck)

In [191]:
X1_Deck.head(3)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,GroupId,GrpPassId,Deck,Lname
0,3,0.0,3.0,39.0,0,0.0,0.0,0.0,0.0,0.0,1,1,3.0,1431.0
1,3,0.0,3.0,24.0,0,109.0,9.0,25.0,549.0,44.0,2,1,7.0,2109.0
2,3,0.0,3.0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,3,1,2.0,1990.0


In [192]:
X1_Deck_test.head(3)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,GroupId,GrpPassId,Deck,Lname
15,3,0.0,3.0,31.0,0,32.0,0.0,876.0,0.0,0.0,12,1,,1573.0
93,1,1.0,3.0,31.0,0,0.0,0.0,0.0,0.0,0.0,101,1,,2043.0
103,3,0.0,3.0,32.0,0,0.0,410.0,6.0,3929.0,764.0,110,1,,26.0


In [193]:
x1_Deck_train,x1_Deck_test,y1_Deck_train,y1_Deck_test = train_test_split(X1_Deck.drop(labels=['Deck'],axis=1),X1_Deck['Deck'],test_size=0.3)

In [194]:
parameters = {'objective' :'reg:squarederror', 'colsample_bytree' : 0.3, 'learning_rate' : 0.1,
                'max_depth' : 5, 'alpha' : 10, 'n_estimator' : 10}
Deck_model_1 = train_xgb_model(parameters)
fit_model(Deck_model_1,x1_Deck_train,y1_Deck_train)
Deck_model_1.fit(x1_Deck_train,y1_Deck_train)
y1_Deck_pred = predict_results(Deck_model_1,x1_Deck_train)
acc = calculate_acc(Deck_model_1,y1_Deck_test,y1_Deck_pred)
acc*100



ValueError: y contains previously unseen labels: [nan]

In [196]:
y1_Deck_train.isna().sum()

1937

In [197]:
len(y1_Deck_train)

6085