## Importing Libraries

In [753]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Combining train.csv with test.csv


In [754]:
train = pd.read_csv('train.csv')
test =  pd.read_csv('test.csv')
dataset =  pd.concat([train, test], axis=0, sort=False)

dataset.to_csv('dataset.csv', index=False)


## Getting all the information from the features

In [755]:
def MoreInfo(df):
    # Derive column 'Deck' from column: 'Cabin'
    df.insert(4, "Deck", df["Cabin"].str.split("/").str[0])

    # Derive column 'CabinNumber' from column: 'Cabin'
    df.insert(4, "CabinNumber", df["Cabin"].str.split("/").str[1])

    # Derive column 'Side' from column: 'Cabin'
    df.insert(4, "Side", df["Cabin"].str.split("/").str[-1])

    # Drop column: 'Cabin'
    df = df.drop(columns=['Cabin'])

    # Derive column 'LastName' from column: 'Name'
    df.insert(15, "LastName", df["Name"].str.split(" ").str[-1])

    # Derive column 'FirstName' from column: 'Name'
    df.insert(15, "FirstName", df["Name"].str.split(" ").str[0])

    # Drop column: 'Name'
    df = df.drop(columns=['Name'])

    # Derive column 'Group' from column: 'PassengerId'
    df.insert(1, "Group", df["PassengerId"].str.split("_").str[0])

    # Derive column 'GroupNumber' from column: 'Group'
    df.insert(2, "GroupNumber", df["Group"].str[3:])
    
    # Drop column: 'Group'
    df = df.drop(columns=['Group'])

    # Derive column 'GroupMemberNumber' from column: 'PassengerId'
    df.insert(1, "GroupMemberNumber", df.apply(lambda row : row["PassengerId"][row["PassengerId"].find("_") + 2:], axis=1))
    
    # Drop column: 'PassengerId'
    df = df.drop(columns=['PassengerId'])
    return df


df = MoreInfo(dataset.copy())
df.head()

Unnamed: 0,GroupMemberNumber,GroupNumber,HomePlanet,CryoSleep,Side,CabinNumber,Deck,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,FirstName,LastName,Transported
0,1,1,Europa,False,P,0,B,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham,Ofracculy,False
1,1,2,Earth,False,S,0,F,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna,Vines,True
2,1,3,Europa,False,S,0,A,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark,Susent,False
3,2,3,Europa,False,S,0,A,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam,Susent,False
4,1,4,Earth,False,S,1,F,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy,Santantines,True


## Starting EDA

In [756]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12970 entries, 0 to 4276
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GroupMemberNumber  12970 non-null  object 
 1   GroupNumber        12970 non-null  object 
 2   HomePlanet         12682 non-null  object 
 3   CryoSleep          12660 non-null  object 
 4   Side               12671 non-null  object 
 5   CabinNumber        12671 non-null  object 
 6   Deck               12671 non-null  object 
 7   Destination        12696 non-null  object 
 8   Age                12700 non-null  float64
 9   VIP                12674 non-null  object 
 10  RoomService        12707 non-null  float64
 11  FoodCourt          12681 non-null  float64
 12  ShoppingMall       12664 non-null  float64
 13  Spa                12686 non-null  float64
 14  VRDeck             12702 non-null  float64
 15  FirstName          12676 non-null  object 
 16  LastName           12676 non

In [757]:
df.describe()   

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,12700.0,12707.0,12681.0,12664.0,12686.0,12702.0
mean,28.771969,222.897852,451.961675,174.906033,308.476904,306.789482
std,14.387261,647.596664,1584.370747,590.55869,1130.279641,1180.097223
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,49.0,77.0,29.0,57.0,42.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [758]:
df.isnull().sum()

GroupMemberNumber       0
GroupNumber             0
HomePlanet            288
CryoSleep             310
Side                  299
CabinNumber           299
Deck                  299
Destination           274
Age                   270
VIP                   296
RoomService           263
FoodCourt             289
ShoppingMall          306
Spa                   284
VRDeck                268
FirstName             294
LastName              294
Transported          4277
dtype: int64

In [759]:
# calculate percentage of missing values
def missing(df):
    missing = df.isnull().sum()
    missing = missing[missing > 0]
    missing = missing / len(df) * 100
    missing = missing.sort_values(ascending=False)
    missing = pd.DataFrame({'Missing Ratio': missing})
    return missing

missing(df)


Unnamed: 0,Missing Ratio
Transported,32.976099
CryoSleep,2.390131
ShoppingMall,2.359291
Side,2.30532
CabinNumber,2.30532
Deck,2.30532
VIP,2.28219
FirstName,2.266769
LastName,2.266769
FoodCourt,2.228219


About 2% for each feature with nan values; we start filling the holes
We start with CryoSleep ; Firstly, a logical argument would be that every person who didn't spend a penny in the trip is in CryoSleep and vice versa

In [760]:

# Calculate the total expenditure
df["Expenditure"] = df["FoodCourt"] + df["ShoppingMall"] + df["Spa"] + df["VRDeck"] + df["RoomService"]

df.loc[df["Expenditure"] == 0, "CryoSleep"] = True

df.loc[df["CryoSleep"] == True, ["FoodCourt", "ShoppingMall", "Spa", "VRDeck", "RoomService"]] = 0

df = df.drop(columns=['Expenditure'])

In [761]:
df["CryoSleep"].value_counts()


CryoSleep
False    7412
True     5371
Name: count, dtype: int64

In [762]:
import pandas as pd
import numpy as np

def fillna_groupby(df, target_col, group_col, method='mode'):
    """
    Fill NaN values in the target column based on the specified method (mode, median, mean)
    within each group defined by the group column. If all values in the target column of a group
    are NaN, fill them with the mean or mode of the entire target column.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    target_col (str): The column in which to fill NaN values.
    group_col (str): The column to group by.
    method (str): The method to use for filling NaN values ('mode', 'median', 'mean').

    Returns:
    pd.DataFrame: The DataFrame with NaN values filled.
    """
    
    if method == 'mode':
        fill_value = df.groupby(group_col)[target_col].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
    elif method == 'median':
        fill_value = df.groupby(group_col)[target_col].transform('median')
    elif method == 'mean':
        fill_value = df.groupby(group_col)[target_col].transform('mean')
    else:
        raise ValueError("Method must be 'mode', 'median', or 'mean'")

    # Fill NaN values in the target column with the calculated fill_value
    df[target_col] = df[target_col].fillna(fill_value)

    # Check for groups where all values in the target column are NaN
    all_nan_groups = df.groupby(group_col)[target_col].transform(lambda x: x.isna().all())
    
    # Fill with overall mean or mode if there are any such groups
    if all_nan_groups.any():
        if df[target_col].dtype in [np.float64, np.int64]:
            overall_fill_value = df[target_col].mean()
        else:
            overall_fill_value = df[target_col].mode().iloc[0] if not df[target_col].mode().empty else None
        df.loc[all_nan_groups, target_col] = df.loc[all_nan_groups, target_col].fillna(overall_fill_value)

    return df

# Example usage:
# df = fillna_groupby(df, 'CryoSleep', 'GroupNumber', method='mode')

In [763]:
fillna_groupby(df, "CryoSleep", "GroupNumber", method='mode')

missing(df)

Unnamed: 0,Missing Ratio
Transported,32.976099
Side,2.30532
CabinNumber,2.30532
Deck,2.30532
VIP,2.28219
FirstName,2.266769
LastName,2.266769
HomePlanet,2.220509
Destination,2.112567
Age,2.081727


In [764]:
df["Side"].value_counts()

Side
S    6381
P    6290
Name: count, dtype: int64

In [765]:
# fillna random for 'Side' because its a 50 50 split
import random
df["Side"] = df["Side"].fillna(random.choice(["P", "S"]))

missing(df)

Unnamed: 0,Missing Ratio
Transported,32.976099
CabinNumber,2.30532
Deck,2.30532
VIP,2.28219
FirstName,2.266769
LastName,2.266769
HomePlanet,2.220509
Destination,2.112567
Age,2.081727
FoodCourt,1.387818


In [766]:
fillna_groupby(df, "Deck", "GroupNumber", method='mode')

missing(df)

Unnamed: 0,Missing Ratio
Transported,32.976099
CabinNumber,2.30532
VIP,2.28219
FirstName,2.266769
LastName,2.266769
HomePlanet,2.220509
Destination,2.112567
Age,2.081727
FoodCourt,1.387818
Spa,1.364688


In [767]:
df.drop(columns=['CabinNumber', 'FirstName'], inplace=True)
missing(df)

Unnamed: 0,Missing Ratio
Transported,32.976099
VIP,2.28219
LastName,2.266769
HomePlanet,2.220509
Destination,2.112567
Age,2.081727
FoodCourt,1.387818
Spa,1.364688
VRDeck,1.364688
ShoppingMall,1.349268


In [768]:
fillna_groupby(df, "VIP", "GroupNumber", method='mode')
missing(df)

Unnamed: 0,Missing Ratio
Transported,32.976099
LastName,2.266769
HomePlanet,2.220509
Destination,2.112567
Age,2.081727
FoodCourt,1.387818
Spa,1.364688
VRDeck,1.364688
ShoppingMall,1.349268
RoomService,1.310717


In [769]:
fillna_groupby(df, "LastName", "GroupNumber", method='mode')
fillna_groupby(df, "HomePlanet", "GroupNumber", method='mode')
fillna_groupby(df, "Destination", "GroupNumber", method='mode')
fillna_groupby(df, "Age", "GroupNumber", method='mean')


missing(df)

Unnamed: 0,Missing Ratio
Transported,32.976099
FoodCourt,1.387818
Spa,1.364688
VRDeck,1.364688
ShoppingMall,1.349268
RoomService,1.310717


In [770]:
fillna_groupby(df, "FoodCourt", "GroupNumber", method='mean')

Unnamed: 0,GroupMemberNumber,GroupNumber,HomePlanet,CryoSleep,Side,Deck,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,LastName,Transported
0,1,1,Europa,True,P,B,TRAPPIST-1e,39.000000,False,0.0,0.0,0.0,0.0,0.0,Ofracculy,False
1,1,2,Earth,False,S,F,TRAPPIST-1e,24.000000,False,109.0,9.0,25.0,549.0,44.0,Vines,True
2,1,3,Europa,False,S,A,TRAPPIST-1e,58.000000,True,43.0,3576.0,0.0,6715.0,49.0,Susent,False
3,2,3,Europa,False,S,A,TRAPPIST-1e,33.000000,False,0.0,1283.0,371.0,3329.0,193.0,Susent,False
4,1,4,Earth,False,S,F,TRAPPIST-1e,16.000000,False,303.0,70.0,151.0,565.0,2.0,Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,2,6,Earth,True,S,G,TRAPPIST-1e,34.000000,False,0.0,0.0,0.0,0.0,0.0,Peter,
4273,1,9,Earth,False,S,F,TRAPPIST-1e,42.000000,False,0.0,847.0,17.0,10.0,144.0,Scheron,
4274,1,1,Mars,True,P,D,55 Cancri e,28.992051,False,0.0,0.0,0.0,0.0,0.0,Pore,
4275,1,3,Europa,False,P,D,TRAPPIST-1e,28.857608,False,0.0,2680.0,0.0,0.0,523.0,Conale,


In [771]:
fillna_groupby(df, "Spa", "GroupNumber", method='mean')

Unnamed: 0,GroupMemberNumber,GroupNumber,HomePlanet,CryoSleep,Side,Deck,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,LastName,Transported
0,1,1,Europa,True,P,B,TRAPPIST-1e,39.000000,False,0.0,0.0,0.0,0.0,0.0,Ofracculy,False
1,1,2,Earth,False,S,F,TRAPPIST-1e,24.000000,False,109.0,9.0,25.0,549.0,44.0,Vines,True
2,1,3,Europa,False,S,A,TRAPPIST-1e,58.000000,True,43.0,3576.0,0.0,6715.0,49.0,Susent,False
3,2,3,Europa,False,S,A,TRAPPIST-1e,33.000000,False,0.0,1283.0,371.0,3329.0,193.0,Susent,False
4,1,4,Earth,False,S,F,TRAPPIST-1e,16.000000,False,303.0,70.0,151.0,565.0,2.0,Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,2,6,Earth,True,S,G,TRAPPIST-1e,34.000000,False,0.0,0.0,0.0,0.0,0.0,Peter,
4273,1,9,Earth,False,S,F,TRAPPIST-1e,42.000000,False,0.0,847.0,17.0,10.0,144.0,Scheron,
4274,1,1,Mars,True,P,D,55 Cancri e,28.992051,False,0.0,0.0,0.0,0.0,0.0,Pore,
4275,1,3,Europa,False,P,D,TRAPPIST-1e,28.857608,False,0.0,2680.0,0.0,0.0,523.0,Conale,


In [772]:
fillna_groupby(df, "VRDeck", "GroupNumber", method='mean')

Unnamed: 0,GroupMemberNumber,GroupNumber,HomePlanet,CryoSleep,Side,Deck,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,LastName,Transported
0,1,1,Europa,True,P,B,TRAPPIST-1e,39.000000,False,0.0,0.0,0.0,0.0,0.0,Ofracculy,False
1,1,2,Earth,False,S,F,TRAPPIST-1e,24.000000,False,109.0,9.0,25.0,549.0,44.0,Vines,True
2,1,3,Europa,False,S,A,TRAPPIST-1e,58.000000,True,43.0,3576.0,0.0,6715.0,49.0,Susent,False
3,2,3,Europa,False,S,A,TRAPPIST-1e,33.000000,False,0.0,1283.0,371.0,3329.0,193.0,Susent,False
4,1,4,Earth,False,S,F,TRAPPIST-1e,16.000000,False,303.0,70.0,151.0,565.0,2.0,Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,2,6,Earth,True,S,G,TRAPPIST-1e,34.000000,False,0.0,0.0,0.0,0.0,0.0,Peter,
4273,1,9,Earth,False,S,F,TRAPPIST-1e,42.000000,False,0.0,847.0,17.0,10.0,144.0,Scheron,
4274,1,1,Mars,True,P,D,55 Cancri e,28.992051,False,0.0,0.0,0.0,0.0,0.0,Pore,
4275,1,3,Europa,False,P,D,TRAPPIST-1e,28.857608,False,0.0,2680.0,0.0,0.0,523.0,Conale,


In [773]:
fillna_groupby(df, "RoomService", "GroupNumber", method='mean')

Unnamed: 0,GroupMemberNumber,GroupNumber,HomePlanet,CryoSleep,Side,Deck,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,LastName,Transported
0,1,1,Europa,True,P,B,TRAPPIST-1e,39.000000,False,0.0,0.0,0.0,0.0,0.0,Ofracculy,False
1,1,2,Earth,False,S,F,TRAPPIST-1e,24.000000,False,109.0,9.0,25.0,549.0,44.0,Vines,True
2,1,3,Europa,False,S,A,TRAPPIST-1e,58.000000,True,43.0,3576.0,0.0,6715.0,49.0,Susent,False
3,2,3,Europa,False,S,A,TRAPPIST-1e,33.000000,False,0.0,1283.0,371.0,3329.0,193.0,Susent,False
4,1,4,Earth,False,S,F,TRAPPIST-1e,16.000000,False,303.0,70.0,151.0,565.0,2.0,Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,2,6,Earth,True,S,G,TRAPPIST-1e,34.000000,False,0.0,0.0,0.0,0.0,0.0,Peter,
4273,1,9,Earth,False,S,F,TRAPPIST-1e,42.000000,False,0.0,847.0,17.0,10.0,144.0,Scheron,
4274,1,1,Mars,True,P,D,55 Cancri e,28.992051,False,0.0,0.0,0.0,0.0,0.0,Pore,
4275,1,3,Europa,False,P,D,TRAPPIST-1e,28.857608,False,0.0,2680.0,0.0,0.0,523.0,Conale,


In [774]:
fillna_groupby(df, "ShoppingMall", "GroupNumber", method='mean')

Unnamed: 0,GroupMemberNumber,GroupNumber,HomePlanet,CryoSleep,Side,Deck,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,LastName,Transported
0,1,1,Europa,True,P,B,TRAPPIST-1e,39.000000,False,0.0,0.0,0.0,0.0,0.0,Ofracculy,False
1,1,2,Earth,False,S,F,TRAPPIST-1e,24.000000,False,109.0,9.0,25.0,549.0,44.0,Vines,True
2,1,3,Europa,False,S,A,TRAPPIST-1e,58.000000,True,43.0,3576.0,0.0,6715.0,49.0,Susent,False
3,2,3,Europa,False,S,A,TRAPPIST-1e,33.000000,False,0.0,1283.0,371.0,3329.0,193.0,Susent,False
4,1,4,Earth,False,S,F,TRAPPIST-1e,16.000000,False,303.0,70.0,151.0,565.0,2.0,Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,2,6,Earth,True,S,G,TRAPPIST-1e,34.000000,False,0.0,0.0,0.0,0.0,0.0,Peter,
4273,1,9,Earth,False,S,F,TRAPPIST-1e,42.000000,False,0.0,847.0,17.0,10.0,144.0,Scheron,
4274,1,1,Mars,True,P,D,55 Cancri e,28.992051,False,0.0,0.0,0.0,0.0,0.0,Pore,
4275,1,3,Europa,False,P,D,TRAPPIST-1e,28.857608,False,0.0,2680.0,0.0,0.0,523.0,Conale,


In [775]:
missing(df)

Unnamed: 0,Missing Ratio
Transported,32.976099


In [776]:
df.dtypes

GroupMemberNumber     object
GroupNumber           object
HomePlanet            object
CryoSleep               bool
Side                  object
Deck                  object
Destination           object
Age                  float64
VIP                     bool
RoomService          float64
FoodCourt            float64
ShoppingMall         float64
Spa                  float64
VRDeck               float64
LastName              object
Transported           object
dtype: object

In [777]:
df.head()

Unnamed: 0,GroupMemberNumber,GroupNumber,HomePlanet,CryoSleep,Side,Deck,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,LastName,Transported
0,1,1,Europa,True,P,B,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Ofracculy,False
1,1,2,Earth,False,S,F,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Vines,True
2,1,3,Europa,False,S,A,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Susent,False
3,2,3,Europa,False,S,A,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Susent,False
4,1,4,Earth,False,S,F,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Santantines,True


In [778]:
df.drop(columns=['GroupMemberNumber'], inplace=True)

In [779]:
# df.drop(columns=['LastName'], inplace=True)

## Simple OHE

In [780]:
# One-hot encode categorical columns excluding "Transported"
columns_to_encode = df.select_dtypes(include=['object']).columns.drop('Transported')
df = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)

In [781]:
print(df.shape)

(12970, 2435)


In [782]:
df.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,GroupNumber_1,...,LastName_Wooterston,LastName_Workmans,LastName_Workmanson,LastName_Wrempeedly,LastName_Wriggins,LastName_Wynneyerson,LastName_Yanton,LastName_Yatters,LastName_Yorkland,LastName_Youngrayes
0,True,39.0,False,0.0,0.0,0.0,0.0,0.0,False,True,...,False,False,False,False,False,False,False,False,False,False
1,False,24.0,False,109.0,9.0,25.0,549.0,44.0,True,False,...,False,False,False,False,False,False,False,False,False,False
2,False,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,16.0,False,303.0,70.0,151.0,565.0,2.0,True,False,...,False,False,False,False,False,False,False,False,False,False


In [783]:
df.dtypes

CryoSleep                  bool
Age                     float64
VIP                        bool
RoomService             float64
FoodCourt               float64
                         ...   
LastName_Wynneyerson       bool
LastName_Yanton            bool
LastName_Yatters           bool
LastName_Yorkland          bool
LastName_Youngrayes        bool
Length: 2435, dtype: object

In [784]:
train = df[df["Transported"].notnull()]
test = df[df["Transported"].isnull()]

In [785]:
from sklearn.model_selection import train_test_split

X = train.drop(columns=["Transported"])
y = train["Transported"]

mapping = { False: 0,True:1}
y = y.map(mapping)


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)


In [787]:
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'learning_rate': [0.1, 0.05, 0.01],  # Including a smaller step around the default
    'n_estimators': [50, 100, 150, 300],  # Expanding the range around the default
    'max_depth': [3, 4, 5, 7],  # Adding intermediate depths
    'subsample': [0.8, 1],  # Subsampling of the training instances
    'colsample_bytree': [0.8, 1],  # Subsampling of columns for each tree
}

# Create the XGBClassifier model
model = XGBClassifier(random_state=0)

# Create the GridSearchCV object
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit the GridSearchCV object to the data

grid_search.fit(X_train, y_train)

model = grid_search.best_estimator_

y_val_pred = model.predict(X_val)

val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy}')


Fitting 5 folds for each of 192 candidates, totalling 960 fits
Validation Accuracy: 0.7987349051178838


In [789]:
best_model = model

X_test = test.drop(columns=["Transported"])

y_test = best_model.predict(X_test)

mapping = {0: False, 1: True}
y_test = pd.Series(y_test).map(mapping)

finaltest = pd.read_csv('test.csv')

y_test = pd.Series(y_test, name="Transported")
submission = pd.concat([finaltest["PassengerId"], y_test], axis=1)

submission.to_csv('submission.csv', index=False)

