# **1. Import Libraries**

In [1]:
# import necessary libraries
import pandas as pd
import numpy as np

In [2]:
# Ask for the test CSV through Colab's browser upload widget.
# `uploaded` keeps whatever the widget returns so a later cell can read the file bytes.
from google.colab import files

uploaded = files.upload()

Saving test.csv to test (2).csv


In [3]:
import io

# The upload log shows the file being saved as "test (2).csv", so relying on a
# fixed "test.csv" key is fragile. Use that key when it exists; otherwise fall
# back to the single uploaded file, and fail with a clear message if the
# choice is ambiguous or nothing was uploaded.
if "test.csv" in uploaded:
    csv_bytes = uploaded["test.csv"]
elif len(uploaded) == 1:
    csv_bytes = next(iter(uploaded.values()))
else:
    raise KeyError(
        "Could not find 'test.csv' among the uploaded files: "
        + repr(sorted(uploaded))
    )

# Decode the raw bytes as UTF-8 text and parse them as CSV.
test_df = pd.read_csv(io.StringIO(csv_bytes.decode('utf-8')))

# show the first rows of the test dataset
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# **2. Data Preprocessing**

In [4]:
# shape of the test dataset as a (rows, columns) tuple
test_df.shape

(418, 11)

In [5]:
# summary of the test dataset: column names, non-null counts and dtypes
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [6]:
# count the missing values in every column
test_df.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [7]:
# Impute the missing values: "Age" gets the column median, "Fare" the column mean.
age_median = test_df['Age'].median()
fare_mean = test_df['Fare'].mean()

test_df['Age'] = test_df['Age'].fillna(age_median)
test_df['Fare'] = test_df['Fare'].fillna(fare_mean)

In [8]:
# remove the columns that the rest of this notebook does not use
unused_columns = ["Cabin", "Name", "Ticket", "PassengerId"]
test_df = test_df.drop(columns=unused_columns)

In [9]:
# add "Family_Member" as the element-wise sum of the "Parch" and "SibSp" columns
test_df["Family_Member"] = test_df['Parch'].add(test_df['SibSp'])

In [10]:
# "SibSp" and "Parch" are already summed into "Family_Member", so remove them
merged_columns = ["SibSp", "Parch"]
test_df = test_df.drop(columns=merged_columns)

In [11]:
# confirm how many missing values remain in each column
test_df.isna().sum()

Pclass           0
Sex              0
Age              0
Fare             0
Embarked         0
Family_Member    0
dtype: int64

In [12]:
# preview the first five rows after cleaning and feature engineering
test_df.head(5)

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Family_Member
0,3,male,34.5,7.8292,Q,0
1,3,female,47.0,7.0,S,1
2,2,male,62.0,9.6875,Q,0
3,3,male,27.0,8.6625,S,0
4,3,female,22.0,12.2875,S,2


In [13]:
# take an independent copy of the object-dtype (text) columns
object_dtypes = ['object']
obj_test_df = test_df.select_dtypes(include=object_dtypes).copy()

obj_test_df.head(5)

Unnamed: 0,Sex,Embarked
0,male,Q
1,female,S
2,male,Q
3,male,S
4,female,S


In [14]:
# replace each text column with integer codes
# NOTE(review): the encoder is fitted on this test file, so the codes depend on
# which values occur here; presumably training used the same mapping — confirm.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for categorical_col in obj_test_df.columns:
    # the same encoder object is refitted for every column
    encoded_values = le.fit_transform(test_df[categorical_col])
    test_df[categorical_col] = encoded_values

In [15]:
# preview the first five rows after encoding the text columns
test_df.head(5)

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Family_Member
0,3,1,34.5,7.8292,1,0
1,3,0,47.0,7.0,2,1
2,2,1,62.0,9.6875,1,0
3,3,1,27.0,8.6625,2,0
4,3,0,22.0,12.2875,2,2


# **3. Load Trained Model**

In [16]:
# load the previously trained model from disk
# NOTE(review): joblib.load unpickles the file, which can run arbitrary code —
# only load a model file from a source you trust.
import joblib

MODEL_PATH = 'xgboost_titanic.pkl'
loaded_model = joblib.load(MODEL_PATH)

# **4. Start Prediction**

In [17]:
# run the loaded model on the preprocessed test frame and keep its predictions
# NOTE(review): presumably the column order must match the training features — confirm
pred_y=loaded_model.predict(test_df)

In [18]:
# display the array of predicted labels returned by the model
pred_y

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [19]:
# attach the predictions as a new "Survived" column and cast that column to int
test_df = test_df.assign(Survived=pred_y).astype({"Survived": int})

In [20]:
# preview the first five rows with the new "Survived" column
test_df.head(5)

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Family_Member,Survived
0,3,1,34.5,7.8292,1,0,0
1,3,0,47.0,7.0,2,1,0
2,2,1,62.0,9.6875,1,0,0
3,3,1,27.0,8.6625,2,0,0
4,3,0,22.0,12.2875,2,2,1


In [21]:
# drop every column except "Survived"
feature_columns = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Family_Member']
test_df = test_df.drop(columns=feature_columns)

In [22]:
# preview the first five rows of the prediction-only frame
test_df.head(5)

Unnamed: 0,Survived
0,0
1,0
2,0
3,0
4,1


# **5. Create CSV file**

In [23]:
# Save the predictions to a CSV file together with the passenger IDs.
# Writing test_df as-is would emit only an unnamed positional index next to
# "Survived", because "PassengerId" was dropped during preprocessing, so a
# prediction could not be matched back to its passenger.
# No rows were removed or reordered in the cells above, so the IDs read from
# the uploaded file line up with the predictions by position.
if "test.csv" in uploaded:
    raw_csv_bytes = uploaded["test.csv"]
else:
    raw_csv_bytes = next(iter(uploaded.values()))

passenger_ids = pd.read_csv(
    io.BytesIO(raw_csv_bytes), usecols=["PassengerId"]
)["PassengerId"]

# guard against pairing predictions with IDs from a different file
if len(passenger_ids) != len(test_df):
    raise ValueError(
        "PassengerId count (%d) does not match prediction count (%d)"
        % (len(passenger_ids), len(test_df))
    )

submission_df = pd.DataFrame(
    {
        "PassengerId": passenger_ids.values,
        "Survived": test_df["Survived"].values,
    }
)

# index=False keeps the meaningless 0..n-1 row index out of the file
submission_df.to_csv("final_test.csv", index=False)