In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
gender_filr_url = r'https://raw.githubusercontent.com/sedeba19/Titanic-Dataset-from-Kaggle/main/data/gender_submission.csv'
test_file_url = r'https://raw.githubusercontent.com/sedeba19/Titanic-Dataset-from-Kaggle/main/data/test.csv'
train_file_url = r'https://raw.githubusercontent.com/sedeba19/Titanic-Dataset-from-Kaggle/main/data/train.csv'

In [3]:
gender_df = pd.read_csv(gender_filr_url)
test_df = pd.read_csv(test_file_url)
train_df = pd.read_csv(train_file_url)

In [4]:
train_df.shape, test_df.shape, gender_df.shape

((891, 12), (418, 11), (418, 2))

In [5]:
train_df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
test_df.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [7]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
# Drop the columns that has a higher value of null values
train_df.drop(columns=['Cabin'], inplace=True)
test_df.drop(columns=['Cabin'], inplace=True)

# Fill the column age with the mean value
train_df['Age'].fillna(train_df['Age'].mean(), inplace=True)
test_df['Age'].fillna(test_df['Age'].mean(), inplace=True)



In [9]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [10]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S


In [11]:
gender_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [12]:
# Split the train data into X and y variables
X_train = train_df.copy()
X_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,S
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C


In [13]:
y = X_train.pop('Survived')

In [14]:
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [15]:
X_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C
2,3,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,S
4,5,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,S
...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,S
889,890,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C


In [16]:
X_train.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Embarked        object
dtype: object

In [17]:
X_train.select_dtypes(include= 'number').columns

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [18]:
X_train.select_dtypes(include= 'object').columns

Index(['Name', 'Sex', 'Ticket', 'Embarked'], dtype='object')

In [19]:
X_train.select_dtypes(include= 'object').columns

Index(['Name', 'Sex', 'Ticket', 'Embarked'], dtype='object')

In [20]:
# Create a subset from train_df
X_train_subset = X_train[['Sex', 'Embarked']]
X_train_subset

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S
...,...,...
886,male,S
887,female,S
888,female,S
889,male,C


In [21]:
# Get dummy variables for the categorical columns
cat_X_train = pd.get_dummies(X_train_subset)
cat_X_train

Unnamed: 0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,1,0,0,1
1,1,0,1,0,0
2,1,0,0,0,1
3,1,0,0,0,1
4,0,1,0,0,1
...,...,...,...,...,...
886,0,1,0,0,1
887,1,0,0,0,1
888,1,0,0,0,1
889,0,1,1,0,0


In [22]:
X_train.select_dtypes(include= 'number').columns

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [23]:
# Normalize the numerical values
num_X_train = X_train.select_dtypes(include= 'number')
num_X_train

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
0,1,3,22.000000,1,0,7.2500
1,2,1,38.000000,1,0,71.2833
2,3,3,26.000000,0,0,7.9250
3,4,1,35.000000,1,0,53.1000
4,5,3,35.000000,0,0,8.0500
...,...,...,...,...,...,...
886,887,2,27.000000,0,0,13.0000
887,888,1,19.000000,0,0,30.0000
888,889,3,29.699118,1,2,23.4500
889,890,1,26.000000,0,0,30.0000


In [24]:
# Use min-max scaler to normalize the numerical values
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
num_X_train_scaled = scaler.fit_transform(num_X_train)
num_X_train_scaled

array([[0.        , 1.        , 0.27117366, 0.125     , 0.        ,
        0.01415106],
       [0.0011236 , 0.        , 0.4722292 , 0.125     , 0.        ,
        0.13913574],
       [0.00224719, 1.        , 0.32143755, 0.        , 0.        ,
        0.01546857],
       ...,
       [0.99775281, 1.        , 0.36792055, 0.125     , 0.33333333,
        0.04577135],
       [0.9988764 , 0.        , 0.32143755, 0.        , 0.        ,
        0.0585561 ],
       [1.        , 1.        , 0.39683338, 0.        , 0.        ,
        0.01512699]])

In [25]:
# Concate the numerical and categorical values
import numpy as np
X_train = np.concatenate((num_X_train_scaled, cat_X_train), axis=1)
X_train

array([[0.        , 1.        , 0.27117366, ..., 0.        , 0.        ,
        1.        ],
       [0.0011236 , 0.        , 0.4722292 , ..., 1.        , 0.        ,
        0.        ],
       [0.00224719, 1.        , 0.32143755, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.99775281, 1.        , 0.36792055, ..., 0.        , 0.        ,
        1.        ],
       [0.9988764 , 0.        , 0.32143755, ..., 1.        , 0.        ,
        0.        ],
       [1.        , 1.        , 0.39683338, ..., 0.        , 1.        ,
        0.        ]])

In [26]:
# Instantiate the model
rfc_model = RandomForestClassifier(n_estimators=100, 
                                   max_depth=5, 
                                   random_state=1)
rfc_model.fit(X_train, y)

In [27]:
test_df.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           1
Embarked       0
dtype: int64

In [28]:
test_df['Fare'].fillna(test_df['Fare'].mean(), inplace=True)

In [29]:
test_df.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [30]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S


In [31]:
# test_df.select_dtypes(include= 'number').columns
num_X_test = test_df.select_dtypes(include= 'number')
num_X_test

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
0,892,3,34.50000,0,0,7.8292
1,893,3,47.00000,1,0,7.0000
2,894,2,62.00000,0,0,9.6875
3,895,3,27.00000,0,0,8.6625
4,896,3,22.00000,1,1,12.2875
...,...,...,...,...,...,...
413,1305,3,30.27259,0,0,8.0500
414,1306,1,39.00000,0,0,108.9000
415,1307,3,38.50000,0,0,7.2500
416,1308,3,30.27259,0,0,8.0500


In [32]:
# Use minmax scaler for the numerical values test data
num_X_test_scaled = scaler.fit_transform(num_X_test)
num_X_test_scaled


array([[0.        , 1.        , 0.4527232 , 0.        , 0.        ,
        0.01528158],
       [0.00239808, 1.        , 0.61756561, 0.125     , 0.        ,
        0.01366309],
       [0.00479616, 0.5       , 0.8153765 , 0.        , 0.        ,
        0.01890874],
       ...,
       [0.99520384, 1.        , 0.50547277, 0.        , 0.        ,
        0.01415106],
       [0.99760192, 1.        , 0.39697468, 0.        , 0.        ,
        0.01571255],
       [1.        , 1.        , 0.39697468, 0.125     , 0.11111111,
        0.0436405 ]])

In [33]:
# Get dummy variables for the categorical columns for test data
cat_X_test = pd.get_dummies(test_df[['Sex', 'Embarked']])
cat_X_test

Unnamed: 0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,1,0,1,0
1,1,0,0,0,1
2,0,1,0,1,0
3,0,1,0,0,1
4,1,0,0,0,1
...,...,...,...,...,...
413,0,1,0,0,1
414,1,0,1,0,0
415,0,1,0,0,1
416,0,1,0,0,1


In [34]:
# Concate the numerical and categorical values of test data
X_test = np.concatenate((num_X_test_scaled, cat_X_test), axis=1)
X_test

array([[0.        , 1.        , 0.4527232 , ..., 0.        , 1.        ,
        0.        ],
       [0.00239808, 1.        , 0.61756561, ..., 0.        , 0.        ,
        1.        ],
       [0.00479616, 0.5       , 0.8153765 , ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.99520384, 1.        , 0.50547277, ..., 0.        , 0.        ,
        1.        ],
       [0.99760192, 1.        , 0.39697468, ..., 0.        , 0.        ,
        1.        ],
       [1.        , 1.        , 0.39697468, ..., 1.        , 0.        ,
        0.        ]])

In [35]:
# Make predictions  on test data
rfc_model.predict(X_test)

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [36]:
# Accuracy score
from sklearn.metrics import accuracy_score
acc_score = accuracy_score(gender_df['Survived'], rfc_model.predict(X_test))
acc_score

0.9569377990430622

In [37]:
# Create a submission file for kaggle
gender_df['Survived'] = rfc_model.predict(X_test)
gender_df
gender_df.to_csv('submission.csv', index=False)