<a href="https://www.kaggle.com/code/venkatasaigudisa/6-step-titanic?scriptVersionId=145538137" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# These imports will help us do most of the math required
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# This is just to filter out any warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# read in the data and get a quick, broad look at it

data = pd.read_csv('/kaggle/input/titanic/train.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


**This project will use a simple 6 step process to build a model for this data set**

1. Identify input and target columns
2. Impute missing values in numeric columns
3. Scale numeric values
4. Encode categorical values
5. Split dataset and build model
6. Test with testing dataset

In [4]:
# step 1 identify input and target columns

input_df = data[['Sex', 'Age', 'Cabin', 'Parch', 'Fare', 'SibSp', 'Pclass']]

numeric_cols = ['Age', 'Parch', 'Fare', 'SibSp', 'Pclass']
categorical_cols = ['Sex','Cabin']

targets = data['Survived']

In [5]:
# step 2 impute (fill in) any missing values in the numeric columns 
# with the columns' median

missing_counts = input_df[numeric_cols].isna().sum().sort_values(ascending=False)

imputer = SimpleImputer(strategy='median')
imputer.fit(data[numeric_cols])
input_df[numeric_cols] = imputer.transform(input_df[numeric_cols])

input_df.describe().loc[['min', 'max']]

Unnamed: 0,Age,Parch,Fare,SibSp,Pclass
min,0.42,0.0,0.0,0.0,1.0
max,80.0,6.0,512.3292,8.0,3.0


In [6]:
# step 3 scale the numeric columns in order to 
# minimize the change in weight of the numbers

scaler = MinMaxScaler()
scaler.fit(data[numeric_cols])
input_df[numeric_cols] = scaler.transform(input_df[numeric_cols])

input_df

Unnamed: 0,Sex,Age,Cabin,Parch,Fare,SibSp,Pclass
0,male,0.271174,,0.000000,0.014151,0.125,1.0
1,female,0.472229,C85,0.000000,0.139136,0.125,0.0
2,female,0.321438,,0.000000,0.015469,0.000,1.0
3,female,0.434531,C123,0.000000,0.103644,0.125,0.0
4,male,0.434531,,0.000000,0.015713,0.000,1.0
...,...,...,...,...,...,...,...
886,male,0.334004,,0.000000,0.025374,0.000,0.5
887,female,0.233476,B42,0.000000,0.058556,0.000,0.0
888,female,0.346569,,0.333333,0.045771,0.125,1.0
889,male,0.321438,C148,0.000000,0.058556,0.000,0.0


In [7]:
# step 4 encode categorical data using OneHotEncoder

input_df['cabin_let'] = data.Cabin.apply(lambda x: str(x)[0] if str(x)[0] is not 'T' else 'n')

categorical_cols = categorical_cols[0:1] + ['cabin_let']

encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoder.fit(input_df[categorical_cols])

encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
# encoded_cols.remove('cabin_let_T')
input_df[encoded_cols] = encoder.transform(input_df[categorical_cols])

encoded_cols

['Sex_female',
 'Sex_male',
 'cabin_let_A',
 'cabin_let_B',
 'cabin_let_C',
 'cabin_let_D',
 'cabin_let_E',
 'cabin_let_F',
 'cabin_let_G',
 'cabin_let_n']

In [8]:
# step 5 split data into train and validation sets then 
# create, fit, and score a logistic regression model for the data

X_train, X_test, Y_train, Y_test = train_test_split(input_df[numeric_cols + encoded_cols], 
                                                    targets, test_size=0.25, random_state = 42)
logreg = LogisticRegression().fit(X_train, Y_train)
score = logreg.score(X_test, Y_test) * 100

print('The model has a score of {score:.2f}%'.format(score=score))

The model has a score of 78.92%


In [9]:
# step 6, test the testing dataset

def tester(file_name, test_name):
    test = pd.read_csv(file_name)
    test_df = test[['Sex', 'Age', 'Cabin', 'Parch', 'Fare', 'SibSp', 'Pclass']]
    test_df['cabin_let'] = test.Cabin.apply(lambda x: str(x)[0])
    
    imputer = SimpleImputer(strategy='median')
    imputer.fit(test[numeric_cols])
    test_df[numeric_cols] = imputer.transform(test_df[numeric_cols])
    
    scaler = MinMaxScaler()
    scaler.fit(test[numeric_cols])
    test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])
    
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    encoder.fit(test_df[categorical_cols])

    encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
    test_df[encoded_cols] = encoder.transform(test_df[categorical_cols])
    
    score_test = pd.read_csv(test_name)
    scores = score_test['Survived'].to_numpy()
    
    return logreg.score(test_df[numeric_cols + encoded_cols], scores)
    
test_score = tester('/kaggle/input/titanic/test.csv', '/kaggle/input/titanic/gender_submission.csv')
print('The model has a{test_score: .2f}% success rate on the testing dataset'.format(test_score=test_score*100))

The model has a 96.41% success rate on the testing dataset
