<a href="https://colab.research.google.com/github/sammyhasan17/Titanic_Logistical_Regression/blob/main/Logistical_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Let's import our libraries

In [1]:
# Sammy Hasan-Silva
# Machine Learning on Titanic Dataset

# Logistic regression is a type of statistical model used to analyze and classify data with one or more
# independent variables that may or may not be correlated with a binary or categorical outcome variable.
# The aim of logistic regression is to predict the probability of a particular outcome
# based on the values of the independent variables.
# use age & gender class to see if they survived

!pip install pydataset # install sample datasets and imports titanic
from pydataset import data

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

Collecting pydataset
  Downloading pydataset-0.2.0.tar.gz (15.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pydataset
  Building wheel for pydataset (setup.py) ... [?25l[?25hdone
  Created wheel for pydataset: filename=pydataset-0.2.0-py3-none-any.whl size=15939415 sha256=e97cc850894fc1efa8b6505e1b3004e9de88e78b6dcf8b969e030127d5fe1fb0
  Stored in directory: /root/.cache/pip/wheels/29/93/3f/af54c413cecaac292940342c61882d2a8848674175d0bb0889
Successfully built pydataset
Installing collected packages: pydataset
Successfully installed pydataset-0.2.0
initiated datasets repo at: /root/.pydataset/


# Lets see what our raw data looks like

In [61]:
# Get data
titanic = data('titanic')
print("before data prep:\n", titanic.sample(5))


before data prep:
          class     age    sex survived
691  3rd class  adults    man       no
865  3rd class  adults    man       no
522  2nd class  adults  women      yes
466  2nd class  adults    man       no
733  3rd class  adults    man       no


1316

In [62]:
# How many passengers are in our dataset?
len(titanic)

1316

# Let's transform our data to make it easier for our ML Model

In [60]:
# feature engineering/data preperation using pandas
titanic = pd.get_dummies(titanic, drop_first = True) # removes my first class
titanic.sample(5)

Unnamed: 0,class_2nd class,class_3rd class,age_child,sex_women,survived_yes
325,False,False,True,True,True
262,False,False,False,True,True
197,False,False,False,True,True
772,False,True,False,False,False
587,True,False,True,False,True


#  Now let's split our data into Training & Testing sets

In [51]:
# Split data into train and test, default 25% is test set

X_train, X_test, y_train, y_test = train_test_split(
    titanic.drop('survived_yes', axis=1),
    titanic['survived_yes'],
    test_size=0.06,   # 10% test data
    random_state=42  # for reproducibility
)


print(X_test, y_test)

      class_2nd class  class_3rd class  age_child  sex_women
199             False            False      False       True
559              True            False      False       True
372              True            False      False      False
290             False            False      False       True
185             False            False      False       True
740             False             True      False      False
343              True            False      False      False
1033            False             True      False      False
77              False            False      False      False
968             False             True      False      False
287             False            False      False       True
1195            False             True      False       True
1053            False             True      False      False
391              True            False      False      False
840             False             True      False      False
395              True   

# Let's feed our data into our Logistic Regression Model

In [52]:
# Train the model using the training data
LogReg = LogisticRegression(solver='lbfgs')
LogReg.fit(X_train, y_train)

# Lets use ML to see predict who survived

In [53]:
# Predict if a class-1 child-age girl survived
LogReg.predict(np.array([[0,0,1,1]]))[0]



np.True_

In [54]:
# Predicting if a class-3 adult age male survived
LogReg.predict(np.array([[0,1,0,0]]))[0]



np.False_

How accurate is our model?

In [55]:
# Manual accuracy calculation by applying a threshold to predicted probabilities
# (Use predict_proba to get probabilities, then classify with threshold 0.5)
prediction = (LogReg.predict_proba(X_test)[:, 1] > 0.5).astype(int)
manual_accuracy = np.sum(prediction == y_test) / len(y_test)
print("manual_accuracy: ", manual_accuracy)


# Built-in accuracy score method uses the model's default prediction (threshold 0.5 internally)
builtin_accuracy = LogReg.score(X_test, y_test)
print("builtin_accuracy ", builtin_accuracy)
# manual_accuracy and builtin_accuracy should match when using threshold 0.5


manual_accuracy:  0.810126582278481
builtin_accuracy  0.810126582278481
