<a href="https://colab.research.google.com/github/sammyhasan17/Titanic_Logistical_Regression/blob/main/Logistical_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Let's import our libraries

In [30]:
# Sammy Hasan-Silva
# Machine Learning on Titanic Dataset

# Logistic regression is a type of statistical model used to analyze and classify data with one or more
# independent variables that may or may not be correlated with a binary or categorical outcome variable.
# The aim of logistic regression is to predict the probability of a particular outcome
# based on the values of the independent variables.
# Here we use passenger class, age group, and sex to predict whether a passenger survived.

!pip install pydataset # install sample datasets and imports titanic
from pydataset import data

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split



# Let's see what our raw data looks like

In [19]:
# Load the Titanic passenger table via pydataset
titanic = data('titanic')

# Peek at five randomly chosen rows before any preprocessing
raw_preview = titanic.sample(5)
print("before data prep:\n", raw_preview)


before data prep:
           class     age    sex survived
648   3rd class  adults    man      yes
715   3rd class  adults    man       no
529   2nd class  adults  women      yes
1087  3rd class  adults  women      yes
149   1st class  adults    man       no


# How many passengers are in our dataset?

In [20]:
# Number of passenger rows in the dataset
titanic.shape[0]

1316

# Let's transform our data using Pandas to make it easier to read

In [44]:
# Feature engineering / data preparation with pandas:
# one-hot encode every categorical column. drop_first=True drops the first
# level of each column (1st class, adults, man, no), which becomes the
# implicit baseline.
titanic = pd.get_dummies(
    data=titanic,
    drop_first=True,
)
titanic.sample(5)

Unnamed: 0,class_2nd class,class_3rd class,age_child,sex_women,survived_yes
1097,False,True,False,True,True
108,False,False,False,False,False
1133,False,True,False,True,True
125,False,False,False,False,False
478,True,False,False,False,False


# Now let's split our data into training & testing sets (we'll convert them to NumPy arrays before fitting)

In [22]:
# Split the data into train and test sets. We hold out 6% of the rows for
# testing here (scikit-learn's default would be 25%).
features = titanic.drop(columns='survived_yes')
target = titanic['survived_yes']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.06,   # 6% test data
    random_state=42,  # fixed seed so the split is reproducible
)

# Put the test features and their label side by side for easier viewing;
# assign() returns a new frame, so X_test itself is left untouched.
test_sample = X_test.assign(survived_yes=y_test)

print(test_sample.head(25))



      class_2nd class  class_3rd class  age_child  sex_women  survived_yes
199             False            False      False       True          True
559              True            False      False       True          True
372              True            False      False      False         False
290             False            False      False       True          True
185             False            False      False       True          True
740             False             True      False      False         False
343              True            False      False      False         False
1033            False             True      False      False         False
77              False            False      False      False         False
968             False             True      False      False         False
287             False            False      False       True          True
1195            False             True      False       True         False
1053            False    

# Let's feed our data into our Logistic Regression Model

In [36]:
# Pull the training features and labels out of pandas as plain NumPy arrays
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()

# Build the logistic-regression classifier (L-BFGS solver) and fit it
LogReg = LogisticRegression(solver='lbfgs')
LogReg.fit(X_train_np, y_train_np)

# Let's use our model to predict who survived

In [37]:
# Would a 1st-class girl (child, female) survive according to the model?
# Feature order: [class_2nd class, class_3rd class, age_child, sex_women]
first_class_girl = np.array([[0, 0, 1, 1]])
LogReg.predict(first_class_girl)[0]

np.True_

In [38]:
# Would a 3rd-class adult man survive according to the model?
# Feature order: [class_2nd class, class_3rd class, age_child, sex_women]
third_class_man = np.array([[0, 1, 0, 0]])
LogReg.predict(third_class_man)[0]

np.False_

# How accurate is our model on the test set?

In [43]:
# Pull the held-out features and labels out of pandas as NumPy arrays
X_test_np = X_test.to_numpy()
y_test_np = y_test.to_numpy()

# Accuracy by hand: take column 1 of predict_proba, label a row as a
# survivor when that probability exceeds 0.5, then compute the fraction of
# rows where the label matches the truth.
positive_proba = LogReg.predict_proba(X_test_np)[:, 1]
prediction = (positive_proba > 0.5).astype(int)
matches = prediction == y_test_np
manual_accuracy = matches.sum() / matches.size
print("manual_accuracy:", manual_accuracy)

# Same metric through the estimator's own score() method, for comparison
builtin_accuracy = LogReg.score(X_test_np, y_test_np)
print("builtin_accuracy:", builtin_accuracy)


manual_accuracy: 0.810126582278481
builtin_accuracy: 0.810126582278481
