In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the passenger data
passengers = pd.read_csv("passengers.csv")
# print(passengers)


# Clean the Data

passengers['Sex'] = passengers['Sex'].map({'female': 1, 'male': 0}) # Updating sex column to numerical
# print(passengers.head())

# print(passengers['Age'].values)

passengers['Age'].fillna(value=passengers['Age'].mean(), inplace=True) # filling the nan values in the age column with the mean age

# print(passengers['Age'].values)
# print(passengers)

passengers['FirstClass']  = passengers['Pclass'].apply(lambda x: 1 if x == 1 else 0)
# print(passengers.head())

passengers['SecondClass']  = passengers['Pclass'].apply(lambda x: 1 if x == 2 else 0)
# print(passengers.head(10))


# Select and Split the Data

features =  passengers[['Sex','Age', 'FirstClass', 'SecondClass']]
survival = passengers['Survived']

X_train, X_test, y_train, y_test = train_test_split(features, survival, test_size = 0.2)


# Normalize the Data

scaler = StandardScaler()
train_features = scaler.fit_transform(X_train)
test_features = scaler.transform(X_test)


# Create and Evaluate the Model

model = LogisticRegression()
model.fit(X_train, y_train)

train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print('The coefficient of determination for the train data is', train_score)
print('The coefficient of determination for the test data is', test_score)

print('The coefficients for the model are', model.coef_) # printing the coefficients of the model

# printing each feature with its respective coefficient value 
print(list(zip(['Sex','Age','FirstClass','SecondClass'], model.coef_[0]))) 


# Predict with the Model

Jack = np.array([0.0,20.0,0.0,0.0])
Rose = np.array([1.0,17.0,1.0,0.0])
You = np.array([1.0,25.0,1.0,0.0])

sample_passengers = np.array([Jack, Rose, You])

sample_passengers = scaler.transform(sample_passengers)

print(model.predict(sample_passengers))

print(model.predict_proba(sample_passengers))



The coefficient of determination for the train data is 0.800561797752809
The coefficient of determination for the test data is 0.7821229050279329
The coefficients for the model are [[ 2.5800937  -0.02120746  2.05544805  1.00959032]]
[('Sex', 2.5800936964569376), ('Age', -0.02120745934686665), ('FirstClass', 2.055448052839403), ('SecondClass', 1.009590321991327)]
[0 1 1]
[[0.99461315 0.00538685]
 [0.0065098  0.9934902 ]
 [0.00659468 0.99340532]]
