In [None]:
#import necessary libraries
import pandas as pd  # For data manipulation and analysis
import matplotlib.pyplot as plt  # For data visualization
import seaborn as sns  # For statistical data visualization
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

Training Data Exploration

In [None]:
# Load the training CSV into a DataFrame and preview its leading rows.
trainTitanicData = pd.read_csv('../data/train.csv')
print("The first few rows of the data", trainTitanicData.head(), sep="\n")

In [None]:
# Count the null entries in every column of the training frame.
trainMissingData = trainTitanicData.isna().sum()
print("\nMissing values in each column:", trainMissingData, sep="\n")

In [None]:
# Summary statistics of the training frame, as produced by DataFrame.describe().
trainSummaryStats = trainTitanicData.describe()
print("\nSummary stats for numeric features", trainSummaryStats, sep="\n")

In [None]:
# Visualise the data: boxplots of the float64/int64 columns of the training frame.
trainNumericColumns = trainTitanicData.select_dtypes(include=['float64', 'int64'])
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=trainNumericColumns, ax=ax)
ax.set_title("Boxplots for numerical columns")
plt.show()

Training Data Preprocessing

In [None]:
# Make a copy of the data so the imputation below works on its own object.
# NOTE: the copy is bound to the same name, so the unprocessed frame can only
# be recovered by re-running the cell that loads the training CSV.
trainTitanicData = trainTitanicData.copy()

# Handle missing values.
# Replace missing Age values with the median age. The result is assigned back
# to the column instead of calling fillna(inplace=True) on the selection:
# that chained in-place form emits a FutureWarning on pandas 2.x and does not
# update the frame once Copy-on-Write is the default (pandas 3.0).
trainTitanicData['Age'] = trainTitanicData['Age'].fillna(trainTitanicData['Age'].median())

# Replace missing Embarked values with the most common value.
trainTitanicData['Embarked'] = trainTitanicData['Embarked'].fillna(trainTitanicData['Embarked'].mode()[0])

# Cabin has many missing values so the column is dropped. errors='ignore'
# avoids a KeyError if this cell is run again after Cabin is already gone.
trainTitanicData = trainTitanicData.drop(columns='Cabin', errors='ignore')

# Check if there are still missing values.
print("Confirming the number of missing values")
print(trainTitanicData.isnull().sum())

In [None]:
# Convert the categorical columns to indicator columns. drop_first=True
# omits the first level of each encoded column.
categoricalColumns = ['Sex', 'Embarked']
trainTitanicData = pd.get_dummies(
    trainTitanicData,
    columns=categoricalColumns,
    drop_first=True,
)
print(trainTitanicData.head())


In [77]:
# Separate the label column (Survived) from the remaining feature columns.
Y = trainTitanicData['Survived']
X = trainTitanicData.drop(columns='Survived')

# Hold out 20% of the rows for validation; random_state is fixed so the
# split is the same on every run.
xTrain, xVal, yTrain, yVal = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

Test Data Preprocessing

In [None]:
# Load the test CSV into a DataFrame and preview its leading rows.
testTitanicData = pd.read_csv('../data/test.csv')
print("Overview of the test data", testTitanicData.head(), sep="\n")

In [None]:
# Count the null entries in every column of the test frame.
testMissingData = testTitanicData.isna().sum()
print("Missing data in each column for the test data", testMissingData, sep="\n")

In [None]:
# Summary statistics of the test frame, as produced by DataFrame.describe().
testSummaryStats = testTitanicData.describe()
print("\nSummary stats for numeric features", testSummaryStats, sep="\n")

In [None]:
# Visualise the data: boxplots of the float64/int64 columns of the test frame.
testNumericColumns = testTitanicData.select_dtypes(include=['float64', 'int64'])
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=testNumericColumns, ax=ax)
ax.set_title("Boxplots for numerical columns")
plt.show()

In [None]:
# Make a copy of the data so the imputation below works on its own object.
# NOTE: the copy is bound to the same name, so the unprocessed frame can only
# be recovered by re-running the cell that loads the test CSV.
testTitanicData = testTitanicData.copy()

# Handle missing values.
# Replace missing Age values with the median age. The result is assigned back
# to the column instead of calling fillna(inplace=True) on the selection:
# that chained in-place form emits a FutureWarning on pandas 2.x and does not
# update the frame once Copy-on-Write is the default (pandas 3.0).
# NOTE(review): the fill values here are computed from the test frame itself,
# not from the training frame - confirm that this is intended.
testTitanicData['Age'] = testTitanicData['Age'].fillna(testTitanicData['Age'].median())

# Replace missing Fare values with the median fare.
testTitanicData['Fare'] = testTitanicData['Fare'].fillna(testTitanicData['Fare'].median())

# Cabin was dropped in training so drop it here in test too; it also has a
# lot of missing values. errors='ignore' avoids a KeyError if this cell is
# run again after Cabin is already gone.
testTitanicData = testTitanicData.drop(columns='Cabin', errors='ignore')

# Check if there are still missing values.
print("Confirming the number of missing values")
print(testTitanicData.isnull().sum())

In [None]:
# Convert the categorical columns to indicator columns with the same
# arguments used for the training frame. drop_first=True omits the first
# level of each encoded column.
categoricalColumns = ['Sex', 'Embarked']
testTitanicData = pd.get_dummies(
    testTitanicData,
    columns=categoricalColumns,
    drop_first=True,
)
print(testTitanicData.head())