In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier 

In [3]:
train_df = pd.read_csv('Data/train.csv')
test_df = pd.read_csv('Data/test.csv')

In [4]:
print("train dataset dimension:",train_df.shape)
print("test dataset dimension:",test_df.shape)

train dataset dimension: (891, 12)
test dataset dimension: (418, 11)


In [5]:
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
test_df.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [7]:
train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
test_df['Age'].fillna(test_df['Age'].median(), inplace=True)

In [8]:
# Fill missing embarked values with most common value
train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)

In [9]:
# Fill missing fare values in test set with median
test_df['Fare'].fillna(test_df['Fare'].median(), inplace=True) 

In [10]:
# Drop 'Cabin' column as it has too many missing values
train_df.drop('Cabin', axis=1, inplace=True)
test_df.drop('Cabin', axis=1, inplace=True)


In [11]:
train_df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [12]:
test_df.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [13]:
# Convert categorical features to numeric (Label Encoding)
le = LabelEncoder()
train_df['Sex'] = le.fit_transform(train_df['Sex'])
test_df['Sex'] = le.transform(test_df['Sex'])

train_df['Embarked'] = le.fit_transform(train_df['Embarked'])
test_df['Embarked'] = le.transform(test_df['Embarked'])

In [14]:
# Select features and target variable
X = train_df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
y = train_df['Survived']

In [15]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Dictionary to store accuracy of each model
model_objects = {}

In [17]:
# 1. Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_val)
model_objects['Logistic Regression'] = lr  # Store the model object


In [18]:
# 2. Support Vector Machine
svc = SVC()
svc.fit(X_train, y_train)
y_pred_svc = svc.predict(X_val)
model_objects['SVM'] = svc  # Store the model object

In [19]:
# 3. K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_val)
model_objects['KNN'] = knn  # Store the model object

In [20]:
# 4. Random Forest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)
model_objects['Random Forest'] = rf  # Store the model object

In [21]:
# 5. Gradient Boosting
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_val)
model_objects['Gradient Boosting'] = gb  # Store the model object

In [23]:
# Print model accuracy scores
print("Model Performance Comparison:")
for model, acc in model_objects.items():
    print(f"{model}: {acc * 100:.2f}%")

Model Performance Comparison:


TypeError: unsupported operand type(s) for *: 'LogisticRegression' and 'int'

In [24]:
# Find the best model by accuracy
best_model_name = max(model_accuracy, key=model_accuracy.get)
best_model = model_objects[best_model_name]  # Get the actual model object

NameError: name 'model_accuracy' is not defined

In [42]:
# Ensure the test dataset has the same features as the training set
test_df_clean = test_df.drop(['Name', 'PassengerId', 'Ticket'], axis=1)

# Ensure columns match the training set
test_df_clean = test_df_clean.reindex(columns=X_train.columns, fill_value=0)

# Use the best model for predictions on the test set
test_predictions = best_model.predict(test_df_clean)


In [44]:
# Create the submission file
submission = pd.DataFrame({'PassengerId': pd.read_csv('Data/test.csv')['PassengerId'], 'Survived': test_predictions})
submission.to_csv('titanic_submission.csv', index=False)

print(f"Submission file created using the best model: {best_model_name}")

Submission file created using the best model: Logistic Regression


In [1]:
submit = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': test_predictions})
submit.head()

NameError: name 'pd' is not defined