In [36]:
from google.colab import files
import pandas as pd

import io

# Upload files. `files.upload()` returns a dict mapping each uploaded file's
# name to its raw bytes. When a file with the same name already exists on the
# Colab disk, the new copy is written under a suffixed name (for example
# "train (3).csv"), so reading 'train.csv' from disk would silently load an
# older upload instead of the one just provided.
uploaded = files.upload()


def _read_uploaded_csv(filename):
    """Return `filename` as a DataFrame, preferring the bytes just uploaded.

    Falls back to reading `filename` from the working directory when it was
    not part of this upload (e.g. the upload dialog was skipped because the
    file is already on disk).
    """
    if filename in uploaded:
        return pd.read_csv(io.BytesIO(uploaded[filename]))
    return pd.read_csv(filename)


# Load the datasets from the fresh upload
train_data = _read_uploaded_csv('train.csv')
test_data = _read_uploaded_csv('test.csv')

# Display the first few rows of the train dataset to check the data
print(train_data.head())


Saving gender_submission.csv to gender_submission (3).csv
Saving test.csv to test (3).csv
Saving train.csv to train (3).csv
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250  

In [37]:
# Check for missing values in the training dataset
print(train_data.isnull().sum())

# Display data types of each column.
# DataFrame.info() writes its report straight to stdout and returns None, so
# it must not be wrapped in print() -- that would append a stray "None" line.
train_data.info()


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
Non

In [38]:
# --- Preprocessing and feature engineering -----------------------------------
# Everything that is "learned" from data (imputation values, the set of
# embarkation ports, the fare-bin edges) is computed from the TRAINING set only
# and then applied unchanged to both frames. This keeps the two feature
# matrices on the same scale and with the same columns, and avoids using
# test-set statistics during preprocessing.
#
# NOTE: this cell consumes the raw columns and rebinds `train_data` /
# `test_data`, so it can only be run once per load; re-run the loading cell
# before running it again.

_RAW_COLUMNS = {'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'}
for _label, _frame in (('train_data', train_data), ('test_data', test_data)):
    _missing = _RAW_COLUMNS - set(_frame.columns)
    if _missing:
        raise RuntimeError(
            f"{_label} is missing raw columns {sorted(_missing)}; "
            "reload the CSV files before re-running the preprocessing cell."
        )

# Titles that are too infrequent to model individually
RARE_TITLES = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']

# Map the titles to numerical values
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}


def _engineer_features(frame, age_fill, fare_fill, embarked_fill, embarked_levels, fare_edges):
    """Return a model-ready copy of a raw Titanic frame.

    Parameters
    ----------
    frame : DataFrame with the raw Kaggle columns (train or test).
    age_fill, fare_fill : values used to replace missing Age / Fare.
    embarked_fill : value used to replace missing Embarked.
    embarked_levels : ordered list of Embarked categories; the first one is
        the dropped reference level of the one-hot encoding.
    fare_edges : increasing bin edges used to discretise Fare into FareBin.

    Returns
    -------
    DataFrame with SibSp, Parch, Name, Fare, Ticket and Cabin removed and the
    engineered columns (Embarked_*, FamilySize, IsAlone, Title, FareBin,
    HasCabin) appended. The input frame is not modified.
    """
    out = frame.copy()

    # Fill missing values
    out['Age'] = out['Age'].fillna(age_fill)
    out['Fare'] = out['Fare'].fillna(fare_fill)
    out['Embarked'] = out['Embarked'].fillna(embarked_fill)

    # Encode 'Sex' (Male = 0, Female = 1)
    out['Sex'] = out['Sex'].map({'male': 0, 'female': 1})

    # One-hot encode 'Embarked'. Fixing the category list guarantees that both
    # frames get identical dummy columns even if one of them lacks a port.
    out['Embarked'] = pd.Categorical(out['Embarked'], categories=embarked_levels)
    out = pd.get_dummies(out, columns=['Embarked'], drop_first=True)

    # Create FamilySize and IsAlone features (+1 to include the passenger)
    out['FamilySize'] = out['SibSp'] + out['Parch'] + 1
    out['IsAlone'] = (~out['FamilySize'].gt(1)).astype(int)

    # Extract the title from the Name column (raw string: `\.` is a regex
    # escape, not a Python string escape), then collapse rare/variant titles.
    titles = out['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    titles = (
        titles
        .replace(RARE_TITLES, 'Rare')
        .replace(['Mlle', 'Ms'], 'Miss')
        .replace('Mme', 'Mrs')
    )
    # Any title not covered by the mapping is treated as 'Rare' rather than
    # being left as NaN, which the model could not consume.
    out['Title'] = titles.map(title_mapping).fillna(title_mapping['Rare']).astype(int)

    # Create Fare Bins from the shared edges
    out['FareBin'] = pd.cut(out['Fare'], bins=fare_edges, labels=False)

    # Create 'HasCabin' feature (1 when a cabin is recorded, else 0)
    out['HasCabin'] = out['Cabin'].notna().astype(int)

    # Drop unnecessary columns
    return out.drop(columns=['SibSp', 'Parch', 'Name', 'Fare', 'Ticket', 'Cabin'])


# Quartile edges of the training fares; the outer edges are opened up so that
# fares outside the training range still land in the first / last bin.
_, _fare_edges = pd.qcut(train_data['Fare'], 4, labels=False, retbins=True)
_fare_edges = _fare_edges.astype(float)
_fare_edges[0], _fare_edges[-1] = float('-inf'), float('inf')

_fitted = dict(
    age_fill=train_data['Age'].median(),
    fare_fill=train_data['Fare'].median(),
    embarked_fill=train_data['Embarked'].mode()[0],
    embarked_levels=sorted(train_data['Embarked'].dropna().unique()),
    fare_edges=_fare_edges,
)

# PassengerId is kept in the test frame because the submission file needs it
train_data = _engineer_features(train_data, **_fitted).drop(columns=['PassengerId'])
test_data = _engineer_features(test_data, **_fitted)

# Sanity checks: same feature columns in the same order, and no missing values
assert list(train_data.columns.drop('Survived')) == list(test_data.columns.drop('PassengerId')), \
    "train and test feature columns differ"
assert not train_data.isnull().any().any(), "train_data still contains missing values"
assert not test_data.isnull().any().any(), "test_data still contains missing values"


In [39]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictor columns
y = train_data['Survived']  # Target
X = train_data.drop(columns=['Survived'])  # Features

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each resulting partition
print(*(part.shape for part in (X_train, X_val, y_train, y_val)))


(712, 10) (179, 10) (712,) (179,)


In [40]:
from sklearn.linear_model import LogisticRegression

# Fit a baseline logistic-regression classifier on the training split
# (max_iter=1000 caps the number of solver iterations)
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train, y=y_train)


In [41]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Score the held-out validation rows with the fitted baseline model
y_pred = model.predict(X_val)

# Compute the three validation summaries, each from (true labels, predictions)
accuracy, conf_matrix, class_report = (
    metric(y_val, y_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

# Print evaluation metrics, each heading followed by its value on the next line
print(f'Accuracy: {accuracy:.4f}')
print('Confusion Matrix:', conf_matrix, sep='\n')
print('Classification Report:', class_report, sep='\n')


Accuracy: 0.7989
Confusion Matrix:
[[87 18]
 [18 56]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.83      0.83       105
           1       0.76      0.76      0.76        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



In [42]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values to search over
param_grid = dict(
    C=[0.1, 1, 10, 100],     # Inverse regularization strength (smaller = stronger)
    penalty=['l1', 'l2'],    # Penalty types
    solver=['liblinear'],    # Solver that supports both l1 and l2 penalties
)

# Base estimator whose hyperparameters are tuned
log_reg = LogisticRegression(max_iter=1000)

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy,
# using all available CPU cores
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validation accuracy
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")


Best Parameters: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best Cross-Validation Accuracy: 0.8230


In [43]:
# Retrain the model using the best hyperparameters found by the grid search.
# The values are read from `grid_search.best_params_` rather than copied by
# hand from the printed output, so this cell stays correct if the grid, the
# data, or the cross-validation result changes.
best_log_reg = LogisticRegression(max_iter=1000, **grid_search.best_params_)
best_log_reg.fit(X_train, y_train)

# Make predictions on the validation set
y_pred_best = best_log_reg.predict(X_val)

# Evaluate the model
accuracy_best = accuracy_score(y_val, y_pred_best)
conf_matrix_best = confusion_matrix(y_val, y_pred_best)
class_report_best = classification_report(y_val, y_pred_best)

# Print evaluation metrics
print(f"Accuracy with best hyperparameters: {accuracy_best:.4f}")
print("Confusion Matrix:")
print(conf_matrix_best)
print("Classification Report:")
print(class_report_best)


Accuracy with best hyperparameters: 0.7989
Confusion Matrix:
[[87 18]
 [18 56]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.83      0.83       105
           1       0.76      0.76      0.76        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



In [44]:
# Make predictions on the test data.
# The feature matrix is selected by the training column list, so the model is
# guaranteed to see exactly the columns it was fitted on, in the same order;
# a column missing from the test frame fails here with a KeyError naming it.
test_features = test_data[X_train.columns]
test_predictions = best_log_reg.predict(test_features)

# Prepare the submission file
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],  # PassengerId must be included in the submission
    'Survived': test_predictions
})

# Save the submission file
submission.to_csv('titanic_submission.csv', index=False)

# Download the submission file for uploading to Kaggle
files.download('titanic_submission.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>