In [1]:
# Step 1: Import pandas library
import pandas as pd
import numpy as np

# Read the training data; 'train.csv' is expected to sit in the notebook's working directory
titanic_data = pd.read_csv(filepath_or_buffer='train.csv')

# Preview the first six rows as a quick sanity check of the load
titanic_data.head(n=6)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q


In [2]:
# Per-column null counts, keeping only the columns that actually contain nulls
titanic_data.isna().sum().loc[lambda null_counts: null_counts > 0]

Age         177
Cabin       687
Embarked      2
dtype: int64

In [3]:
# Replace null ages with the median of the 'Age' column
titanic_data['Age'] = titanic_data['Age'].fillna(value=titanic_data['Age'].median())

In [4]:
# Share of rows, in percent, whose 'Cabin' entry is null
cabin_missing_percentage = titanic_data['Cabin'].isna().sum() / titanic_data.shape[0] * 100

In [5]:
# Display the percentage of null 'Cabin' entries
cabin_missing_percentage

77.10437710437711

In [6]:
# Remove the 'Cabin' column by rebinding rather than mutating in place.
# errors='ignore' makes this cell safe to re-run: once the column is gone a
# second execution is a no-op instead of raising KeyError.
titanic_data = titanic_data.drop(columns=['Cabin'], errors='ignore')

In [7]:
# Show the first five rows of the current frame
titanic_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [8]:
# List the column labels currently present in the DataFrame
titanic_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked'],
      dtype='object')

In [9]:
# Count nulls per column and keep only the columns with at least one null
missing_values = titanic_data.isna().sum().loc[lambda null_counts: null_counts > 0]

print(missing_values)

Embarked    2
dtype: int64


In [10]:
# mode() returns the most frequent value(s) of 'Embarked'; take the first entry
most_frequent_port = titanic_data['Embarked'].mode().iloc[0]

# Replace nulls in 'Embarked' with that value
titanic_data['Embarked'] = titanic_data['Embarked'].fillna(value=most_frequent_port)

# Confirm nothing is left to fill; the printed count should be 0
print(titanic_data['Embarked'].isna().sum())

0


In [11]:
# Print the null count of every column as a final completeness check
print(titanic_data.isna().sum())

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [12]:
# Show the first five rows of the cleaned frame
titanic_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [13]:
# Expand 'Embarked' and 'Sex' into indicator columns; drop_first=True omits the
# first level of each column
titanic_data_encoded = pd.get_dummies(
    data=titanic_data,
    columns=['Embarked', 'Sex'],
    drop_first=True,
)

In [14]:
# Show the first five rows of the encoded frame
titanic_data_encoded.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Embarked_Q,Embarked_S,Sex_male
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,False,True,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,False,False,False
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,False,True,False
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,False,True,False
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,False,True,True


In [15]:
# FamilySize = SibSp + Parch + 1, written straight into the encoded frame
titanic_data_encoded['FamilySize'] = (
    titanic_data_encoded['SibSp'].add(titanic_data_encoded['Parch']).add(1)
)

# IsAlone is 1 when FamilySize is exactly 1, otherwise 0
titanic_data_encoded['IsAlone'] = titanic_data_encoded['FamilySize'].eq(1).astype(int)

# Print the source columns next to the derived ones for a quick check
print(titanic_data_encoded.loc[:, ['SibSp', 'Parch', 'FamilySize', 'IsAlone']].head())

   SibSp  Parch  FamilySize  IsAlone
0      1      0           2        0
1      1      0           2        0
2      0      0           1        1
3      1      0           2        0
4      0      0           1        1


In [16]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the target and the three columns excluded from modelling
X = titanic_data_encoded.drop(['Survived', 'PassengerId', 'Name', 'Ticket'], axis=1)
# Target vector
y = titanic_data_encoded.loc[:, 'Survived']

# Hold out 20% of the rows for validation; random_state pins the shuffle
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

# Report both shapes to confirm the split
print(f'Training data shape: {X_train.shape}, Validation data shape: {X_val.shape}')

Training data shape: (712, 10), Validation data shape: (179, 10)


In [17]:
# Display the validation feature frame produced by the split
X_val

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Sex_male,FamilySize,IsAlone
395,3,22.0,0,0,7.7958,False,True,True,1,1
85,3,33.0,3,0,15.8500,False,True,False,4,0
201,3,28.0,8,2,69.5500,False,True,True,11,0
542,3,11.0,4,2,31.2750,False,True,False,7,0
702,3,18.0,0,1,14.4542,False,False,False,2,0
...,...,...,...,...,...,...,...,...,...,...
840,3,20.0,0,0,7.9250,False,True,True,1,1
134,2,25.0,0,0,13.0000,False,True,True,1,1
792,3,28.0,8,2,69.5500,False,True,False,11,0
592,3,47.0,0,0,7.2500,False,True,True,1,1


In [18]:
# Add LogFare = log(1 + Fare) to both splits; log1p stays defined when Fare is 0
for split_frame in (X_train, X_val):
    split_frame['LogFare'] = np.log1p(split_frame['Fare'])

In [19]:
# Build the same two interaction columns on the training and validation splits
for split_frame in (X_train, X_val):
    # Age x Sex_male
    split_frame['Age_Sex'] = split_frame['Age'].mul(split_frame['Sex_male'])
    # Pclass x LogFare
    split_frame['Pclass_LogFare'] = split_frame['Pclass'].mul(split_frame['LogFare'])

In [20]:
# Step 1: Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# StandardScaler is used below to standardize the numeric features
from sklearn.preprocessing import StandardScaler

# Columns to standardize. Defined once so that fitting on the training split and
# transforming the validation split always use the same columns in the same order.
features_to_scale = ['Age', 'LogFare', 'Age_Sex', 'Pclass_LogFare', 'Pclass',
                     'SibSp', 'Parch', 'FamilySize', 'IsAlone']

# Fit the scaler on the training split only, then apply the fitted parameters to
# the validation split.
# NOTE: both frames are overwritten in place, so re-running this cell without first
# re-running the split and feature cells would standardize already-standardized values.
scaler = StandardScaler()
X_train[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
X_val[features_to_scale] = scaler.transform(X_val[features_to_scale])

# Logistic regression with an L2 penalty. C=0.1 is stronger regularization than
# scikit-learn's default of C=1.0 (C is the inverse regularization strength).
log_reg = LogisticRegression(penalty='l2', C=0.1, max_iter=1000, random_state=42)

# Train on the training split
log_reg.fit(X_train, y_train)

# Predict on the validation split
y_pred = log_reg.predict(X_val)

# Overall accuracy on the validation split
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.4f}')

# Confusion matrix of true labels against predictions
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Per-class precision, recall and F1
print("Classification Report:")
print(classification_report(y_val, y_pred))

Validation Accuracy: 0.7877
Confusion Matrix:
[[84 25]
 [13 57]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.77      0.82       109
           1       0.70      0.81      0.75        70

    accuracy                           0.79       179
   macro avg       0.78      0.79      0.78       179
weighted avg       0.80      0.79      0.79       179



In [21]:
import pandas as pd

# Read the test set from 'test.csv' in the notebook's working directory
test_data = pd.read_csv(filepath_or_buffer='test.csv')

# Print the leading rows to inspect the columns that were loaded
print(test_data.head(n=5))

   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315154   8.6625   NaN        S  
4  22.0      1      1  3101298  12.2875   NaN        S  


In [22]:
# Impute with statistics taken from the training data (titanic_data), not from the
# test file itself, so the test set goes through the same preprocessing the model
# was trained with.
# titanic_data['Age'] was already median-imputed earlier in the notebook; filling
# nulls with the median leaves the column's median unchanged, so this is still the
# training median.
test_data['Age'] = test_data['Age'].fillna(titanic_data['Age'].median())

# Same for 'Fare': use the training-data median
test_data['Fare'] = test_data['Fare'].fillna(titanic_data['Fare'].median())

# Drop 'Cabin' to mirror the training preprocessing. Rebinding with errors='ignore'
# keeps the cell re-runnable: a second run is a no-op instead of raising KeyError.
test_data = test_data.drop(columns=['Cabin'], errors='ignore')

# Check again to ensure missing values are handled
print(test_data.isnull().sum())

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [23]:
# Expand 'Sex' and 'Embarked' into indicator columns; drop_first=True omits the
# first level of each column
test_data_encoded = pd.get_dummies(
    data=test_data,
    columns=['Sex', 'Embarked'],
    drop_first=True,
)

# Print the leading rows to inspect the encoded columns
print(test_data_encoded.head(n=5))

   PassengerId  Pclass                                          Name   Age  \
0          892       3                              Kelly, Mr. James  34.5   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  47.0   
2          894       2                     Myles, Mr. Thomas Francis  62.0   
3          895       3                              Wirz, Mr. Albert  27.0   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  22.0   

   SibSp  Parch   Ticket     Fare  Sex_male  Embarked_Q  Embarked_S  
0      0      0   330911   7.8292      True        True       False  
1      1      0   363272   7.0000     False       False        True  
2      0      0   240276   9.6875      True        True       False  
3      0      0   315154   8.6625      True       False        True  
4      1      1  3101298  12.2875     False       False        True  


In [24]:
# FamilySize = SibSp + Parch + 1; IsAlone is 1 when FamilySize is exactly 1, otherwise 0
test_data_encoded['FamilySize'] = (
    test_data_encoded['SibSp'].add(test_data_encoded['Parch']).add(1)
)
test_data_encoded['IsAlone'] = test_data_encoded['FamilySize'].eq(1).astype(int)

# LogFare = log(1 + Fare)
test_data_encoded['LogFare'] = np.log1p(test_data_encoded['Fare'])

# Interaction columns: Age x Sex_male and Pclass x LogFare
test_data_encoded['Age_Sex'] = test_data_encoded['Age'].mul(test_data_encoded['Sex_male'])
test_data_encoded['Pclass_LogFare'] = test_data_encoded['Pclass'].mul(test_data_encoded['LogFare'])

# Print the derived columns for the leading rows as a quick check
print(test_data_encoded.loc[:, ['FamilySize', 'IsAlone', 'LogFare', 'Age_Sex', 'Pclass_LogFare']].head())

   FamilySize  IsAlone   LogFare  Age_Sex  Pclass_LogFare
0           1        1  2.178064     34.5        6.534193
1           2        0  2.079442      0.0        6.238325
2           1        1  2.369075     62.0        4.738150
3           1        1  2.268252     27.0        6.804757
4           3        0  2.586824      0.0        7.760471


In [25]:
from sklearn.preprocessing import StandardScaler

# Columns to pass through the scaler
features_to_scale = [
    'Age', 'LogFare', 'Age_Sex', 'Pclass_LogFare', 'Pclass',
    'SibSp', 'Parch', 'FamilySize', 'IsAlone',
]

# Work on a copy so test_data_encoded keeps its unscaled values, then apply the
# existing `scaler` object (it is only used to transform here, never re-fitted)
X_test_scaled = test_data_encoded.copy()
X_test_scaled[features_to_scale] = scaler.transform(X_test_scaled[features_to_scale])

# Print the scaled columns for the leading rows as a quick check
print(X_test_scaled.loc[:, features_to_scale].head())

        Age   LogFare   Age_Sex  Pclass_LogFare    Pclass     SibSp     Parch  \
0  0.351842 -0.783718  0.794254        0.148276  0.838307 -0.495084 -0.440919   
1  1.295177 -0.883805 -1.126690        0.005740  0.838307  0.514944 -0.440919   
2  2.427178 -0.589871  2.325440       -0.716979 -0.348322 -0.495084 -0.440919   
3 -0.214159 -0.692191  0.376657        0.278622  0.838307 -0.495084 -0.440919   
4 -0.591493 -0.368889 -1.126690        0.739043  0.838307  0.514944  0.819862   

   FamilySize   IsAlone  
0   -0.558229  0.790930  
1    0.106418 -1.264334  
2   -0.558229  0.790930  
3   -0.558229  0.790930  
4    0.771066 -1.264334  


In [27]:
# Columns handed to the model, in the order they are selected below
features = [
    'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
    'Embarked_Q', 'Embarked_S', 'Sex_male',
    'FamilySize', 'IsAlone',
    'LogFare', 'Age_Sex', 'Pclass_LogFare',
]

# Restrict the test frame to exactly those columns
X_test_scaled = X_test_scaled.loc[:, features]

# Predict with the logistic regression model
test_predictions = log_reg.predict(X_test_scaled)

# Print the first 10 predictions
print(test_predictions[:10])

[0 1 0 0 1 0 1 0 1 0]


In [28]:
# Pair each PassengerId from the test frame with its predicted 'Survived' label
submission = test_data_encoded[['PassengerId']].assign(Survived=test_predictions)

# Write the submission file without the index column
submission.to_csv('submission.csv', index=False)

# Print the leading rows as a final check of the file's contents
print(submission.head())

   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1
