In [362]:
import pandas as pd


In [363]:
# Load the dataset
data = pd.read_csv('train.csv')

In [364]:
# Check for duplicate rows in the Titanic dataset
duplicates = data.duplicated()

# Print the number of duplicate rows
print(f"Number of duplicate rows: {duplicates.sum()}")

# If there are duplicates, remove them
if duplicates.sum() > 0:
    data.drop_duplicates(inplace=True)
    print("Duplicates removed.")

# Verify if duplicates are removed
print(f"Number of duplicate rows after removal: {data.duplicated().sum()}")


Number of duplicate rows: 0
Number of duplicate rows after removal: 0


In [365]:
# Display the first few rows of the dataset
data.drop(['Cabin', 'Ticket', 'PassengerId'], axis=1, inplace=True)
data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


## 1. Exploring the Data (EDA):

In [366]:
# Cell: Check the shape of the dataset
data.shape


(891, 9)

In [367]:
# Cell: Get an overview of the data
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Fare      891 non-null    float64
 8   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [368]:
# Cell: Summarize the dataset
data.describe()


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [369]:
# Cell: Check for missing values
data.isnull().sum()


Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [370]:
# Cell: Look at unique values in categorical columns
print(data['Sex'].unique())
print(data['Embarked'].unique())


['male' 'female']
['S' 'C' 'Q' nan]


In [371]:
# Convert 'Sex' column to numerical: 0 for male, 1 for female
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

# Fill missing values in 'Embarked' with the most common value (without inplace=True)
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# Convert 'Embarked' into one-hot encoding
data = pd.get_dummies(data, columns=['Embarked'], drop_first=True)

# Fill missing 'Age' values with the median (without inplace=True)
data['Age'] = data['Age'].fillna(data['Age'].median())


## 2.Feature Selection and Data Preparation

In [372]:
print(data.columns)


Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked_Q', 'Embarked_S'],
      dtype='object')


In [373]:
# Drop the 'Name' column as it is not useful for prediction
data = data.drop(['Name'], axis=1)

# Features (X) and target (y)
X = data.drop('Survived', axis=1)  # Features are everything except 'Survived'
y = data['Survived']  # Target is the 'Survived' column

# Split data into training and testing sets (80% train, 20% test)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## 3. Model Selection and Training

In [374]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Scale the features
scaler = StandardScaler()
# Fit the scaler on the training data and transform it
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

# Transform the test data
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Initialize the model
model = LogisticRegression(max_iter=1000)

# Fit the model to the training data
model.fit(X_train, y_train)


In [375]:
# Make predictions
y_pred = model.predict(X_test_scaled)

In [376]:
from sklearn.metrics import accuracy_score

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.68


# Evaluate Model Performance:

Confusion Matrix: Create a confusion matrix to visualize the performance of your model in detail, including true positives, true negatives, false positives, and false negatives.

In [377]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


Confusion Matrix:
 [[55 50]
 [ 7 67]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.52      0.66       105
           1       0.57      0.91      0.70        74

    accuracy                           0.68       179
   macro avg       0.73      0.71      0.68       179
weighted avg       0.76      0.68      0.68       179



### Feature Importance:

In [378]:
# Feature coefficients
importance = model.coef_[0]
feature_importance = pd.DataFrame(importance, index=X.columns, columns=["Coefficient"])
print(feature_importance)


            Coefficient
Pclass        -0.937896
Sex            2.590495
Age           -0.030605
SibSp         -0.295471
Parch         -0.108140
Fare           0.002579
Embarked_Q    -0.113437
Embarked_S    -0.399792


### Hyperparameter Tuning:

In [379]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'liblinear']
}
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'C': 1, 'solver': 'liblinear'}


### Cross-Validation:

In [380]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model, X, y, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())


Cross-Validation Scores: [0.7877095  0.78651685 0.78651685 0.76966292 0.8258427 ]
Mean CV Score: 0.7912497646098802


### Model Deployment:

In [381]:
import joblib

# Save model
joblib.dump(model, 'titanic_model.pkl')


['titanic_model.pkl']

In [382]:
# Check class distribution in training data
import pandas as pd

# Assuming y_train is already defined and is a pandas Series
print("Class distribution in training data:")
print(y_train.value_counts())


Class distribution in training data:
Survived
0    444
1    268
Name: count, dtype: int64
