In [None]:
pip install numpy pandas scikit-learn matplotlib seaborn

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
df=pd.read_csv('titanic_dataset.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.describe()

In [None]:
sns.heatmap(df.isnull(),cmap = 'magma',cbar = False);

In [None]:
df.isnull().sum()

In [None]:
duplicate_rows = df[df.duplicated()]
duplicate_rows

In [None]:
df.drop(['Cabin', 'PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

In [None]:
### convert the gender to binary 0 and 1
df['Sex']=df['Sex'].replace({'male':1,'female':0})
df.head()

In [None]:
df['Embarked']=df['Embarked'].replace({'S':1,'C':2,'Q':3})
df.head()

In [None]:
# Fill missing values in age column by imputing the median
df['Age'].fillna(df['Age'].median(), inplace=True)
df.isna().sum()

In [None]:
# Fill missing values in embarked column by imputing the mode
df["Embarked"].fillna(df["Embarked"].mode()[0], inplace=True)
df.isna().sum()

In [None]:
# Family size
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# Is alone
df['IsAlone'] = 1  # default to alone
df.loc[df['FamilySize'] > 1, 'IsAlone'] = 0

# Age bins
df['AgeBin'] = pd.cut(df['Age'], bins=[0, 12, 20, 40, 60, 80], labels=False)

# Fare bins
df['FareBin'] = pd.qcut(df['Fare'], 4, labels=False)


In [None]:
df.drop(['Age', 'Fare'], axis=1, inplace=True)

In [None]:
plt.figure(figsize=(10, 6))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.show()

In [None]:
# prompt: Do a Co-Relation analysis among the independent variables and drop variables if they are closely related. (Drop one column and retain another if the co-relation co-efficient is > +-.7)

# Calculate the correlation matrix
corr_matrix = df.corr()

# Create a boolean mask for the upper triangle of the correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find columns to drop based on the correlation threshold
to_drop = [column for column in upper.columns if any(abs(upper[column]) > 0.7)]

# Drop the identified columns
df_reduced = df.drop(to_drop, axis=1)

print("Original columns:", df.columns.tolist())
print("Columns to drop due to high correlation:", to_drop)
print("Remaining columns after dropping highly correlated ones:", df_reduced.columns.tolist())

# Display the correlation heatmap of the reduced dataframe
plt.figure(figsize=(10, 6))
sns.heatmap(df_reduced.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix after dropping highly correlated variables')
plt.show()

In [None]:
# prompt: Do standard / min-max scaling on numerical features. (Scale your variables so that no-single variables have more effect on the result)

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Identify numerical features for scaling (excluding the target variable 'Survived' if it exists)
numerical_features = df_reduced.select_dtypes(include=np.number).columns.tolist()
if 'Survived' in numerical_features:
    numerical_features.remove('Survived') # Assuming 'Survived' is the target

# Standard Scaling (Z-score normalization)
scaler_standard = StandardScaler()
df_standard_scaled = df_reduced.copy()
df_standard_scaled[numerical_features] = scaler_standard.fit_transform(df_standard_scaled[numerical_features])

print("DataFrame after Standard Scaling:")
print(df_standard_scaled.head())

# Min-Max Scaling
scaler_minmax = MinMaxScaler()
df_minmax_scaled = df_reduced.copy()
df_minmax_scaled[numerical_features] = scaler_minmax.fit_transform(df_minmax_scaled[numerical_features])

print("\nDataFrame after Min-Max Scaling:")
print(df_minmax_scaled.head())

# You can now use either df_standard_scaled or df_minmax_scaled for your downstream modeling.

In [219]:
# prompt: divide it into train test split validate

from sklearn.model_selection import train_test_split

# Assuming 'Survived' is your target variable
if 'Survived' in df_reduced.columns:
    X = df_reduced.drop('Survived', axis=1)
    y = df_reduced['Survived']
else:
    # Handle the case where 'Survived' is not in the reduced DataFrame
    # This might happen if 'Survived' was highly correlated with another feature and dropped
    # Or if 'Survived' was not intended to be the target in the reduced set
    # For now, let's assume you want to split the entire df_reduced if no target is specified
    X = df_reduced
    y = None
    print("Warning: 'Survived' column not found in the reduced dataframe. Splitting the entire dataframe.")


# Split data into training and test sets (e.g., 70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y if y is not None else None) # Use stratify if y is present and for classification tasks

# Split the training set further into training and validation sets (e.g., 80% train, 20% validation)
# This results in approximately 70% * 80% = 56% train, 70% * 20% = 14% validation, 30% test
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train if y_train is not None else None)

print("Shapes of the splits:")
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_test shape:", X_test.shape)
if y is not None:
    print("y_train shape:", y_train.shape)
    print("y_val shape:", y_val.shape)
    print("y_test shape:", y_test.shape)

Shapes of the splits:
X_train shape: (498, 8)
X_val shape: (125, 8)
X_test shape: (268, 8)
y_train shape: (498,)
y_val shape: (125,)
y_test shape: (268,)


In [None]:
from sklearn.model_selection import train_test_split

X = df.drop('Survived', axis=1)
y = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# prompt: perform logistic regression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize and train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

print('\nClassification Report:')
print(classification_report(y_test, y_pred))

print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))

In [None]:
# prompt: perform Naive Bayes

from sklearn.naive_bayes import GaussianNB

# Initialize and train the Gaussian Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Make predictions
y_pred_nb = nb_model.predict(X_test)

# Evaluate the model
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f'\nNaive Bayes Accuracy: {accuracy_nb:.2f}')

print('\nNaive Bayes Classification Report:')
print(classification_report(y_test, y_pred_nb))

print('\nNaive Bayes Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_nb))


In [None]:
# prompt: perform decision trees

from sklearn.tree import DecisionTreeClassifier

# Initialize and train the Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Make predictions
y_pred_dt = dt_model.predict(X_test)

# Evaluate the model
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f'\nDecision Tree Accuracy: {accuracy_dt:.2f}')

print('\nDecision Tree Classification Report:')
print(classification_report(y_test, y_pred_dt))

print('\nDecision Tree Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_dt))

In [None]:
# prompt: perform random forest

from sklearn.ensemble import RandomForestClassifier

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42) # n_estimators is the number of trees in the forest
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'\nRandom Forest Accuracy: {accuracy_rf:.2f}')

print('\nRandom Forest Classification Report:')
print(classification_report(y_test, y_pred_rf))

print('\nRandom Forest Confusion Matrix:')
print(confusion_matrix(y_test, y_pred_rf))

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'n_estimators': [100, 200, 300],         # More trees can increase performance
#     'max_depth': [8, 12, 16, None],          # Allow deeper trees
#     'min_samples_split': [2, 3, 4],          # Don't over-regularize
#     'min_samples_leaf': [1, 2],              # Allow smaller leaves
#     'max_features': ['sqrt', 'log2'],        # Control feature selection per split
#     'bootstrap': [True]                      # Keep it true for now
# }

# # Set up Grid Search
# grid = GridSearchCV(
#     estimator=RandomForestClassifier(random_state=42),
#     param_grid=param_grid,
#     cv=5,
#     scoring='accuracy',
#     n_jobs=-1,
#     verbose=2
# )

# # Fit to training data
# grid.fit(X_train, y_train)

# # Output results
# print("Best Parameters:", grid.best_params_)
# print("Best Cross-Validation Accuracy:", grid.best_score_)

# # Evaluate on test set
# best_model = grid.best_estimator_
# test_accuracy = best_model.score(X_test, y_test)
# print("Test Accuracy with tuned parameters:", test_accuracy)


In [218]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 150, 200],         # Balanced number of trees
    'max_depth': [10, 12, 14, 16],           # Slightly deeper trees (not too deep to avoid overfitting)
    'min_samples_split': [2, 4, 6],          # Regularization control
    'min_samples_leaf': [1, 2, 3],           # Minimum leaf size
    'max_features': ['sqrt', 'log2'],        # Feature subset strategies
    'bootstrap': [True]                      # Keep bootstrapping
}

# Set up Grid Search
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# Fit to training data
grid.fit(X_train, y_train)

# Output results
print("Best Parameters:", grid.best_params_)
print("Best Cross-Validation Accuracy:", grid.best_score_)

# Evaluate on test set
best_model = grid.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print("Test Accuracy with tuned parameters:", test_accuracy)


Fitting 5 folds for each of 216 candidates, totalling 1080 fits


KeyboardInterrupt: 

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV

# # Expanded parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300, 500],
#     'max_depth': [5, 10, 15, 20, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'bootstrap': [True, False]
# }

# # GridSearchCV with 5-fold cross-validation
# grid = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
#                     param_grid=param_grid,
#                     cv=5,
#                     scoring='accuracy',
#                     n_jobs=-1,  # use all processors
#                     verbose=2)

# grid.fit(X_train, y_train)

# print("Best Parameters:", grid.best_params_)
# print("Best Cross-Validation Score:", grid.best_score_)

# # Use best model to evaluate on test set
# best_model = grid.best_estimator_
# test_accuracy = best_model.score(X_test, y_test)
# print("Test Accuracy with tuned parameters:", test_accuracy)
