In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
import seaborn as sns
from scipy.special import softmax
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import requests

url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
response = requests.get(url)
with open("titanic.csv", "wb") as file:
    file.write(response.content)

In [None]:
titanic = pd.read_csv('titanic.csv')

In [None]:
print(titanic.head())
print(titanic.info())
print(titanic.describe())
print(titanic.shape)

In [None]:
titanic = pd.get_dummies(titanic, columns=['Sex'], prefix='Sex')
titanic['single'] = titanic.apply(lambda row: 0 if row['SibSp'] + row['Parch'] > 0 else 1, axis=1)
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())



In [None]:
titanic = titanic.drop(['PassengerId', 'Name', 'Ticket','Cabin', 'Embarked', 'SibSp', 'Parch'], axis=1)

In [None]:
corr_matrix = titanic.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap=plt.cm.Reds)
plt.show()


In [None]:

x = np.array([[titanic['Survived']],
             [titanic['single']],
             [titanic['Age']],
             [titanic['Fare']],
             [titanic['Pclass']],
             [titanic['Sex_female']],
             [titanic['Sex_male']]],
             )

m = softmax(x)

print(m)

In [None]:

# Example: Plotting the distribution of the 'Age' column
plt.hist(titanic['Survived'], bins=2, color='blue', edgecolor='black', alpha=0.7)
plt.title('Distribution')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

In [None]:
#- Dealing with Imbalanced Classes
X = titanic.drop(columns=['Survived'])
y = titanic['Survived']

ros = RandomOverSampler()
rus = RandomUnderSampler()
smote = SMOTE()


X_ros, y_ros = ros.fit_resample(X, y)
X_rus, y_rus = rus.fit_resample(X, y)
X_smote, y_smote = smote.fit_resample(X, y)

In [None]:
X_trainval, X_test, y_trainval, y_test = train_test_split(X_smote, y_smote, test_size=0.2, random_state=42)

In [None]:
# Define the pipelines for each model
linear_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('linear', LinearRegression())
])

lasso_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso', Lasso())
])

ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge())
])

elasticnet_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('elasticnet', ElasticNet())
])

In [None]:
# Define the hyperparameters to search over for each model
grid_linear_params = {
    'linear__fit_intercept': [True, False]
}

grid_lasso_params = {
    'lasso__alpha': [0.1, 1, 10],
    'lasso__max_iter': [100, 1000, 10000]
}

grid_ridge_params = {
    'ridge__alpha': [0.1, 1, 10],
    'ridge__max_iter': [100, 1000, 10000]
}

grid_elasticnet_params = {
    'elasticnet__alpha': [0.1, 1, 10],
    'elasticnet__l1_ratio': [0.1, 0.5, 0.9],
    'elasticnet__max_iter': [100, 1000, 10000]
}

In [None]:
# Perform hyperparameter tuning with GridSearchCV
grid_search_linear = GridSearchCV(linear_pipeline, grid_linear_params, cv=5, n_jobs=-1)
grid_search_lasso = GridSearchCV(lasso_pipeline, grid_lasso_params, cv=5, n_jobs=-1)
grid_search_ridge = GridSearchCV(ridge_pipeline, grid_ridge_params, cv=5, n_jobs=-1)
grid_search_elasticnet = GridSearchCV(elasticnet_pipeline, grid_elasticnet_params, cv=5, n_jobs=-1)

grid_search_linear.fit(X_trainval, y_trainval)
grid_search_lasso.fit(X_trainval, y_trainval)
grid_search_ridge.fit(X_trainval, y_trainval)
grid_search_elasticnet.fit(X_trainval, y_trainval)

In [None]:
# Define the hyperparameters to search over for each model
rand_lasso_params = {
    'lasso__alpha': np.arange(0, 10, 0.01),
}

rand_ridge_params = {
    'ridge__alpha': np.arange(0, 10, 0.01),
    'ridge__max_iter': [10, 100, 1000, 10000]
}

rand_elasticnet_params = {
    'elasticnet__alpha': np.arange(0, 10, 0.01),
    'elasticnet__l1_ratio': np.arange(0, 1, 0.01),
    'elasticnet__max_iter': [10, 100, 1000, 10000]
}

In [None]:
# Perform hyperparameter tuning with RandomizedSearchCV
random_search_lasso = RandomizedSearchCV(lasso_pipeline, rand_lasso_params, cv=5, n_jobs=-1, random_state=42)
random_search_ridge = RandomizedSearchCV(ridge_pipeline, rand_ridge_params, cv=5, n_jobs=-1, random_state=42)
random_search_elasticnet = RandomizedSearchCV(elasticnet_pipeline, rand_elasticnet_params, cv=5, n_jobs=-1, random_state=42)

random_search_lasso.fit(X_trainval, y_trainval)
random_search_ridge.fit(X_trainval, y_trainval)
random_search_elasticnet.fit(X_trainval, y_trainval)

In [None]:
# Get the best model and its parameters from GridSearchCV
best_model_grid_linear = grid_search_linear.best_estimator_

best_model_lasso = grid_search_lasso.best_estimator_
best_params_lasso = grid_search_lasso.best_params_

best_model_ridge = grid_search_ridge.best_estimator_
best_params_ridge = grid_search_ridge.best_params_

best_model_elasticnet = grid_search_elasticnet.best_estimator_
best_params_elasticnet = grid_search_elasticnet.best_params_

In [None]:
# Get the best model and its parameters from RandomizedSearchCV
best_model_random_lasso = random_search_lasso.best_estimator_
best_params_random_lasso = random_search_lasso.best_params_

best_model_random_ridge = random_search_ridge.best_estimator_
best_params_random_ridge = random_search_ridge.best_params_

best_model_random_elasticnet = random_search_elasticnet.best_estimator_
best_params_random_elasticnet = random_search_elasticnet.best_params_

In [None]:
best_model_random_elasticnet

In [142]:
# Evaluate the models on the test set
y_pred_linear = best_model_grid_linear.predict(X_test)

y_pred_lasso = best_model_lasso.predict(X_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

y_pred_ridge = best_model_ridge.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

y_pred_elasticnet = best_model_elasticnet.predict(X_test)
mse_elasticnet = mean_squared_error(y_test, y_pred_elasticnet)
mae_elasticnet = mean_absolute_error(y_test, y_pred_elasticnet)
r2_elasticnet = r2_score(y_test, y_pred_elasticnet)

y_pred_random_lasso = best_model_random_lasso.predict(X_test)
mse_random_lasso = mean_squared_error(y_test, y_pred_random_lasso)
mae_random_lasso = mean_absolute_error(y_test, y_pred_random_lasso)
r2_random_lasso = r2_score(y_test, y_pred_random_lasso)

y_pred_random_ridge = best_model_random_ridge.predict(X_test)
mse_random_ridge = mean_squared_error(y_test, y_pred_random_ridge)
mae_random_ridge = mean_absolute_error(y_test, y_pred_random_ridge)
r2_random_ridge = r2_score(y_test, y_pred_random_ridge)

y_pred_random_elasticnet = best_model_random_elasticnet.predict(X_test)
mse_random_elasticnet = mean_squared_error(y_test, y_pred_random_elasticnet)
mae_random_elasticnet = mean_absolute_error(y_test, y_pred_random_elasticnet)
r2_random_elasticnet = r2_score(y_test, y_pred_random_elasticnet)

In [143]:
# Print the results
print("Results for best LinearRegression model:")
print("MSE: {:.4f}".format(mean_squared_error(y_test, y_pred_linear)))
print("MAE: {:.4f}".format(mean_absolute_error(y_test, y_pred_linear)))
print("R^2: {:.4f}".format(r2_score(y_test, y_pred_linear)))
print()

print("Results for best Lasso model (GridSearchCV):")
print("MSE: {:.4f}".format(mse_lasso))
print("MAE: {:.4f}".format(mae_lasso))
print("R^2: {:.4f}".format(r2_lasso))
print("Best parameters: {}".format(best_params_lasso))
print()

print("Results for best Ridge model (GridSearchCV):")
print("MSE: {:.4f}".format(mse_ridge))
print("MAE: {:.4f}".format(mae_ridge))
print("R^2: {:.4f}".format(r2_ridge))
print("Best parameters: {}".format(best_params_ridge))
print()

print("Results for best ElasticNet model (GridSearchCV):")
print("MSE: {:.4f}".format(mse_elasticnet))
print("MAE: {:.4f}".format(mae_elasticnet))
print("R^2: {:.4f}".format(r2_elasticnet))
print("Best parameters: {}".format(best_params_elasticnet))
print()

print("Results for best Lasso model (RandomizedSearchCV):")
print("MSE: {:.4f}".format(mse_random_lasso))
print("MAE: {:.4f}".format(mae_random_lasso))
print("R^2: {:.4f}".format(r2_random_lasso))
print("Best parameters: {}".format(best_params_random_lasso))
print()

print("Results for best Ridge model (RandomizedSearchCV):")
print("MSE: {:.4f}".format(mse_random_ridge))
print("MAE: {:.4f}".format(mae_random_ridge))
print("R^2: {:.4f}".format(r2_random_ridge))
print("Best parameters: {}".format(best_params_random_ridge))
print()

print("Results for best ElasticNet model (RandomizedSearchCV):")
print("MSE: {:.4f}".format(mse_random_elasticnet))
print("MAE: {:.4f}".format(mae_random_elasticnet))
print("R^2: {:.4f}".format(r2_random_elasticnet))
print("Best parameters: {}".format(best_params_random_elasticnet))
print()

Results for best LinearRegression model:
MSE: 0.1174
MAE: 0.2694
R^2: 0.5283

Results for best Lasso model (GridSearchCV):
MSE: 0.1445
MAE: 0.3541
R^2: 0.4198
Best parameters: {'lasso__alpha': 0.1, 'lasso__max_iter': 100}

Results for best Ridge model (GridSearchCV):
MSE: 0.1174
MAE: 0.2695
R^2: 0.5283
Best parameters: {'ridge__alpha': 1, 'ridge__max_iter': 100}

Results for best ElasticNet model (GridSearchCV):
MSE: 0.1201
MAE: 0.2855
R^2: 0.5178
Best parameters: {'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.1, 'elasticnet__max_iter': 100}

Results for best Lasso model (RandomizedSearchCV):
MSE: 0.1927
MAE: 0.4331
R^2: 0.2261
Best parameters: {'lasso__alpha': np.float64(0.2)}

Results for best Ridge model (RandomizedSearchCV):
MSE: 0.1175
MAE: 0.2695
R^2: 0.5283
Best parameters: {'ridge__max_iter': 10, 'ridge__alpha': np.float64(2.15)}

Results for best ElasticNet model (RandomizedSearchCV):
MSE: 0.2474
MAE: 0.4974
R^2: 0.0062
Best parameters: {'elasticnet__max_iter': 1000, 'el