### Titanic Dataset 

####  Cross-Validation

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Cross-validated and hold-out accuracy of logistic regression on the Titanic
# passenger data.
from sklearn.pipeline import make_pipeline  # local import: keeps this cell self-contained

titan = pd.read_csv('C:/Users/abdul/OneDrive/Desktop/ByteWise_ML/Day_10/titanic.csv')

# Bare expressions in the middle of a cell/script display nothing, so print
# the exploratory summaries explicitly.
print(titan.head(4))
print(titan.shape)
print(titan.isnull().sum())

# Drop the columns that are never used as features BEFORE removing incomplete
# rows. 'Cabin' is not imputed anywhere in this cell, so calling dropna()
# first would discard every row with a missing Cabin value because of a
# column that is thrown away immediately afterwards.
titan = titan.drop(columns=['Cabin', 'Name', 'Ticket'])

# Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
# that chained in-place form is deprecated in pandas 2.x and no longer
# modifies the frame once Copy-on-Write is enabled.
titan['Age'] = titan['Age'].fillna(titan['Age'].mean())
titan['Embarked'] = titan['Embarked'].fillna(titan['Embarked'].mode()[0])

# Any row that is still incomplete after imputation is removed.
titan = titan.dropna()
print(titan.isnull().sum())

# One-hot encode the categorical columns; drop_first avoids a redundant column.
data = pd.get_dummies(titan, columns=['Sex', 'Embarked'], drop_first=True)

# NOTE(review): every remaining column except 'Survived' is used as a feature,
# including any pure identifier column the CSV may contain - confirm intent.
X = data.drop(columns=['Survived'])
y = data['Survived']

# k=5 or k=10 is commonly used for k-fold cross-validation

# Scaling lives inside the pipeline so the scaler is re-fitted on the training
# part of each fold only; fitting it on all rows up front would leak
# statistics of the validation fold into training.
cv_model = make_pipeline(StandardScaler(), LogisticRegression())
scores = cross_val_score(cv_model, X, y, cv=10, scoring='accuracy')
print(f"Cross-Validation Accuracy Scores:  \n {scores}")


# Single hold-out split for comparison with the cross-validated scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply it to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.3f}")

#### Overfitting & Underfitting

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Train/test accuracy of a decision tree as a function of max_depth, to show
# where the model starts to overfit.
data = pd.read_csv('C:/Users/abdul/OneDrive/Desktop/ByteWise_ML/Day_10/titanic.csv')

# Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
# that chained in-place form is deprecated in pandas 2.x and no longer
# modifies the frame once Copy-on-Write is enabled.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

data = data.drop(columns=['Cabin', 'Name', 'Ticket'])

data = pd.get_dummies(data, columns=['Sex', 'Embarked'], drop_first=True)

X = data.drop(columns=['Survived'])
y = data['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only so no test-set statistics leak
# into preprocessing; the test rows are transformed with the same parameters.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

train_accuracies = []
test_accuracies = []

# Sweep the tree depth from 1 to 20 and record both accuracies at each depth.
depth_range = range(1, 21)
for depth in depth_range:
    model = DecisionTreeClassifier(max_depth=depth, random_state=42)
    model.fit(X_train, y_train)
    train_accuracies.append(accuracy_score(y_train, model.predict(X_train)))
    test_accuracies.append(accuracy_score(y_test, model.predict(X_test)))

# The lists hold train and test accuracy per depth; the previous
# "Over fitting" / "Under Fitting" labels did not describe what was printed.
print("Train accuracy by depth:", train_accuracies)
print("Test accuracy by depth:", test_accuracies)

# A widening gap between the two curves as depth grows indicates overfitting.
plt.plot(depth_range, train_accuracies, label='Train Accuracy')
plt.plot(depth_range, test_accuracies, label='Test Accuracy')
plt.xlabel('max_depth')
plt.ylabel('Accuracy')
plt.title('Decision Tree Accuracy vs. Depth')
plt.legend()
plt.show()


#### Precision, Recall, and F1-Score for Logistic Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,f1_score,recall_score

# Precision, recall, F1 and accuracy of logistic regression on a single
# 80/20 hold-out split of the Titanic data.
data = pd.read_csv('C:/Users/abdul/OneDrive/Desktop/ByteWise_ML/Day_10/titanic.csv')

# Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
# that chained in-place form is deprecated in pandas 2.x and no longer
# modifies the frame once Copy-on-Write is enabled.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

data = data.drop(columns=['Cabin', 'Name', 'Ticket'])

data = pd.get_dummies(data, columns=['Sex', 'Embarked'], drop_first=True)

X = data.drop(columns=['Survived'])
y = data['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only so no test-set statistics leak
# into preprocessing; the test rows are transformed with the same parameters.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# All four metrics are computed on the held-out test rows.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"F1 : {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Accuracy: {accuracy:.4f}")

#### ROC Curve Analysis for Decision Trees

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score,roc_curve

# ROC curve and AUC of a decision tree on a single 80/20 hold-out split.
import matplotlib.pyplot as plt  # this cell plots, but its import list omits pyplot

data = pd.read_csv('C:/Users/abdul/OneDrive/Desktop/ByteWise_ML/Day_10/titanic.csv')

# Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
# that chained in-place form is deprecated in pandas 2.x and no longer
# modifies the frame once Copy-on-Write is enabled.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

data = data.drop(columns=['Cabin', 'Name', 'Ticket'])

data = pd.get_dummies(data, columns=['Sex', 'Embarked'], drop_first=True)

X = data.drop(columns=['Survived'])
y = data['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only so no test-set statistics leak
# into preprocessing; the test rows are transformed with the same parameters.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Seed the tree so the reported AUC and curve are reproducible between runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Score of the second class in model.classes_ for every test row.
# NOTE(review): assumes 'Survived' is coded 0/1 so that this is the
# "survived" class - confirm against the CSV.
y_proba = model.predict_proba(X_test)[:, 1]

# AUC must be computed from continuous scores; using hard 0/1 predictions
# collapses the curve to a single operating point and disagrees with the
# curve plotted below.
roc = roc_auc_score(y_test, y_proba)

# No pos_label override: the previous pos_label='Yes' does not match the
# numeric labels the other metrics in this file are computed on, which would
# leave roc_curve without any positive samples.
fpr, tpr, _ = roc_curve(y_test, y_proba)

print(f"ROC Score  : {roc:.4f}")

plt.figure()
plt.plot(fpr, tpr, label=f'ROC curve (area = {roc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')  # without this the curve label is never shown
plt.show()



####  Model Performance with and without Cross-Validation

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, make_scorer
import numpy as np

# Compare logistic regression and a decision tree on the Titanic data, first
# on a single hold-out split and then with 5-fold cross-validation.
data = pd.read_csv('C:/Users/abdul/OneDrive/Desktop/ByteWise_ML/Day_10/titanic.csv')

# Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
# that chained in-place form is deprecated in pandas 2.x and no longer
# modifies the frame once Copy-on-Write is enabled.
data['Age'] = data['Age'].fillna(data['Age'].mean())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
data['Fare'] = data['Fare'].fillna(data['Fare'].median())

data = pd.get_dummies(data, columns=['Sex', 'Embarked'], drop_first=True)
X = data.drop(columns=['Survived', 'Name', 'Ticket', 'Cabin'])
y = data['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

logistic_model = LogisticRegression(max_iter=1000)
tree_model = DecisionTreeClassifier(random_state=42)

# --- Without cross-validation: one fit, one evaluation on the held-out rows.
logistic_model.fit(X_train, y_train)
tree_model.fit(X_train, y_train)

y_pred_logistic = logistic_model.predict(X_test)
y_pred_tree = tree_model.predict(X_test)

# These predictions were previously computed but never scored, so the
# "without cross-validation" half of the comparison was missing.
print("Without cross-validation (single 80/20 hold-out split):")
for model_name, y_pred in (("Logistic Regression", y_pred_logistic),
                           ("Decision Tree", y_pred_tree)):
    print(f"{model_name} - Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(f"{model_name} - Precision: {precision_score(y_test, y_pred):.2f}")
    print(f"{model_name} - Recall: {recall_score(y_test, y_pred):.2f}")

# --- With cross-validation: cross_val_score fits a fresh clone of the
# estimator on each of the 5 folds, so the fits above do not influence it.
logistic_scores = cross_val_score(logistic_model, X, y, cv=5, scoring=make_scorer(accuracy_score))
tree_scores = cross_val_score(tree_model, X, y, cv=5, scoring=make_scorer(accuracy_score))

logistic_precision = cross_val_score(logistic_model, X, y, cv=5, scoring=make_scorer(precision_score))
tree_precision = cross_val_score(tree_model, X, y, cv=5, scoring=make_scorer(precision_score))

logistic_recall = cross_val_score(logistic_model, X, y, cv=5, scoring=make_scorer(recall_score))
tree_recall = cross_val_score(tree_model, X, y, cv=5, scoring=make_scorer(recall_score))

# Report the mean of each metric across the 5 folds.
print("With 5-fold cross-validation (mean over folds):")
print(f"Logistic Regression - Accuracy: {np.mean(logistic_scores):.2f}")
print(f"Decision Tree - Accuracy: {np.mean(tree_scores):.2f}")

print(f"Logistic Regression - Precision: {np.mean(logistic_precision):.2f}")
print(f"Decision Tree - Precision: {np.mean(tree_precision):.2f}")

print(f"Logistic Regression - Recall: {np.mean(logistic_recall):.2f}")
print(f"Decision Tree - Recall: {np.mean(tree_recall):.2f}")
