# K-Fold Cross Validation

In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

In [2]:
titanic_data = pd.read_csv('titanic.csv')
data

In [3]:
titanic_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
titanic_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
# Fill missing ages with the mean age

age_imputer = SimpleImputer(strategy='mean')
titanic_data['Age'] = age_imputer.fit_transform(titanic_data[['Age']])
titanic_data['Age']


0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64

In [6]:
# Convert 'Sex' and 'Embarked' to numerical values

label_encoder = LabelEncoder()
titanic_data['Sex'] = label_encoder.fit_transform(titanic_data['Sex'])
titanic_data['Embarked'] = label_encoder.fit_transform(titanic_data['Embarked'])

In [7]:
# Fill missing embarked values with the most frequent port

embarked_imputer = SimpleImputer(strategy='most_frequent')
titanic_data['Embarked'] = embarked_imputer.fit_transform(titanic_data[['Embarked']])

In [8]:
# Drop 'Name,' 'Ticket,' and 'Cabin' columns

titanic_data = titanic_data.drop(['Name', 'Ticket', 'Cabin'], axis=1)

In [9]:
# Define features (X) and target (y)

X = titanic_data.drop('Survived', axis=1)  # Assuming 'Survived' is the target variable
y = titanic_data['Survived']


# Grid Search

- Titanic Dataset

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [12]:
titanic_data = pd.read_csv('titanic.csv')  # Replace 'path_to_titanic.csv' with the actual file path

In [13]:
# Drop 'Name,' 'Ticket,' and 'Cabin' columns
titanic_data = titanic_data.drop(['Name', 'Ticket', 'Cabin'], axis=1)


In [14]:
# Handle missing values (e.g., fill missing ages with the mean age)
titanic_data['Age'].fillna(titanic_data['Age'].mean(), inplace=True)
titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data['Age'].fillna(titanic_data['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0], inplace=True)


In [15]:
# Encode categorical variables ('Sex' and 'Embarked')
label_encoder = LabelEncoder()
titanic_data['Sex'] = label_encoder.fit_transform(titanic_data['Sex'])
titanic_data['Embarked'] = label_encoder.fit_transform(titanic_data['Embarked'])


In [16]:
# Define features (X) and target (y)
X = titanic_data.drop('Survived', axis=1)  # Assuming 'Survived' is the target variable
y = titanic_data['Survived']

In [17]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [26]:
# Create a Decision Tree classifier
clf = DecisionTreeClassifier(random_state=42)


In [32]:
clf.fit(X_train,y_train)

In [33]:
from pprint import pprint
clf = DecisionTreeClassifier(random_state=42)
pprint(clf.get_params())

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': 42,
 'splitter': 'best'}


### Hyperparameter grid to search

In [19]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


In [36]:
# Perform Grid Search with cross-validation (e.g., K=5)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [35]:
# Get the best hyperparameters
best_params = grid_search.best_params_
best_params

{'criterion': 'entropy',
 'max_depth': 10,
 'min_samples_leaf': 2,
 'min_samples_split': 10}

In [22]:
# Train a Decision Tree classifier with the best hyperparameters
best_clf = DecisionTreeClassifier(random_state=42, **best_params)
best_clf.fit(X_train, y_train)


In [23]:
# Evaluate the model on the test set
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [24]:
# Print the best hyperparameters and model accuracy
print(f"Best Hyperparameters: {best_params}")
print(f"Model Accuracy on Test Data: {accuracy:.2f}")

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}
Model Accuracy on Test Data: 0.78


# Model Evaluation with K-Fold & Grid Search

- Breast Cancer Dataset (in-built dataset)

In [37]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings("ignore")

In [38]:
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [40]:
clf = LogisticRegression(random_state=42)

In [41]:
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')

In [42]:
clf.fit(X_train, y_train)


# Step 7 Make Prediction on the test bar

In [43]:
y_pred = clf.predict(X_test)


In [44]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [45]:
print("K-Fold Cross-Validation Results:")
for i, score in enumerate(cv_scores, 1):
    print(f"Fold {i}: {score:.2f}")

K-Fold Cross-Validation Results:
Fold 1: 0.98
Fold 2: 0.90
Fold 3: 0.97
Fold 4: 0.96
Fold 5: 0.90


In [46]:
print("\nTest Set Evaluation Metrics:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Test Set Evaluation Metrics:
Accuracy: 0.96
Precision: 0.95
Recall: 0.99
F1 Score: 0.97

Confusion Matrix:
[[39  4]
 [ 1 70]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.91      0.94        43
           1       0.95      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [47]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

### Step 11: Perform Grid Search for hyperparameter tuning

In [48]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

### Step 12: Get the best hyperparameters

In [49]:
best_params = grid_search.best_params_

### Step 13: Train a Logistic Regression classifier with the best hyperparameters

In [50]:
best_clf = LogisticRegression(random_state=42, **best_params)
best_clf.fit(X_train, y_train)

### Step 14: Make predictions with the tuned model

In [51]:
y_pred_tuned = best_clf.predict(X_test)


### Step 15: Evaluate the tuned model

In [52]:
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
print("\nGrid Search Results:")
print(f"Best Hyperparameters: {best_params}")
print(f"Test Set Accuracy with Tuned Model: {accuracy_tuned:.2f}")


Grid Search Results:
Best Hyperparameters: {'C': 10}
Test Set Accuracy with Tuned Model: 0.96


### Compare result of k-fold & Grid Search

In [53]:
# Perform K-Fold Cross-Validation and store the accuracy scores
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')

# Calculate the accuracy of the tuned model
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)

# Print and compare the results side by side
print("K-Fold Cross-Validation Results:")
print(f"Mean Accuracy: {np.mean(cv_scores):.2f}")
print(f"Test Set Accuracy: {accuracy:.2f}")

print("\nGrid Search Results:")
print(f"Best Hyperparameters: {best_params}")
print(f"Test Set Accuracy with Tuned Model: {accuracy_tuned:.2f}")

K-Fold Cross-Validation Results:
Mean Accuracy: 0.94
Test Set Accuracy: 0.96

Grid Search Results:
Best Hyperparameters: {'C': 10}
Test Set Accuracy with Tuned Model: 0.96
