Title: Cross-Validation


Task 1: K-Fold Cross-Validation for House Prices<br>
Apply K-Fold Cross-Validation (K=5) and compare the per-fold RMSE values to see how much the model's performance varies from fold to fold.

In [1]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

# Build a synthetic housing table: price is linear in floor area and bedroom
# count, plus Gaussian noise with a standard deviation of 30,000.
# The three random draws must stay in this order so the seeded data set is
# reproduced exactly.
np.random.seed(42)
size = 1000
sqft = np.random.randint(low=500, high=4000, size=size)  # upper bound is exclusive
beds = np.random.randint(low=1, high=6, size=size)       # i.e. 1 to 5 bedrooms
noise = np.random.normal(loc=0, scale=30000, size=size)
price = sqft * 200 + beds * 10000 + noise

df = pd.DataFrame({'Sqft': sqft, 'Beds': beds, 'Price': price})
feature_columns = ['Sqft', 'Beds']
X, y = df[feature_columns], df['Price']

# Five shuffled folds; the fixed seed makes the fold assignment repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model = LinearRegression()

# scikit-learn scorers follow a "greater is better" convention, so the MSE
# comes back negated. Flip the sign before taking the square root to obtain
# one RMSE value per fold.
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=kf)
rmse_scores = np.sqrt(np.negative(scores))

# Summarise the spread across folds (ndarray.std() is the population
# standard deviation, ddof=0).
rmse_mean = rmse_scores.mean()
rmse_std = rmse_scores.std()
print(f"RMSE for each fold: {rmse_scores}")
print(f"Average RMSE: {rmse_mean:.2f}")
print(f"Standard Deviation of RMSE: {rmse_std:.2f}")


RMSE for each fold: [29648.76834006 30865.48245028 32038.15198684 29659.46707505
 28348.50076044]
Average RMSE: 30112.07
Standard Deviation of RMSE: 1249.57


Task 2: Stratified K-Fold for Imbalanced Churn Dataset<br>
Use Stratified K-Fold so that every fold preserves the class proportions of the full dataset (roughly 70% non-churn, 30% churn).

In [2]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

# Build a synthetic, imbalanced churn table (about 70% class 0, 30% class 1).
# The four random draws must stay in this order so the seeded data set is
# reproduced exactly.
np.random.seed(42)
size = 1000
tenure = np.random.randint(low=1, high=72, size=size)  # upper bound is exclusive
charges = np.random.uniform(low=20, high=120, size=size)
total_noise = np.random.normal(loc=0, scale=50, size=size)
total = tenure * charges + total_noise
# NOTE: the labels are drawn independently of the features, so the features
# carry no signal about churn and accuracy near the majority-class rate is
# the most that can be expected from any model.
churn = np.random.choice([0, 1], size=size, p=[0.7, 0.3])

df = pd.DataFrame({'Tenure': tenure, 'Charges': charges, 'Total': total, 'Churn': churn})
feature_columns = ['Tenure', 'Charges', 'Total']
X, y = df[feature_columns], df['Churn']

# Random forest with a fixed seed; fit() returns the estimator itself, which
# allows fit and predict to be chained below.
model = RandomForestClassifier(random_state=42)

# Stratified splitter: each fold keeps the class ratio of y.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One accuracy value per fold: refit on the training rows, then score the
# predictions for the held-out rows.
accuracy_scores = [
    accuracy_score(
        y.iloc[held_out],
        model.fit(X.iloc[fit_rows], y.iloc[fit_rows]).predict(X.iloc[held_out]),
    )
    for fit_rows, held_out in skf.split(X, y)
]

# Report per-fold accuracy and its spread across folds.
accuracy_mean = np.mean(accuracy_scores)
accuracy_std = np.std(accuracy_scores)
print(f"Accuracy for each fold: {accuracy_scores}")
print(f"Average Accuracy: {accuracy_mean:.2f}")
print(f"Standard Deviation of Accuracy: {accuracy_std:.2f}")

Accuracy for each fold: [0.64, 0.585, 0.64, 0.695, 0.66]
Average Accuracy: 0.64
Standard Deviation of Accuracy: 0.04


Task 3: Leave-One-Out Cross-Validation for Iris<br>
Use Leave-One-Out Cross-Validation (LOOCV) to assess a model's predictions on the Iris dataset, holding out one sample at a time.

In [3]:
from sklearn.model_selection import LeaveOneOut
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Iris features and integer class labels as NumPy arrays.
iris = load_iris()
X, y = iris.data, iris.target

# Decision tree with a fixed seed; fit() returns the estimator itself, which
# allows fit and predict to be chained below.
model = DecisionTreeClassifier(random_state=42)

# Leave-one-out: every sample is the test set exactly once.
loocv = LeaveOneOut()

# Each fold holds out a single sample, so its accuracy is either 0.0 (miss)
# or 1.0 (hit); the mean over folds is the fraction classified correctly.
accuracy_scores = [
    accuracy_score(
        y[held_out],
        model.fit(X[fit_rows], y[fit_rows]).predict(X[held_out]),
    )
    for fit_rows, held_out in loocv.split(X, y)
]

# Report the per-sample outcomes and their summary statistics.
accuracy_mean = np.mean(accuracy_scores)
accuracy_std = np.std(accuracy_scores)
print(f"Accuracy for each fold: {accuracy_scores}")
print(f"Average Accuracy: {accuracy_mean:.2f}")
print(f"Standard Deviation of Accuracy: {accuracy_std:.2f}")

Accuracy for each fold: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
Average Accuracy: 0.94
Standard Deviation of Accuracy: 0.24
