#### Adult Income Dataset

#### Cross-Validation to Random Forest Classifier

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

adult = pd.read_csv('C:/Users/abdul/OneDrive/Desktop/ByteWise_ML/Day_15/adult.csv')

adult['income'] = adult['income'].map({'<=50K': 0, '>50K': 1})
adult['workclass'] = adult['workclass'].replace(' ?', adult['workclass'].mode()[0])
adult['occupation'] = adult['occupation'].replace(' ?', adult['occupation'].mode()[0])
adult['native-country'] = adult['native-country'].replace(' ?', adult['native-country'].mode()[0])

categorical_cols = adult.select_dtypes(include=['object']).columns
adult = pd.get_dummies(adult, columns=categorical_cols, drop_first=True)

X = adult.drop(columns=['income'])
y = adult['income']

model = RandomForestClassifier(random_state=42)
cross_val_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

print(f"Cross-Validation Accuracy Scores: {cross_val_scores}")
print(f"Mean Accuracy: {np.mean(cross_val_scores):.4f}")
print(f"Standard Deviation: {np.std(cross_val_scores):.4f}")

#### Overfitting and Underfitting in Gradient Boosting Machines

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

adult = pd.read_csv('C:/Users/abdul/OneDrive/Desktop/ByteWise_ML/Day_15/adult.csv')
adult['income'] = adult['income'].map({'<=50K': 0, '>50K': 1})
adult['workclass'] = adult['workclass'].replace(' ?', adult['workclass'].mode()[0])
adult['occupation'] = adult['occupation'].replace(' ?', adult['occupation'].mode()[0])
adult['native-country'] = adult['native-country'].replace(' ?', adult['native-country'].mode()[0])

X = adult.drop(columns=['income'])
y = adult['income']

categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

X = pd.get_dummies(X, columns=categorical_cols)
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

for n_estimators in [50, 100, 200]:
    for learning_rate in [0.01, 0.1, 0.2]:
        model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=42)
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)
        train_accuracy = accuracy_score(y_train, y_train_pred)
        val_accuracy = accuracy_score(y_val, y_val_pred)
        print(f"n_estimators={n_estimators}, learning_rate={learning_rate}, Training Accuracy: {train_accuracy:.4f}, Validation Accuracy: {val_accuracy:.4f}")


n_estimators=50, learning_rate=0.01, Training Accuracy: 0.8036, Validation Accuracy: 0.8108
n_estimators=50, learning_rate=0.1, Training Accuracy: 0.8609, Validation Accuracy: 0.8663
n_estimators=50, learning_rate=0.2, Training Accuracy: 0.8686, Validation Accuracy: 0.8735
n_estimators=100, learning_rate=0.01, Training Accuracy: 0.8443, Validation Accuracy: 0.8511
n_estimators=100, learning_rate=0.1, Training Accuracy: 0.8680, Validation Accuracy: 0.8741
n_estimators=100, learning_rate=0.2, Training Accuracy: 0.8752, Validation Accuracy: 0.8777
n_estimators=200, learning_rate=0.01, Training Accuracy: 0.8499, Validation Accuracy: 0.8547
n_estimators=200, learning_rate=0.1, Training Accuracy: 0.8748, Validation Accuracy: 0.8770
n_estimators=200, learning_rate=0.2, Training Accuracy: 0.8821, Validation Accuracy: 0.8781


#### Precision, Recall, and F1-Score for Random Forests 

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,roc_auc_score

adult = pd.read_csv('C:/Users/abdul/OneDrive/Desktop/ByteWise_ML/Day_15/adult.csv')
adult['income'] = adult['income'].map({'<=50K': 0, '>50K': 1})
adult['workclass'] = adult['workclass'].replace(' ?', adult['workclass'].mode()[0])
adult['occupation'] = adult['occupation'].replace(' ?', adult['occupation'].mode()[0])
adult['native-country'] = adult['native-country'].replace(' ?', adult['native-country'].mode()[0])

categorical_cols = adult.select_dtypes(include=['object']).columns
adult = pd.get_dummies(adult, columns=categorical_cols, drop_first=True)

X = adult.drop(columns=['income'])
y = adult['income']

categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

X = pd.get_dummies(X, columns=categorical_cols)
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train,y_train)

y_pred = model.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
precision= precision_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
roc = roc_auc_score(y_val,y_pred)

print(f"F1 : {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC Score : {roc:.4f}")

F1 : 0.6793
Precision: 0.7297
Recall: 0.6354
Accuracy: 0.8594
ROC Score : 0.7817


####  ROC Curve and AUC for Gradient Boosting Classifier

In [None]:
import pandas as pd
import  matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve

adult = pd.read_csv('C:/Users/abdul/OneDrive/Desktop/ByteWise_ML/Day_15/adult.csv')
adult['income'] = adult['income'].map({'<=50K': 0, '>50K': 1})
adult['workclass'] = adult['workclass'].replace(' ?', adult['workclass'].mode()[0])
adult['occupation'] = adult['occupation'].replace(' ?', adult['occupation'].mode()[0])
adult['native-country'] = adult['native-country'].replace(' ?', adult['native-country'].mode()[0])

X = adult.drop(columns=['income'])
y = adult['income']

categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

X = pd.get_dummies(X, columns=categorical_cols)
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
y_proba = model.predict_proba(X_val)[:,1]

roc = roc_auc_score(y_val, y_pred)
fpr, tpr, _ = roc_curve(y_val, y_proba, pos_label='Yes')

print(f"ROC Score  : {roc:.4f}")

plt.figure()
plt.plot(fpr, tpr, label=f'ROC curve (area = {roc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('Receiver Operating Characteristic')
plt.show()


#### 	Model Performance Comparison with Different Metrics

In [13]:
# Cross validation of Gradient Boosting Machines 
import pandas as pd
import  matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve
import numpy as np
from sklearn.model_selection import cross_val_score

adult = pd.read_csv('C:/Users/abdul/OneDrive/Desktop/ByteWise_ML/Day_15/adult.csv')
adult['income'] = adult['income'].map({'<=50K': 0, '>50K': 1})
adult['workclass'] = adult['workclass'].replace(' ?', adult['workclass'].mode()[0])
adult['occupation'] = adult['occupation'].replace(' ?', adult['occupation'].mode()[0])
adult['native-country'] = adult['native-country'].replace(' ?', adult['native-country'].mode()[0])

X = adult.drop(columns=['income'])
y = adult['income']

categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

X = pd.get_dummies(X, columns=categorical_cols)
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
y_proba = model.predict_proba(X_val)[:,1]

cross_val_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

print(f"Cross-Validation Accuracy Scores: {cross_val_scores.sum()}")
print(f"Mean Accuracy: {np.mean(cross_val_scores):.4f}")
print(f"Standard Deviation: {np.std(cross_val_scores):.4f}")

Cross-Validation Accuracy Scores: 4.3383767926070105
Mean Accuracy: 0.8677
Standard Deviation: 0.0022


In [12]:
# Cross Validation of Random Forest Classifier 
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

adult = pd.read_csv('C:/Users/abdul/OneDrive/Desktop/ByteWise_ML/Day_15/adult.csv')

adult['income'] = adult['income'].map({'<=50K': 0, '>50K': 1})
adult['workclass'] = adult['workclass'].replace(' ?', adult['workclass'].mode()[0])
adult['occupation'] = adult['occupation'].replace(' ?', adult['occupation'].mode()[0])
adult['native-country'] = adult['native-country'].replace(' ?', adult['native-country'].mode()[0])

categorical_cols = adult.select_dtypes(include=['object']).columns
adult = pd.get_dummies(adult, columns=categorical_cols, drop_first=True)

X = adult.drop(columns=['income'])
y = adult['income']

model = RandomForestClassifier(random_state=42)
cross_val_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

print(f"Cross-Validation Accuracy Scores: {cross_val_scores.sum()}")
print(f"Mean Accuracy: {np.mean(cross_val_scores):.4f}")
print(f"Standard Deviation: {np.std(cross_val_scores):.4f}")

Cross-Validation Accuracy Scores: 4.269993347137886
Mean Accuracy: 0.8540
Standard Deviation: 0.0017


#### The Cross validation Score is higher for Gradient Boosting Machines 