In [1]:
import pandas as pd


# Read the three education-level subsets from CSV, one DataFrame per subset.
higher_ed_df, some_college_df, less_than_hs_df = map(
    pd.read_csv,
    (
        'dataset/subset_higher_ed_ednum.csv',
        'dataset/subset_some_college_ednum.csv',
        'dataset/subset_less_than_hs_ednum.csv',
    ),
)

from sklearn.preprocessing import LabelEncoder


# Remove the 'fnlgwt' and 'Education' columns from every subset:
# they are either not useful or duplicate another feature.
higher_ed_df = higher_ed_df.drop(columns=['fnlgwt', 'Education'])
some_college_df = some_college_df.drop(columns=['fnlgwt', 'Education'])
less_than_hs_df = less_than_hs_df.drop(columns=['fnlgwt', 'Education'])

# Encoding categorical features in all datasets
def encode_features(df):
    """Label-encode every object-dtype column of a DataFrame.

    A separate ``LabelEncoder`` is fitted for each object-dtype column so the
    integer codes can later be mapped back to the original values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns should be encoded. It is not
        modified; the encoding is applied to an internal copy.

    Returns
    -------
    tuple
        ``(encoded_df, label_encoders)`` where ``encoded_df`` is the encoded
        copy of ``df`` and ``label_encoders`` maps each encoded column name
        to its fitted ``LabelEncoder``.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    encoded_df = df.copy()
    label_encoders = {}
    for column in encoded_df.select_dtypes(include=['object']).columns:
        encoder = LabelEncoder()
        encoded_df[column] = encoder.fit_transform(encoded_df[column])
        label_encoders[column] = encoder
    return encoded_df, label_encoders

# Encode a copy of each subset, keeping the fitted per-column encoders alongside it.
higher_ed_df_encoded, higher_ed_encoders = encode_features(df=higher_ed_df.copy())
some_college_df_encoded, some_college_encoders = encode_features(df=some_college_df.copy())
less_than_hs_df_encoded, less_than_hs_encoders = encode_features(df=less_than_hs_df.copy())

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import numpy as np
import matplotlib.pyplot as plt


# For each subset, 'Income' is the target and every remaining column is an input feature.
y_higher_ed = higher_ed_df_encoded['Income']
X_higher_ed = higher_ed_df_encoded.drop(columns=['Income'])

y_some_college = some_college_df_encoded['Income']
X_some_college = some_college_df_encoded.drop(columns=['Income'])

y_less_than_hs = less_than_hs_df_encoded['Income']
X_less_than_hs = less_than_hs_df_encoded.drop(columns=['Income'])

from sklearn.model_selection import train_test_split

# Hold out 10% of each subset for testing; the fixed seed keeps the split repeatable.
(
    X_higher_ed_train,
    X_higher_ed_test,
    y_higher_ed_train,
    y_higher_ed_test,
) = train_test_split(X_higher_ed, y_higher_ed, test_size=0.1, random_state=42)

(
    X_some_college_train,
    X_some_college_test,
    y_some_college_train,
    y_some_college_test,
) = train_test_split(X_some_college, y_some_college, test_size=0.1, random_state=42)

(
    X_less_than_hs_train,
    X_less_than_hs_test,
    y_less_than_hs_train,
    y_less_than_hs_test,
) = train_test_split(X_less_than_hs, y_less_than_hs, test_size=0.1, random_state=42)

1. K-Nearest Neighbors (KNN)

In [2]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

def evaluate_knn(X_train, y_train, X_test, y_test, k, title):
    """Fit a k-nearest-neighbours classifier on standardized data and print test metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for evaluation.
    k
        Number of neighbours passed to ``KNeighborsClassifier``.
    title
        Label inserted into the printed accuracy line.

    Returns
    -------
    None
        The accuracy and the classification report are printed.
    """
    # The scaler is fitted on the training split only and reused for the test split.
    standardizer = StandardScaler().fit(X_train)
    train_features = standardizer.transform(X_train)
    test_features = standardizer.transform(X_test)

    # Train with the chosen k and predict the held-out labels.
    model = KNeighborsClassifier(n_neighbors=k).fit(train_features, y_train)
    predictions = model.predict(test_features)

    # Report the performance metrics.
    accuracy = accuracy_score(y_test, predictions)
    print(f'Accuracy for {title}: {accuracy:.4f}')
    print('Classification Report:')
    print(classification_report(y_test, predictions))

# Run the KNN evaluation on each subset.
evaluate_knn(
    X_train=X_higher_ed_train,
    y_train=y_higher_ed_train,
    X_test=X_higher_ed_test,
    y_test=y_higher_ed_test,
    k=9,
    title="Higher Education Subset",
)
evaluate_knn(
    X_train=X_some_college_train,
    y_train=y_some_college_train,
    X_test=X_some_college_test,
    y_test=y_some_college_test,
    k=9,
    title="Some College Subset",
)
evaluate_knn(
    X_train=X_less_than_hs_train,
    y_train=y_less_than_hs_train,
    X_test=X_less_than_hs_test,
    y_test=y_less_than_hs_test,
    k=9,
    title="Less than High School Subset",
)


Accuracy for Higher Education Subset: 0.7688
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.74      0.76       569
           1       0.75      0.80      0.78       573

    accuracy                           0.77      1142
   macro avg       0.77      0.77      0.77      1142
weighted avg       0.77      0.77      0.77      1142
Accuracy for Some College Subset: 0.8416
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.93      0.91      2306
           1       0.58      0.45      0.51       509

    accuracy                           0.84      2815
   macro avg       0.73      0.69      0.71      2815
weighted avg       0.83      0.84      0.83      2815

Accuracy for Less than High School Subset: 0.9295
Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       527
           1       0.50      0.05

2. Artificial Neural Network (ANN)

In [3]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

def evaluate_ann(X_train, y_train, X_test, y_test, hidden_layer_size, title, random_state=42):
    """Fit an MLP classifier on standardized data and print test metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for evaluation.
    hidden_layer_size
        Value passed to ``MLPClassifier(hidden_layer_sizes=...)``.
    title
        Label inserted into the printed accuracy line.
    random_state
        Seed forwarded to ``MLPClassifier`` so that weight initialisation and
        batch shuffling, and therefore the printed metrics, are repeatable
        across runs. Pass ``None`` to get unseeded behaviour.

    Returns
    -------
    None
        The accuracy and the classification report are printed.
    """
    # Standardize using statistics from the training split only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the network with the chosen hidden-layer configuration.
    ann_clf = MLPClassifier(
        hidden_layer_sizes=hidden_layer_size,
        activation='relu',
        max_iter=1000,
        random_state=random_state,
    )
    ann_clf.fit(X_train_scaled, y_train)

    # Predict on the held-out split.
    y_pred = ann_clf.predict(X_test_scaled)

    # Compute and print the performance metrics.
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy for {title}: {accuracy:.4f}')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

# Run the ANN evaluation on each subset.
evaluate_ann(
    X_train=X_higher_ed_train,
    y_train=y_higher_ed_train,
    X_test=X_higher_ed_test,
    y_test=y_higher_ed_test,
    hidden_layer_size=(50,),
    title="Higher Education Subset",
)
evaluate_ann(
    X_train=X_some_college_train,
    y_train=y_some_college_train,
    X_test=X_some_college_test,
    y_test=y_some_college_test,
    hidden_layer_size=(50,),
    title="Some College Subset",
)
evaluate_ann(
    X_train=X_less_than_hs_train,
    y_train=y_less_than_hs_train,
    X_test=X_less_than_hs_test,
    y_test=y_less_than_hs_test,
    hidden_layer_size=(50,),
    title="Less than High School Subset",
)


Accuracy for Higher Education Subset: 0.8117
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.76      0.80       569
           1       0.78      0.87      0.82       573

    accuracy                           0.81      1142
   macro avg       0.82      0.81      0.81      1142
weighted avg       0.82      0.81      0.81      1142
Accuracy for Some College Subset: 0.8547
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92      2306
           1       0.70      0.34      0.46       509

    accuracy                           0.85      2815
   macro avg       0.78      0.66      0.69      2815
weighted avg       0.84      0.85      0.83      2815
Accuracy for Less than High School Subset: 0.9277
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       527
           1       0.45      0.12 

3. Decision Tree

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

def evaluate_decision_tree(X_train, y_train, X_test, y_test, max_depth, title, random_state=42):
    """Fit a decision tree classifier and print test metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for evaluation.
    max_depth
        Value passed to ``DecisionTreeClassifier(max_depth=...)``.
    title
        Label inserted into the printed accuracy line.
    random_state
        Seed forwarded to ``DecisionTreeClassifier``. Without a fixed seed the
        split chosen among equally good candidates can differ between runs,
        so the printed metrics would not be repeatable. Pass ``None`` to get
        unseeded behaviour.

    Returns
    -------
    None
        The accuracy and the classification report are printed.
    """
    # Train the tree with the chosen maximum depth.
    dt_clf = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
    dt_clf.fit(X_train, y_train)

    # Predict on the held-out split.
    y_pred = dt_clf.predict(X_test)

    # Compute and print the performance metrics.
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy for {title}: {accuracy:.4f}')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

# Run the decision tree evaluation on each subset, each with its own maximum depth.
evaluate_decision_tree(
    X_train=X_higher_ed_train,
    y_train=y_higher_ed_train,
    X_test=X_higher_ed_test,
    y_test=y_higher_ed_test,
    max_depth=7,
    title="Higher Education Subset",
)
evaluate_decision_tree(
    X_train=X_some_college_train,
    y_train=y_some_college_train,
    X_test=X_some_college_test,
    y_test=y_some_college_test,
    max_depth=5,
    title="Some College Subset",
)
evaluate_decision_tree(
    X_train=X_less_than_hs_train,
    y_train=y_less_than_hs_train,
    X_test=X_less_than_hs_test,
    y_test=y_less_than_hs_test,
    max_depth=3,
    title="Less than High School Subset",
)


Accuracy for Higher Education Subset: 0.8170
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.77      0.81       569
           1       0.79      0.86      0.83       573

    accuracy                           0.82      1142
   macro avg       0.82      0.82      0.82      1142
weighted avg       0.82      0.82      0.82      1142

Accuracy for Some College Subset: 0.8529
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.99      0.92      2306
           1       0.81      0.25      0.38       509

    accuracy                           0.85      2815
   macro avg       0.83      0.62      0.65      2815
weighted avg       0.85      0.85      0.82      2815

Accuracy for Less than High School Subset: 0.9312
Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       527
           1       1.00      0.0

4. Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

def evaluate_random_forest(X_train, y_train, X_test, y_test, n_estimators, title, random_state=42):
    """Fit a random forest classifier and print test metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for evaluation.
    n_estimators
        Number of trees, passed to ``RandomForestClassifier(n_estimators=...)``.
    title
        Label inserted into the printed accuracy line.
    random_state
        Seed forwarded to ``RandomForestClassifier`` so that bootstrap
        sampling and per-split feature sampling, and therefore the printed
        metrics, are repeatable across runs. Pass ``None`` to get unseeded
        behaviour.

    Returns
    -------
    None
        The accuracy and the classification report are printed.
    """
    # Train the forest with the chosen number of trees.
    rf_clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    rf_clf.fit(X_train, y_train)

    # Predict on the held-out split.
    y_pred = rf_clf.predict(X_test)

    # Compute and print the performance metrics.
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy for {title}: {accuracy:.4f}')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

# Run the random forest evaluation on each subset, each with its own number of trees.
evaluate_random_forest(
    X_train=X_higher_ed_train,
    y_train=y_higher_ed_train,
    X_test=X_higher_ed_test,
    y_test=y_higher_ed_test,
    n_estimators=100,
    title="Higher Education Subset",
)
evaluate_random_forest(
    X_train=X_some_college_train,
    y_train=y_some_college_train,
    X_test=X_some_college_test,
    y_test=y_some_college_test,
    n_estimators=200,
    title="Some College Subset",
)
evaluate_random_forest(
    X_train=X_less_than_hs_train,
    y_train=y_less_than_hs_train,
    X_test=X_less_than_hs_test,
    y_test=y_less_than_hs_test,
    n_estimators=150,
    title="Less than High School Subset",
)


Accuracy for Higher Education Subset: 0.7977
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.81      0.80       569
           1       0.81      0.78      0.80       573

    accuracy                           0.80      1142
   macro avg       0.80      0.80      0.80      1142
weighted avg       0.80      0.80      0.80      1142
Accuracy for Some College Subset: 0.8561
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.94      0.91      2306
           1       0.64      0.48      0.55       509

    accuracy                           0.86      2815
   macro avg       0.76      0.71      0.73      2815
weighted avg       0.84      0.86      0.85      2815
Accuracy for Less than High School Subset: 0.9330
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.97       527
           1       0.62      0.12 

5. Naïve Bayes Classifier

In [7]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

def evaluate_naive_bayes(X_train, y_train, X_test, y_test, var_smoothing, title):
    """Fit a Gaussian naive Bayes classifier on standardized data and print test metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for evaluation.
    var_smoothing
        Value passed to ``GaussianNB(var_smoothing=...)``.
    title
        Label inserted into the printed accuracy line.

    Returns
    -------
    None
        The accuracy and the classification report are printed.
    """
    # The scaler is fitted on the training split only and reused for the test split.
    standardizer = StandardScaler().fit(X_train)
    train_features = standardizer.transform(X_train)
    test_features = standardizer.transform(X_test)

    # Train with the chosen var_smoothing value and predict the held-out labels.
    model = GaussianNB(var_smoothing=var_smoothing).fit(train_features, y_train)
    predictions = model.predict(test_features)

    # Report the performance metrics.
    accuracy = accuracy_score(y_test, predictions)
    print(f'Accuracy for {title}: {accuracy:.4f}')
    print('Classification Report:')
    print(classification_report(y_test, predictions))

# Run the naive Bayes evaluation on each subset.
evaluate_naive_bayes(
    X_train=X_higher_ed_train,
    y_train=y_higher_ed_train,
    X_test=X_higher_ed_test,
    y_test=y_higher_ed_test,
    var_smoothing=1e-5,
    title="Higher Education Subset",
)
evaluate_naive_bayes(
    X_train=X_some_college_train,
    y_train=y_some_college_train,
    X_test=X_some_college_test,
    y_test=y_some_college_test,
    var_smoothing=1e-5,
    title="Some College Subset",
)
evaluate_naive_bayes(
    X_train=X_less_than_hs_train,
    y_train=y_less_than_hs_train,
    X_test=X_less_than_hs_test,
    y_test=y_less_than_hs_test,
    var_smoothing=1e-5,
    title="Less than High School Subset",
)


Accuracy for Higher Education Subset: 0.6708
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.89      0.73       569
           1       0.80      0.45      0.58       573

    accuracy                           0.67      1142
   macro avg       0.71      0.67      0.65      1142
weighted avg       0.71      0.67      0.65      1142

Accuracy for Some College Subset: 0.8266
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.94      0.90      2306
           1       0.54      0.29      0.38       509

    accuracy                           0.83      2815
   macro avg       0.70      0.62      0.64      2815
weighted avg       0.80      0.83      0.80      2815

Accuracy for Less than High School Subset: 0.9083
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95       527
           1       0.31      0.2

6. Support Vector Machine (SVM)

In [9]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

def evaluate_svm(X_train, y_train, X_test, y_test, kernel, title):
    """Fit a support vector classifier on standardized data and print test metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for evaluation.
    kernel
        Kernel name passed to ``SVC(kernel=...)``.
    title
        Label inserted into the printed accuracy line.

    Returns
    -------
    None
        The accuracy and the classification report are printed.
    """
    # The scaler is fitted on the training split only and reused for the test split.
    standardizer = StandardScaler().fit(X_train)
    train_features = standardizer.transform(X_train)
    test_features = standardizer.transform(X_test)

    # Train with the chosen kernel and predict the held-out labels.
    model = SVC(kernel=kernel).fit(train_features, y_train)
    predictions = model.predict(test_features)

    # Report the performance metrics.
    accuracy = accuracy_score(y_test, predictions)
    print(f'Accuracy for {title}: {accuracy:.4f}')
    print('Classification Report:')
    print(classification_report(y_test, predictions))

# Run the SVM evaluation on each subset.
evaluate_svm(
    X_train=X_higher_ed_train,
    y_train=y_higher_ed_train,
    X_test=X_higher_ed_test,
    y_test=y_higher_ed_test,
    kernel='rbf',
    title="Higher Education Subset",
)
evaluate_svm(
    X_train=X_some_college_train,
    y_train=y_some_college_train,
    X_test=X_some_college_test,
    y_test=y_some_college_test,
    kernel='rbf',
    title="Some College Subset",
)
evaluate_svm(
    X_train=X_less_than_hs_train,
    y_train=y_less_than_hs_train,
    X_test=X_less_than_hs_test,
    y_test=y_less_than_hs_test,
    kernel='rbf',
    title="Less than High School Subset",
)


Accuracy for Higher Education Subset: 0.8074
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.73      0.79       569
           1       0.77      0.89      0.82       573

    accuracy                           0.81      1142
   macro avg       0.82      0.81      0.81      1142
weighted avg       0.82      0.81      0.81      1142
Accuracy for Some College Subset: 0.8515
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.99      0.92      2306
           1       0.79      0.24      0.37       509

    accuracy                           0.85      2815
   macro avg       0.82      0.61      0.64      2815
weighted avg       0.84      0.85      0.82      2815
Accuracy for Less than High School Subset: 0.9295
Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       527
           1       0.50      0.03 