In [26]:
import pandas as pd
import time

# Read the three education-level subsets from the dataset/ directory, one DataFrame per CSV
higher_ed_df, some_college_df, less_than_hs_df = map(
    pd.read_csv,
    (
        'dataset/subset_higher_ed_ednum.csv',
        'dataset/subset_some_college_ednum.csv',
        'dataset/subset_less_than_hs_ednum.csv',
    ),
)

from sklearn.preprocessing import LabelEncoder


# Remove the 'fnlgwt' and 'Education' columns from every subset; they are
# considered not useful or duplicated
higher_ed_df = higher_ed_df.drop(columns=['fnlgwt', 'Education'])
some_college_df = some_college_df.drop(columns=['fnlgwt', 'Education'])
less_than_hs_df = less_than_hs_df.drop(columns=['fnlgwt', 'Education'])

# Encoding categorical features in all datasets
def encode_features(df):
    """Label-encode every object-dtype column of ``df``.

    The frame is modified in place (each object column is overwritten with the
    encoder's output), so pass a copy when the original values must be kept.

    Returns
    -------
    tuple
        ``(df, label_encoders)`` where ``df`` is the same frame that was passed
        in and ``label_encoders`` maps each encoded column name to the
        ``LabelEncoder`` that was fitted on it.
    """
    categorical_columns = df.select_dtypes(include=['object']).columns
    # One independent encoder per column, kept so the mapping can be reused later
    label_encoders = {name: LabelEncoder() for name in categorical_columns}
    for name, encoder in label_encoders.items():
        df[name] = encoder.fit_transform(df[name])
    return df, label_encoders

# Encode a copy of each subset: encode_features overwrites columns of the frame
# it receives, so copying keeps the un-encoded frames intact. The second value
# of each pair is the dict of fitted LabelEncoders keyed by column name.
higher_ed_df_encoded, higher_ed_encoders = encode_features(higher_ed_df.copy())
some_college_df_encoded, some_college_encoders = encode_features(some_college_df.copy())
less_than_hs_df_encoded, less_than_hs_encoders = encode_features(less_than_hs_df.copy())

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import numpy as np
import matplotlib.pyplot as plt


# For each encoded subset: X_* holds every column except 'Income',
# y_* holds the 'Income' column (the prediction target)
X_higher_ed = higher_ed_df_encoded.drop(columns=['Income'])
y_higher_ed = higher_ed_df_encoded['Income']

X_some_college = some_college_df_encoded.drop(columns=['Income'])
y_some_college = some_college_df_encoded['Income']

X_less_than_hs = less_than_hs_df_encoded.drop(columns=['Income'])
y_less_than_hs = less_than_hs_df_encoded['Income']

from sklearn.model_selection import train_test_split

# Hold out 10% of each subset as a test set; the same fixed random_state is used for all three splits
X_higher_ed_train, X_higher_ed_test, y_higher_ed_train, y_higher_ed_test = train_test_split(
    X_higher_ed,
    y_higher_ed,
    test_size=0.1,
    random_state=42,
)

X_some_college_train, X_some_college_test, y_some_college_train, y_some_college_test = train_test_split(
    X_some_college,
    y_some_college,
    test_size=0.1,
    random_state=42,
)

X_less_than_hs_train, X_less_than_hs_test, y_less_than_hs_train, y_less_than_hs_test = train_test_split(
    X_less_than_hs,
    y_less_than_hs,
    test_size=0.1,
    random_state=42,
)

In [27]:
import os

import pandas as pd

# Assuming X_higher_ed_train, y_higher_ed_train, etc. are pandas DataFrames or
# Series, the code below saves each of them to its own CSV file.

# to_csv does not create missing directories, so make sure the output
# directory exists before writing anything into it.
os.makedirs('data', exist_ok=True)

# Save the "Higher Education" subset
X_higher_ed_train.to_csv('data/X_higher_ed_train.csv', index=False)
X_higher_ed_test.to_csv('data/X_higher_ed_test.csv', index=False)
y_higher_ed_train.to_csv('data/y_higher_ed_train.csv', index=False)
y_higher_ed_test.to_csv('data/y_higher_ed_test.csv', index=False)

# Save the "Some College" subset
X_some_college_train.to_csv('data/X_some_college_train.csv', index=False)
X_some_college_test.to_csv('data/X_some_college_test.csv', index=False)
y_some_college_train.to_csv('data/y_some_college_train.csv', index=False)
y_some_college_test.to_csv('data/y_some_college_test.csv', index=False)

# Save the "Less than High School" subset
X_less_than_hs_train.to_csv('data/X_less_than_hs_train.csv', index=False)
X_less_than_hs_test.to_csv('data/X_less_than_hs_test.csv', index=False)
y_less_than_hs_train.to_csv('data/y_less_than_hs_train.csv', index=False)
y_less_than_hs_test.to_csv('data/y_less_than_hs_test.csv', index=False)


1. K-Nearest Neighbors (KNN)

In [28]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

def evaluate_knn(X_train, y_train, X_test, y_test, k, title):
    """Fit a k-nearest-neighbours classifier and print train/test metrics.

    Features are standardised with a ``StandardScaler`` that is fitted on the
    training split only and then applied unchanged to the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels.
    k : int
        Number of neighbours passed to ``KNeighborsClassifier``.
    title : str
        Label printed as a header and embedded in the accuracy lines.

    Returns
    -------
    None
        Accuracy, elapsed time and the classification report are printed for
        the training split and then for the test split.
    """
    print(title)
    # Standardise the training and test data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the model with the chosen k
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can be too coarse for sub-millisecond spans.
    # The train timing covers fitting plus prediction on the training split.
    start_time = time.perf_counter()
    knn_clf.fit(X_train_scaled, y_train)
    y_pred = knn_clf.predict(X_train_scaled)
    end_time = time.perf_counter()

    accuracy = accuracy_score(y_train, y_pred)
    print(f'Train Accuracy for {title}: {accuracy:.4f}, Time is {end_time-start_time:.4f}')
    print('Classification Report:')
    print(classification_report(y_train, y_pred))

    # Predict on the test set
    start_time = time.perf_counter()
    y_pred = knn_clf.predict(X_test_scaled)
    end_time = time.perf_counter()

    # Compute and print the performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Test Accuracy for {title}: {accuracy:.4f}, Time is {end_time-start_time:.4f}')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

# Run the KNN evaluation on each education subset (k=9 for all three)
evaluate_knn(
    X_train=X_higher_ed_train,
    y_train=y_higher_ed_train,
    X_test=X_higher_ed_test,
    y_test=y_higher_ed_test,
    k=9,
    title="Higher Education Subset",
)
evaluate_knn(
    X_train=X_some_college_train,
    y_train=y_some_college_train,
    X_test=X_some_college_test,
    y_test=y_some_college_test,
    k=9,
    title="Some College Subset",
)
evaluate_knn(
    X_train=X_less_than_hs_train,
    y_train=y_less_than_hs_train,
    X_test=X_less_than_hs_test,
    y_test=y_less_than_hs_test,
    k=9,
    title="Less than High School Subset",
)


Higher Education Subset
Train Accuracy for Higher Education Subset: 0.8127, Time is 1.5584
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.79      0.81      5282
           1       0.79      0.83      0.81      4989

    accuracy                           0.81     10271
   macro avg       0.81      0.81      0.81     10271
weighted avg       0.81      0.81      0.81     10271

Test Accuracy for Higher Education Subset: 0.7688, Time is 0.1570
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.74      0.76       569
           1       0.75      0.80      0.78       573

    accuracy                           0.77      1142
   macro avg       0.77      0.77      0.77      1142
weighted avg       0.77      0.77      0.77      1142

Some College Subset
Train Accuracy for Some College Subset: 0.8681, Time is 7.4597
Classification Report:
              precision    recall  f1

2. Artificial Neural Network (ANN)

In [29]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

def evaluate_ann(X_train, y_train, X_test, y_test, hidden_layer_size, title, random_state=42):
    """Fit an ``MLPClassifier`` and print train/test metrics.

    Features are standardised with a ``StandardScaler`` that is fitted on the
    training split only and then applied unchanged to the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels.
    hidden_layer_size : tuple
        Passed to ``MLPClassifier`` as ``hidden_layer_sizes``.
    title : str
        Label printed as a header and embedded in the accuracy lines.
    random_state : int or None, default 42
        Seed forwarded to ``MLPClassifier`` so repeated runs give the same
        network. Pass ``None`` for an unseeded run.

    Returns
    -------
    None
        Accuracy, elapsed time and the classification report are printed for
        the training split and then for the test split.
    """
    print(title)
    # Standardise the training and test data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the model with the chosen hidden-layer configuration.
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can be too coarse and report 0.0000 for fast calls.
    # The train timing covers construction, fitting and prediction on the training split.
    start_time = time.perf_counter()
    ann_clf = MLPClassifier(
        hidden_layer_sizes=hidden_layer_size,
        activation='relu',
        max_iter=1000,
        random_state=random_state,
    )
    ann_clf.fit(X_train_scaled, y_train)
    y_pred = ann_clf.predict(X_train_scaled)
    end_time = time.perf_counter()

    accuracy = accuracy_score(y_train, y_pred)
    print(f'Train Accuracy for {title}: {accuracy:.4f}, Time is {end_time-start_time:.4f}')
    print('Classification Report:')
    print(classification_report(y_train, y_pred))

    # Predict on the test set
    start_time = time.perf_counter()
    y_pred = ann_clf.predict(X_test_scaled)
    end_time = time.perf_counter()

    # Compute and print the performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy for {title}: {accuracy:.4f},Time is {end_time-start_time:.4f}')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

# Run the ANN evaluation on each education subset (one hidden layer of 50 units)
evaluate_ann(
    X_train=X_higher_ed_train,
    y_train=y_higher_ed_train,
    X_test=X_higher_ed_test,
    y_test=y_higher_ed_test,
    hidden_layer_size=(50,),
    title="Higher Education Subset",
)
evaluate_ann(
    X_train=X_some_college_train,
    y_train=y_some_college_train,
    X_test=X_some_college_test,
    y_test=y_some_college_test,
    hidden_layer_size=(50,),
    title="Some College Subset",
)
evaluate_ann(
    X_train=X_less_than_hs_train,
    y_train=y_less_than_hs_train,
    X_test=X_less_than_hs_test,
    y_test=y_less_than_hs_test,
    hidden_layer_size=(50,),
    title="Less than High School Subset",
)


Higher Education Subset
Train Accuracy for Higher Education Subset: 0.8158, Time is 6.9259
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.80      0.82      5282
           1       0.80      0.84      0.82      4989

    accuracy                           0.82     10271
   macro avg       0.82      0.82      0.82     10271
weighted avg       0.82      0.82      0.82     10271

Accuracy for Higher Education Subset: 0.8039,Time is 0.0000
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.76      0.79       569
           1       0.78      0.84      0.81       573

    accuracy                           0.80      1142
   macro avg       0.81      0.80      0.80      1142
weighted avg       0.81      0.80      0.80      1142

Some College Subset
Train Accuracy for Some College Subset: 0.8568, Time is 14.6042
Classification Report:
              precision    recall  f1-scor

3. Decision Tree

In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

def evaluate_decision_tree(X_train, y_train, X_test, y_test, max_depth, title, random_state=42):
    """Fit a ``DecisionTreeClassifier`` and print train/test metrics.

    The features are used as given; no scaling is applied.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels.
    max_depth : int
        Maximum tree depth passed to ``DecisionTreeClassifier``.
    title : str
        Label printed as a header and embedded in the accuracy lines.
    random_state : int or None, default 42
        Seed forwarded to ``DecisionTreeClassifier`` so repeated runs build the
        same tree. Pass ``None`` for an unseeded run.

    Returns
    -------
    None
        Accuracy, elapsed time and the classification report are printed for
        the training split and then for the test split.
    """
    # Train the model with the chosen maximum depth
    print(title)
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can be too coarse for calls this fast.
    # The train timing covers construction, fitting and prediction on the training split.
    start_time = time.perf_counter()
    dt_clf = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
    dt_clf.fit(X_train, y_train)
    y_pred = dt_clf.predict(X_train)
    end_time = time.perf_counter()

    accuracy = accuracy_score(y_train, y_pred)
    print(f'Train Accuracy for {title}: {accuracy:.4f}, Time is {end_time-start_time:.4f}')
    print('Classification Report:')
    print(classification_report(y_train, y_pred))

    # Predict on the test set
    start_time = time.perf_counter()
    y_pred = dt_clf.predict(X_test)
    end_time = time.perf_counter()
    # Compute and print the performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy for {title}: {accuracy:.4f},Time is {end_time-start_time:.4f}')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

# Run the decision-tree evaluation on each education subset, each with its own max_depth
evaluate_decision_tree(
    X_train=X_higher_ed_train,
    y_train=y_higher_ed_train,
    X_test=X_higher_ed_test,
    y_test=y_higher_ed_test,
    max_depth=7,
    title="Higher Education Subset",
)
evaluate_decision_tree(
    X_train=X_some_college_train,
    y_train=y_some_college_train,
    X_test=X_some_college_test,
    y_test=y_some_college_test,
    max_depth=5,
    title="Some College Subset",
)
evaluate_decision_tree(
    X_train=X_less_than_hs_train,
    y_train=y_less_than_hs_train,
    X_test=X_less_than_hs_test,
    y_test=y_less_than_hs_test,
    max_depth=3,
    title="Less than High School Subset",
)


Higher Education Subset
Train Accuracy for Higher Education Subset: 0.8144, Time is 0.0175
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.78      0.81      5282
           1       0.79      0.85      0.82      4989

    accuracy                           0.81     10271
   macro avg       0.82      0.82      0.81     10271
weighted avg       0.82      0.81      0.81     10271

Accuracy for Higher Education Subset: 0.8170,Time is 0.0010
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.77      0.81       569
           1       0.79      0.86      0.83       573

    accuracy                           0.82      1142
   macro avg       0.82      0.82      0.82      1142
weighted avg       0.82      0.82      0.82      1142

Some College Subset
Train Accuracy for Some College Subset: 0.8522, Time is 0.0326
Classification Report:
              precision    recall  f1-score

4. Random Forest

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

def evaluate_random_forest(X_train, y_train, X_test, y_test, n_estimators, title, random_state=42):
    """Fit a ``RandomForestClassifier`` and print train/test metrics.

    The features are used as given; no scaling is applied.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels.
    n_estimators : int
        Number of trees passed to ``RandomForestClassifier``.
    title : str
        Label printed as a header and embedded in the accuracy lines.
    random_state : int or None, default 42
        Seed forwarded to ``RandomForestClassifier`` so repeated runs build the
        same forest. Pass ``None`` for an unseeded run.

    Returns
    -------
    None
        Accuracy, elapsed time and the classification report are printed for
        the training split and then for the test split.
    """
    # Train the model with the chosen number of trees
    print(title)
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals, unlike the wall-clock time.time().
    # The train timing covers construction, fitting and prediction on the training split.
    start_time = time.perf_counter()
    rf_clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    rf_clf.fit(X_train, y_train)
    y_pred = rf_clf.predict(X_train)
    end_time = time.perf_counter()

    accuracy = accuracy_score(y_train, y_pred)
    print(f'Train Accuracy for {title}: {accuracy:.4f}, Time is {end_time-start_time:.4f}')
    print('Classification Report:')
    print(classification_report(y_train, y_pred))

    # Predict on the test set
    start_time = time.perf_counter()
    y_pred = rf_clf.predict(X_test)
    end_time = time.perf_counter()
    # Compute and print the performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy for {title}: {accuracy:.4f},Time is {end_time-start_time:.4f}')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

# Run the random-forest evaluation on each education subset, each with its own tree count
evaluate_random_forest(
    X_train=X_higher_ed_train,
    y_train=y_higher_ed_train,
    X_test=X_higher_ed_test,
    y_test=y_higher_ed_test,
    n_estimators=100,
    title="Higher Education Subset",
)
evaluate_random_forest(
    X_train=X_some_college_train,
    y_train=y_some_college_train,
    X_test=X_some_college_test,
    y_test=y_some_college_test,
    n_estimators=200,
    title="Some College Subset",
)
evaluate_random_forest(
    X_train=X_less_than_hs_train,
    y_train=y_less_than_hs_train,
    X_test=X_less_than_hs_test,
    y_test=y_less_than_hs_test,
    n_estimators=150,
    title="Less than High School Subset",
)


Higher Education Subset
Train Accuracy for Higher Education Subset: 0.9723, Time is 1.4432
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      5282
           1       0.96      0.98      0.97      4989

    accuracy                           0.97     10271
   macro avg       0.97      0.97      0.97     10271
weighted avg       0.97      0.97      0.97     10271

Accuracy for Higher Education Subset: 0.8047,Time is 0.0390
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.82      0.81       569
           1       0.82      0.79      0.80       573

    accuracy                           0.80      1142
   macro avg       0.81      0.80      0.80      1142
weighted avg       0.81      0.80      0.80      1142

Some College Subset
Train Accuracy for Some College Subset: 0.9653, Time is 5.9935
Classification Report:
              precision    recall  f1-score

5. Naïve Bayes Classifier

In [32]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

def evaluate_naive_bayes(X_train, y_train, X_test, y_test, var_smoothing, title):
    """Fit a ``GaussianNB`` classifier and print train/test metrics.

    Features are standardised with a ``StandardScaler`` that is fitted on the
    training split only and then applied unchanged to the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels.
    var_smoothing : float
        Passed to ``GaussianNB`` as ``var_smoothing``.
    title : str
        Label printed as a header and embedded in the accuracy lines.

    Returns
    -------
    None
        Accuracy, elapsed time and the classification report are printed for
        the training split and then for the test split.
    """
    # Standardise the training and test data
    print(title)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the model with the chosen var_smoothing value.
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can be too coarse and report 0.0000 for fast calls.
    # The train timing covers construction, fitting and prediction on the training split.
    start_time = time.perf_counter()
    nb_clf = GaussianNB(var_smoothing=var_smoothing)
    nb_clf.fit(X_train_scaled, y_train)
    y_pred = nb_clf.predict(X_train_scaled)
    end_time = time.perf_counter()

    accuracy = accuracy_score(y_train, y_pred)
    print(f'Train Accuracy for {title}: {accuracy:.4f}, Time is {end_time-start_time:.4f}')
    print('Classification Report:')
    print(classification_report(y_train, y_pred))

    # Predict on the test set
    start_time = time.perf_counter()
    y_pred = nb_clf.predict(X_test_scaled)
    end_time = time.perf_counter()
    # Compute and print the performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy for {title}: {accuracy:.4f},Time is {end_time-start_time:.4f}')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

# Run the naive-Bayes evaluation on each education subset (var_smoothing=1e-5 for all three)
evaluate_naive_bayes(
    X_train=X_higher_ed_train,
    y_train=y_higher_ed_train,
    X_test=X_higher_ed_test,
    y_test=y_higher_ed_test,
    var_smoothing=1e-5,
    title="Higher Education Subset",
)
evaluate_naive_bayes(
    X_train=X_some_college_train,
    y_train=y_some_college_train,
    X_test=X_some_college_test,
    y_test=y_some_college_test,
    var_smoothing=1e-5,
    title="Some College Subset",
)
evaluate_naive_bayes(
    X_train=X_less_than_hs_train,
    y_train=y_less_than_hs_train,
    X_test=X_less_than_hs_test,
    y_test=y_less_than_hs_test,
    var_smoothing=1e-5,
    title="Less than High School Subset",
)


Higher Education Subset
Train Accuracy for Higher Education Subset: 0.6761, Time is 0.0033
Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.92      0.75      5282
           1       0.84      0.41      0.55      4989

    accuracy                           0.68     10271
   macro avg       0.73      0.67      0.65     10271
weighted avg       0.73      0.68      0.65     10271

Accuracy for Higher Education Subset: 0.6708,Time is 0.0000
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.89      0.73       569
           1       0.80      0.45      0.58       573

    accuracy                           0.67      1142
   macro avg       0.71      0.67      0.65      1142
weighted avg       0.71      0.67      0.65      1142

Some College Subset
Train Accuracy for Some College Subset: 0.8219, Time is 0.0110
Classification Report:
              precision    recall  f1-score

6. Support Vector Machine (SVM)

In [33]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

def evaluate_svm(X_train, y_train, X_test, y_test, kernel, title):
    """Fit an ``SVC`` classifier and print train/test metrics.

    Features are standardised with a ``StandardScaler`` that is fitted on the
    training split only and then applied unchanged to the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels.
    kernel : str
        Kernel name passed to ``SVC``.
    title : str
        Label printed as a header and embedded in the accuracy lines.

    Returns
    -------
    None
        Accuracy, elapsed time and the classification report are printed for
        the training split and then for the test split.
    """
    # Standardise the training and test data
    print(title)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the model with the chosen kernel.
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals, unlike the wall-clock time.time().
    # The train timing covers construction, fitting and prediction on the training split.
    start_time = time.perf_counter()
    svm_clf = SVC(kernel=kernel)
    svm_clf.fit(X_train_scaled, y_train)
    y_pred = svm_clf.predict(X_train_scaled)
    end_time = time.perf_counter()

    accuracy = accuracy_score(y_train, y_pred)
    print(f'Train Accuracy for {title}: {accuracy:.4f}, Time is {end_time-start_time:.4f}')
    print('Classification Report:')
    print(classification_report(y_train, y_pred))

    # Predict on the test set
    start_time = time.perf_counter()
    y_pred = svm_clf.predict(X_test_scaled)
    end_time = time.perf_counter()
    # Compute and print the performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy for {title}: {accuracy:.4f},Time is {end_time-start_time:.4f}')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

# Run the SVM evaluation on each education subset (RBF kernel for all three)
evaluate_svm(
    X_train=X_higher_ed_train,
    y_train=y_higher_ed_train,
    X_test=X_higher_ed_test,
    y_test=y_higher_ed_test,
    kernel='rbf',
    title="Higher Education Subset",
)
evaluate_svm(
    X_train=X_some_college_train,
    y_train=y_some_college_train,
    X_test=X_some_college_test,
    y_test=y_some_college_test,
    kernel='rbf',
    title="Some College Subset",
)
evaluate_svm(
    X_train=X_less_than_hs_train,
    y_train=y_less_than_hs_train,
    X_test=X_less_than_hs_test,
    y_test=y_less_than_hs_test,
    kernel='rbf',
    title="Less than High School Subset",
)


Higher Education Subset
Train Accuracy for Higher Education Subset: 0.8063, Time is 8.6322
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.74      0.80      5282
           1       0.76      0.87      0.81      4989

    accuracy                           0.81     10271
   macro avg       0.81      0.81      0.81     10271
weighted avg       0.81      0.81      0.81     10271
Accuracy for Higher Education Subset: 0.8074,Time is 0.6046
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.73      0.79       569
           1       0.77      0.89      0.82       573

    accuracy                           0.81      1142
   macro avg       0.82      0.81      0.81      1142
weighted avg       0.82      0.81      0.81      1142

Some College Subset
Train Accuracy for Some College Subset: 0.8486, Time is 38.6056
Classification Report:
              precision    recall  f1-score