In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# 1. Load the income dataset from a CSV file in the working directory.
data = pd.read_csv('income_classification.csv')

# 2. Report the table's dimensions, then preview its first five rows.
data_shape = data.shape
first_rows = data.head()
print("数据维度:", data_shape)
print("前5行数据:\n", first_rows)

# 3. Discretize the continuous age column into integer bin indices.
# pd.cut assigns NaN to any value outside the outermost edges, so with edges
# running only from 20 to 70 every age of 20 or below, or above 70, would be
# turned into a missing value. Open-ended first and last bins keep those rows:
# bin 0 is age <= 20, bins 1-5 are the ten-year ranges (20, 30] ... (60, 70],
# and bin 6 is age > 70.
age_bins = [-np.inf, 20, 30, 40, 50, 60, 70, np.inf]
data['age'] = pd.cut(data['age'], bins=age_bins, labels=False)
print("离散化后的前5行数据:\n", data.head())

# 4. Replace every string-typed (object dtype) column with integer codes.
# One fitted encoder per column is kept in label_encoders, keyed by column
# name, so the codes can be mapped back to the original strings later.
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    data[name] = encoder.fit_transform(data[name])
print("编码后的前5行数据:\n", data.head())

# 5. Choose the predictor columns and the target column.
features = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
]
X, y = data[features], data['income']

# 6. Split 7:3 into training and test sets; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# 7. Fit a decision tree on the training split and measure its accuracy on
# the held-out test split.
clf_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_tree = clf_tree.predict(X_test)
accuracy_tree = accuracy_score(y_true=y_test, y_pred=y_pred_tree)
print("决策树分类准确率:", accuracy_tree)

# 8. Do the same with a random forest, using the same seed and the same split.
clf_random = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_random = clf_random.predict(X_test)
accuracy_random = accuracy_score(y_true=y_test, y_pred=y_pred_random)
print("随机森林分类准确率:", accuracy_random)

# 9. Compare the two test accuracies and report which model did better.
# A tie gets its own message so that equal scores are not reported as a win
# for the random forest.
if accuracy_tree > accuracy_random:
    print("决策树分类器的准确率更高。")
elif accuracy_tree < accuracy_random:
    print("随机森林分类器的准确率更高。")
else:
    print("两种分类器的准确率相同。")


数据维度: (32561, 15)
前5行数据:
    age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0 