In [1]:
# data manipulation
import pandas as pd
import numpy as np

# visualiation
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# model evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score


from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.linear_model import LogisticRegression # logistic regression
from sklearn.tree import DecisionTreeClassifier # decision tree
from sklearn.ensemble import RandomForestClassifier # random forest
from sklearn.ensemble import GradientBoostingClassifier # gradient boosting
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn import svm
import category_encoders as ce
ori_demo = pd.read_csv("../data/train.csv")
ori_test_input = pd.read_csv("../data/test.csv")
# test_data = pd.read_csv("../data/test.csv")

In [2]:
# preparation to preprocess
print("shape of the dataframe:",ori_demo.shape)
category_features = []
continuous_features = []
for (column,dtype) in zip((ori_demo.drop(columns="exceeds50K")).columns, (ori_demo.drop(columns="exceeds50K")).dtypes):
    if dtype=="int64":
        continuous_features.append(column)
    else:
        category_features.append(column)
print("category feature:", category_features)
print("continuous feature:",continuous_features)

shape of the dataframe: (24421, 14)
category feature: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'sex', 'native-country']
continuous feature: ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']


In [None]:
# missing values print
# deal ? as a new value
print('Before replacing missing values: ')
for column in ori_demo.columns:
    missing_values = pd.isna(ori_demo[column]).sum()
    print(column," has ",missing_values," missing values")
    
# deal ? as missing value    
for column in category_features:
    cates = ori_demo[column].unique().tolist()
    for cate in cates:
        if "?" in cate:
            print(column, " has missing values!")
            print(ori_demo[column].value_counts())
            break

In [None]:
# visualization
outcomes = ori_demo["exceeds50K"]
print(ori_demo["exceeds50K"].value_counts())
sb.countplot(x="exceeds50K",data=ori_demo).set_title("number of different outputs")
nums_of_cates = []
plt.figure(figsize=(10,3))
plt.title("unique values of category attributes")
for column in category_features:
    nums_of_cates.append(len(ori_demo[column].value_counts().tolist()))
plt.bar(category_features, nums_of_cates)
print(nums_of_cates)
mins_of_conts = []
maxs_of_conts = []
plt.figure(figsize=(10,5))
plt.title("scale of continuous attribtues")
for column in continuous_features:
    mins_of_conts.append(ori_demo[column].min())
    maxs_of_conts.append(ori_demo[column].max())
plt.bar(continuous_features,maxs_of_conts,bottom=mins_of_conts,edgecolor = 'white')
plt.yscale("log")
plt.show()
scales = []
for i in range(len(continuous_features)):
    scales.append(maxs_of_conts[i] - mins_of_conts[i])
print(scales)

In [None]:
# preprocess with missing values(replace ? using most common value)


# operation
ori_demo = ori_demo.fillna(ori_demo.mean())
# print('\nAfter replacing missing values: ')
# for column, mean_value in zip(ori_demo.columns, ori_demo.mean()):
#     missing_values = pd.isna(ori_demo[column]).sum()
for column in category_features:
    mean_value = ori_demo[column].value_counts().index[0]
#     print("mean", mean_value)
    ori_demo[column] = ori_demo[column].replace(regex=r'^.*\?.*$',value=mean_value)
#     print(column, ori_demo[column].unique())
def basic_preprocess(data):
    for column in category_features:
        mean_value = data[column].value_counts().index[0]
#         print("mean", mean_value)
        data[column] = data[column].replace(regex=r'^.*\?.*$',value=mean_value)
        print("column name:",column,", all values:", data[column].unique())
    return data
ori_test_input = basic_preprocess(ori_test_input)

In [5]:
# Class count and resampling
def over_sampling(train_data, target_label):
    count_0, count_1 = train_data[target_label].value_counts()
    # Divide by class
    train_data_0 = train_data[train_data[target_label] == 0]
    train_data_1 = train_data[train_data[target_label] == 1]

    train_data_1_over = train_data_1.sample(count_0, replace=True)
    train_data_over = pd.concat([train_data_0, train_data_1_over], axis=0)
    return train_data_over


def under_sampling(train_data, target_label):
    count_0, count_1 = train_data[target_label].value_counts()
    # Divide by class
    train_data_0 = train_data[train_data[target_label] == 0]
    train_data_1 = train_data[train_data[target_label] == 1]

    train_data_0_under = train_data_0.sample(count_1, replace=True)
    train_data_under = pd.concat([train_data_0_under, train_data_1], axis=0)
    return train_data_under


ori_demo_over = over_sampling(ori_demo, "exceeds50K")
ori_demo_under = under_sampling(ori_demo,"exceeds50K")

# demo_over["exceeds50K"].value_counts().plot(kind='bar', title='Count (output)');
# demo_under["exceeds50K"].value_counts().plot(kind='bar', title='Count (output)');

In [3]:
# results over train set
def test_clf_on_train(clf, train, output_label):
    x = train.drop(columns=output_label)
    y = train[output_label]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state=10)
    clf.fit(x_train,y_train)
    y_pred = clf.predict(x_test) 
    f1 = round(f1_score(y_test, y_pred, average='weighted') * 100, 2)
    acc = round(accuracy_score(y_test, y_pred) * 100, 2)
    return f1, acc


def show_result(data, Need_SVM=False):   
    if Need_SVM:
        model_names = ['KNN', 'LR', 'DT', 'RF', 'GBM','GaussianNB','SVM'] 
    else:
        model_names = ['KNN', 'LR', 'DT', 'RF', 'GBM','GaussianNB']
    all_acc = []
    all_f1 = []
    # 1. KNN
    clf_knn = KNeighborsClassifier()
    acc, f1 = test_clf_on_train(clf_knn, data, data.columns[-1])
    all_acc.append(acc)
    all_f1.append(f1)

    # 2. logistic regression
    clf_LR = LogisticRegression()
    acc, f1 = test_clf_on_train(clf_LR, data, data.columns[-1])
    all_acc.append(acc)
    all_f1.append(f1)

    # 3. DecisionTree
    clf_DT = DecisionTreeClassifier()
    acc, f1 = test_clf_on_train(clf_DT, data, data.columns[-1])
    all_acc.append(acc)
    all_f1.append(f1)

    # 4. RandomForest
    clf_RF = RandomForestClassifier()
    acc, f1 = test_clf_on_train(clf_RF, data, data.columns[-1])
    all_acc.append(acc)
    all_f1.append(f1)

    # 5. Gradient Boosting
    clf_GB = GradientBoostingClassifier()
    f1, acc = test_clf_on_train(clf_GB, data, data.columns[-1])
    all_acc.append(acc)
    all_f1.append(f1)
    
    # 6. Gaussian Naive Bayes
    clf_GussianNB = GaussianNB()
    f1, acc = test_clf_on_train(clf_GussianNB, data, data.columns[-1])
    all_acc.append(acc)
    all_f1.append(f1)
    
    # 7. SVM # SVM
    if Need_SVM:  
        svclassifier = svm.SVC(kernel='linear')
        acc,f1 = test_clf_on_train(svclassifier, data, data.columns[-1])
        all_acc.append(acc)
        all_f1.append(f1)
    
    results = pd.DataFrame({'Model': model_names, 'acc': all_acc, 'f1': all_f1})
    return results

In [None]:
# simple onehot preprocess
def train_data_simple_preprocess(train, output_label=""):
#     ori_train = train.copy()
    train = pd.get_dummies(train)
    train_input = train
    scalar = StandardScaler()
    scalar.fit(train.drop(columns=output_label))
    train_input = scalar.transform(train.drop(columns=output_label))
    
    new_train = pd.DataFrame(data=train_input, columns=train.drop(columns=output_label).columns)
    new_train[output_label] = train[output_label]
    return train_input, train[output_label], new_train

print(ori_demo.shape)
demo_input, demo_output, demo = train_data_simple_preprocess(ori_demo, "exceeds50K")
print(demo.shape)
# print&save onthot result
demo_over = over_sampling(demo, "exceeds50K")
demo_under = under_sampling(demo,"exceeds50K")
result = show_result(demo)
print("original:")
print(result)
result_over = show_result(demo_over)
print("original over:")
print(result_over)
result_under = show_result(demo_under)
print("original under:")
print(result_under)
result.to_csv("result_simple_onehot.csv")
result_over.to_csv("result_simple_onehot_over.csv")
result_under.to_csv("result_simple_onehot_under.csv") 

In [6]:
# simple onehot preprocess without normalization
def train_data_simple_without_normalization_preprocess(train, output_label=""):
#     ori_train = train.copy()
    train = pd.get_dummies(train)
    train_input = train
#     scalar = StandardScaler()
#     scalar.fit(train.drop(columns=output_label))
#     train_input = scalar.transform(train.drop(columns=output_label))
    
    new_train = pd.DataFrame(data=train_input, columns=train.drop(columns=output_label).columns)
    new_train[output_label] = train[output_label]
    return train_input, train[output_label], new_train

print(ori_demo.shape)
demo_input, demo_output, demo = train_data_simple_without_normalization_preprocess(ori_demo, "exceeds50K")
print(demo.shape)
# print&save onthot result
demo_over = over_sampling(demo, "exceeds50K")
demo_under = under_sampling(demo,"exceeds50K")
result = show_result(demo)
print("original:")
print(result)
result_over = show_result(demo_over)
print("original over:")
print(result_over)
result_under = show_result(demo_under)
print("original under:")
print(result_under)
result.to_csv("result_simple_onehot.csv")
result_over.to_csv("result_simple_onehot_over.csv")
result_under.to_csv("result_simple_onehot_under.csv") 

(24421, 14)
(24421, 103)
original:
        Model    acc     f1
0         KNN  74.62  77.22
1          LR  75.97  79.50
2          DT  81.34  81.31
3          RF  84.40  84.77
4         GBM  86.54  85.94
5  GaussianNB  79.17  76.32
original over:
        Model    acc     f1
0         KNN  72.77  72.93
1          LR  60.90  61.34
2          DT  91.13  91.15
3          RF  92.84  92.86
4         GBM  84.42  84.40
5  GaussianNB  62.65  58.60
original under:
        Model    acc     f1
0         KNN  61.87  61.89
1          LR  58.51  62.37
2          DT  80.10  80.11
3          RF  84.31  84.31
4         GBM  84.65  84.64
5  GaussianNB  63.25  59.24


In [None]:
# hash category preprocess
print(ori_demo.shape)
def train_data_with_hash_cate_preprocess(train, output_label="", threshold = 16, target_component = 16):
    ce_hash = ce.HashingEncoder()
    new_train = train.copy()
    new_train = new_train.drop(columns=output_label)
    for (column,dtype) in zip((train.drop(columns=output_label)).columns, (train.drop(columns=output_label)).dtypes):
#         print("column:",column)
        if dtype == "int64":
            continue
        if train[column].unique().shape[0] > threshold:
            temp_df = ce_hash.hashing_trick(train.loc[:, [column]], N=target_component)
            for i, temp_col in zip(range(target_component), temp_df.columns):
                new_train[column + str(i)] = temp_df[temp_col]
            new_train = new_train.drop(columns = column)
    new_train = pd.get_dummies(new_train)
    scalar = StandardScaler()
    scalar.fit(new_train)
    train_input = scalar.transform(new_train)
#     new_train = pd.DataFrame(data=train_input, columns=train.drop(columns=output_label).columns)
    new_train[output_label] = train[output_label]
    return train_input, train[output_label], new_train

demo_hash_input, demo_hash_output, demo_hash = train_data_with_hash_cate_preprocess(ori_demo, output_label = "exceeds50K")
demo_hash_over =  over_sampling(demo_hash, "exceeds50K")
demo_hash_under = under_sampling(demo_under, "exceeds50K")
# print&save hash result
result_hash = show_result(demo_hash)
print("hash:")
print(result_hash)
result_hash_over = show_result(demo_hash_over)
print("hash over:")
print(result_hash_over)
result_hash_under = show_result(demo_hash_under)
print("hash under:")
print(result_hash_under)
result_hash.to_csv("result_hash16.csv")
result_hash_over.to_csv("result_hash16_over.csv")
result_hash_under.to_csv("results_hash16_under.csv")

In [None]:
# target encode category preprocess
# target category, and normalize continuous

print(ori_demo.shape)
def train_data_with_target_cate_preprocess(train, output_label=""):
    ce_target = ce.TargetEncoder()
    new_train = train.copy()
    new_train = new_train.drop(columns=output_label)
    scalar = StandardScaler()
    continuous_train = train[continuous_features]
    scalar.fit(continuous_train)
    new_continuous_train = scalar.transform(continuous_train)
    category_train = train[category_features]
    for column in category_train.columns:
        ce_target.fit(category_train[column].values, train[output_label].values)
        temp_df = ce_target.transform(train[column].values, train[output_label].values)
        new_train[column] = temp_df
    new_train[continuous_features] = new_continuous_train
    new_train[output_label] = train[output_label]
    train_input = new_train.drop(columns=output_label)
    return train_input, train[output_label], new_train

demo_target_input, demo_target_output, demo_target = train_data_with_target_cate_preprocess(ori_demo, output_label = "exceeds50K")
# # --- End of your code ---
print(demo_target_input.shape)
demo_target.head()

# target show result
demo_target_over = over_sampling(demo_target,"exceeds50K")
demo_target_under = under_sampling(demo_target,"exceeds50K")
result_target = show_result(demo_target)
print("target encoder:")
print(result_target)
result_target_over = show_result(demo_target_over)
print("target encoder over:")
print(result_target_over)
result_target_under = show_result(demo_target_under)
print("target encoder under:")
print(result_target_under)
result_target.to_csv("result_target_encode.csv")
result_target_over.to_csv("result_target_encode_over.csv")
result_target_under.to_csv("result_target_encode_under.csv")

In [None]:
# leave one out encode category preprocess
# leave one out category, and normalize continuous

print(ori_demo.shape)
def train_data_with_leaveoneout_cate_preprocess(train, output_label=""):
    ce_leave = ce.LeaveOneOutEncoder()     
    # Must pass the series for y
    new_train = train.copy()
    new_train = new_train.drop(columns=output_label)
    scalar = StandardScaler()
    continuous_train = train[continuous_features]
    scalar.fit(continuous_train)
    new_continuous_train = scalar.transform(continuous_train)
    category_train = train[category_features]
    for column in category_train.columns:
        ce_leave.fit(category_train[column].values, train[output_label].values)
        temp_df = ce_leave.transform(train[column].values, train[output_label].values)
        new_train[column] = temp_df
    new_train[continuous_features] = new_continuous_train
    new_train[output_label] = train[output_label]
    train_input = new_train.drop(columns=output_label)
    return train_input, train[output_label], new_train

demo_leave_input, demo_leave_output, demo_leave = train_data_with_leaveoneout_cate_preprocess(ori_demo, output_label = "exceeds50K")
# # --- End of your code ---
print(demo_leave_input.shape)
demo_leave.head()

# print leaveoneout result
demo_leave_over = over_sampling(demo_leave, "exceeds50K")
demo_leave_under = under_sampling(demo_leave, "exceeds50K")
result_leave = show_result(demo_leave)
print("leavebutone:")
print(result_leave)
result_leave_over = show_result(demo_leave_over)
print("leavebutone over:")
print(result_leave_over)
result_leave_under = show_result(demo_leave_under)
print("leavebutone under:")
print(result_leave_under)
result_leave.to_csv("result_leave_encode.csv")
result_leave_over.to_csv("result_leave_encode_over.csv")
result_leave_under.to_csv("result_leave_encode_under.csv")

In [None]:
# smooth target  encode category preprocess
# target category, and normalize continuous

print(ori_demo.shape)
def train_data_with_target_cate_preprocess(train, output_label="", smooth = 10):
    ce_target = ce.TargetEncoder(smoothing = smooth)
    new_train = train.copy()
    new_train = new_train.drop(columns=output_label)
    scalar = StandardScaler()
    continuous_train = train[continuous_features]
    scalar.fit(continuous_train)
    new_continuous_train = scalar.transform(continuous_train)
    category_train = train[category_features]
    for column in category_train.columns:
        ce_target.fit(category_train[column].values, train[output_label].values)
        temp_df = ce_target.transform(train[column].values, train[output_label].values)
        new_train[column] = temp_df
    new_train[continuous_features] = new_continuous_train
    new_train[output_label] = train[output_label]
    train_input = new_train.drop(columns=output_label)
    return train_input, train[output_label], new_train

demo_sm_target_input, demo_sm_target_output, demo_sm_target = train_data_with_target_cate_preprocess(ori_demo, output_label = "exceeds50K")
print(demo_sm_target_input.shape)
demo_sm_target.head()

# print smooth_target result
demo_sm_target_over = over_sampling(demo_sm_target,"exceeds50K")
demo_sm_target_under = under_sampling(demo_sm_target,"exceeds50K")
result_sm_target = show_result(demo_sm_target)
print("smooth target encoder:")
print(result_sm_target)
result_sm_target_over = show_result(demo_sm_target_over)
print("smooth target encoder over:")
print(result_sm_target_over)
result_sm_target_under = show_result(demo_sm_target_under)
print("smooth target encoder under:")
print(result_sm_target_under)
result_sm_target.to_csv("result_smooth_target.csv")
result_sm_target_over.to_csv("result_smooth_target_over.csv")
result_sm_target_under.to_csv("result_smooth_target_under.csv") 

In [None]:
#james_stein
print(ori_demo.shape)
def train_data_with_james_cate_preprocess(train, output_label="", smooth = 10):
    ce_james = ce.james_stein.JamesSteinEncoder()
    new_train = train.copy()
    new_train = new_train.drop(columns=output_label)
    scalar = StandardScaler()
    continuous_train = train[continuous_features]
    scalar.fit(continuous_train)
    new_continuous_train = scalar.transform(continuous_train)
    category_train = train[category_features]
    for column in category_train.columns:
        ce_james.fit(category_train[column].values, train[output_label].values)
        temp_df = ce_james.transform(train[column].values, train[output_label].values)
        new_train[column] = temp_df
    new_train[continuous_features] = new_continuous_train
    new_train[output_label] = train[output_label]
    train_input = new_train.drop(columns=output_label)
    return train_input, train[output_label], new_train

demo_james_input, demo_james_output, demo_james = train_data_with_james_cate_preprocess(ori_demo, output_label = "exceeds50K")
print(demo_james_input.shape)
demo_james.head()

# print ames result
demo_james_over = over_sampling(demo_james,"exceeds50K")
demo_james_under = under_sampling(demo_james,"exceeds50K")
result_james = show_result(demo_james)
print("james encoder:")
print(result_james)
result_james_over = show_result(demo_james_over)
print("james encoder over:")
print(result_james_over)
result_james_under = show_result(demo_james_under)
print("james encoder under:")
print(result_james_under)
result_james.to_csv("result_james.csv")
result_james_over.to_csv("result_james_over.csv")
result_james_under.to_csv("result_james_under.csv") 

In [19]:
# deal with new catogory and test data preprocess
def preprocess_new_catogory(train, test, mode = 0):
    # must ensure train and test have same columns
    # mode = 0, use the most often value
    # mode = 1, use arbitrary value
    i = 0
    for column1, dtype, column2 in zip(train.columns, train.dtypes, test.columns):
        if column1!=column2:
            continue
        if dtype == "int64":
            continue
#             print("start!")
        catos = train[column1].unique()
#         print("column",column,"test[column]",test[column])
        temp = test[column2].apply(lambda x: False if x in catos else True)
#         print(temp)
        if test[temp].shape[0]>0:
            print(column1, column2, test[temp].shape[0]," needs change")
            candi = test.loc[temp, column2]
            print(candi, " not included")
            if mode == 0:
                counts = test[column2].value_counts().index.tolist()
                for count in counts:
                    if count in catos:
                        test.loc[temp, column2]=count
                        break
        i += 1
    print(i," columns have been detected!")
    return test

def test_data_simple_preprocess(test):
    test = pd.get_dummies(test)
    scalar = StandardScaler()
    scalar.fit(test)
    test_input = scalar.transform(test)
    new_test = pd.DataFrame(data=test_input, columns=test.columns)
    return new_test

def test_data_simple_without_normalization_preprocess(test):
    test_input = pd.get_dummies(test)
#     scalar = StandardScaler()
#     scalar.fit(test)
#     test_input = scalar.transform(test)
    new_test = pd.DataFrame(data=test_input, columns=test_input.columns)
    return new_test

def test_data_with_hash_cate_preprocess(test, threshold = 8, target_component = 8):
    ce_hash = ce.HashingEncoder()
    new_test = test.copy()
    for (column,dtype) in zip(test.columns, test.dtypes):
        print("column:",column)
        if dtype == "int64":
            continue
        if test[column].unique().shape[0] > threshold:
            temp_df = ce_hash.hashing_trick(test.loc[:, [column]], N=target_component)
            for i, temp_col in zip(range(target_component), temp_df.columns):
                new_test[column + str(i)] = temp_df[temp_col]
            new_test = new_test.drop(columns = column)
    new_test = pd.get_dummies(new_test)
    scalar = StandardScaler()
    scalar.fit(new_test)
    test_input = scalar.transform(new_test)
    return test_input


def test_data_with_target_cate_preprocess(test, train, output_label="exceeds50K"):
    new_test = test.copy()
    scalar = StandardScaler()
    continuous_test = test[continuous_features]
    scalar.fit(continuous_test)
    new_continuous_test = scalar.transform(continuous_test)
    category_test = test[category_features]
    for column in category_test.columns:
        ce_target = ce.TargetEncoder()
        ce_target.fit(train[column].values, train[output_label].values)
        temp_df = ce_target.transform(test[column].values, y=None)
        new_test[column] = temp_df
    new_test[continuous_features] = new_continuous_test
    return new_test


def test_data_with_leaveoneout_cate_preprocess(test, train, output_label="exceeds50K"):
    new_test = test.copy()
    scalar = StandardScaler()
    continuous_test = test[continuous_features]
    scalar.fit(continuous_test)
    new_continuous_test = scalar.transform(continuous_test)
    category_test = test[category_features]
    for column in category_test.columns:
        ce_leave = ce.LeaveOneOutEncoder() 
        ce_leave.fit(train[column].values, train[output_label].values)
        temp_df = ce_leave.transform(test[column].values, y=None)
        new_test[column] = temp_df
    new_test[continuous_features] = new_continuous_test
    return new_test


def test_data_with_sm_target_cate_preprocess(test, train, output_label="exceeds50K", smooth=10):
    new_test = test.copy()
    scalar = StandardScaler()
    continuous_test = test[continuous_features]
    scalar.fit(continuous_test)
    new_continuous_test = scalar.transform(continuous_test)
    category_test = test[category_features]
    for column in category_test.columns:
        ce_target = ce.TargetEncoder(smoothing=smooth)
        ce_target.fit(train[column].values, train[output_label].values)
        temp_df = ce_target.transform(test[column].values, y=None)
        new_test[column] = temp_df
    new_test[continuous_features] = new_continuous_test
    return new_test



In [20]:
# preprocess to test
test_input = preprocess_new_catogory(ori_demo, ori_test_input) 
test_input = test_data_simple_without_normalization_preprocess(test_input)
# test_input = test_data_with_sm_target_cate_preprocess(test_input, ori_demo, output_label="exceeds50K",smooth=1)
# print(test_input.columns)
# print(test_input.head())

7  columns have been detected!


In [24]:
# generate test prediction
demo_over = over_sampling(demo, "exceeds50K")
demo_under = under_sampling(demo, "exceeds50K")

clf_RF = RandomForestClassifier()
clf_RF.fit(demo_over.drop(columns="exceeds50K"), demo_over["exceeds50K"])

# clf_GBM = GradientBoostingClassifier()
# clf_GBM.fit(demo_target_over.drop(columns="exceeds50K"),demo_leave_over["exceeds50K"])

print(test_input)

test_y_pred = clf_RF.predict(test_input)
result = pd.DataFrame(data=test_y_pred, columns=["prediction"])
result["id"] = range(1, result.shape[0] + 1)
result.to_csv(r"RF_replaceQ_onehot_submission_new.csv",index=False)

# test_y_pred = clf_GBM.predict(test_input)
# result = pd.DataFrame(data=test_y_pred, columns=["prediction"])
# result["id"] = range(1, result.shape[0] + 1)
# result.to_csv("GBM_leave_over_sampling_submission.csv",index=False)

       age  fnlwgt  education-num  capital-gain  capital-loss  hours-per-week  \
0       23   32732             10             0             0              25   
1       69  165017              9          2538             0              40   
2       27   36440             13             0             0              40   
3       40  182217             10             0             0              40   
4       24   89347              7             0             0              40   
...    ...     ...            ...           ...           ...             ...   
24416   26  109186             10             0             0              50   
24417   52  254680              9             0             0              99   
24418   40  116218             10             0             0              45   
24419   29  253262             10             0             0              40   
24420   46  177536             10             0             0              60   

       workclass_ ?  workcl

In [None]:
def stacking(clfs, train_data, final_clf,output_label,test):
    num_clfs = len(clfs)
    data_size = train_data.shape[0] // num_clfs
    
#     print("train_data[output_label] before",train_data[output_label])
    temp_inputs = []
    for (clf,i) in zip(clfs, range(len(clfs))):
        cur_train_data = train_data.sample(data_size, replace=True)
        X = cur_train_data.drop(columns = output_label)
        y = cur_train_data[output_label]
        print("X,y shape",X.shape,y.shape)
        clf.fit(X,y)
        temp_input_data = clf.predict(train_data.drop(columns=output_label))
        temp_input = pd.DataFrame(data=temp_input_data, columns=["clf" + str(i)])
        temp_inputs.append(temp_input)
#         print("temp_inputs", temp_inputs)
    temp_input = pd.concat(temp_inputs,axis=1)
    temp_input.reset_index(drop=True, inplace=True)
    train_data.reset_index(drop=True, inplace=True)
#     print("temp_input",temp_input)
#     print("train_data[output_label]",train_data[output_label])
#     assert 1==0
    temp_input[output_label] = train_data[output_label]
#     print(temp_input)
#     temp_input[output_label] = train_data[output_label]
    final_clf.fit(temp_input.drop(columns=output_label), temp_input[output_label].data)
    
    temp_tests = []
    for (clf,i) in zip(clfs, range(len(clfs))):
        temp_test = pd.DataFrame(data=clf.predict(test), columns=["clf" + str(i)])
        temp_tests.append(temp_test)
    temp_test = pd.concat(temp_tests,axis=1)
    return final_clf.predict(temp_test)


# stacking 
clf_RF_1 = RandomForestClassifier()
clf_RF_2 = RandomForestClassifier()
clf_RF_3 = RandomForestClassifier()
clf_RF_4 = RandomForestClassifier()
y_pred = stacking([clf_RF_1, clf_RF_2, clf_RF_3], demo, clf_RF_4, "exceeds50K",test_input )
result = pd.DataFrame(data=test_y_pred, columns=["prediction"])
result["id"] = range(1, result.shape[0] + 1)
result.to_csv("Stacking(RF,RF,RF)RF_onehot_submission.csv",index=False)

In [None]:
# votingclassifier
ori_demo = pd.read_csv("../data/train.csv")
ori_test_input = pd.read_csv("../data/test.csv")
def basic_preprocess(data):
    for column in category_features:
        mean_value = data[column].value_counts().index[0]
#         print("mean", mean_value)
        data[column] = data[column].replace(regex=r'^.*\?.*$',value=mean_value)
#         print("column name:",column,", all values:", data[column].unique())
    return data
ori_demo  = basic_preprocess(ori_demo)
X = ori_demo.drop(columns="exceeds50K")
y = ori_demo["exceeds50K"]
test_input = basic_preprocess(ori_test_input)
test_input = preprocess_new_catogory(train=ori_demo, test = test_input)
test_input = pd.get_dummies(test_input)
# X = basic_preprocess(X)
X = pd.get_dummies(X)
print(X.shape)
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1,n_estimators=20)
clf3 = GaussianNB()
clf4 = RandomForestClassifier(random_state=1, n_estimators=21)
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3),('rf2', clf4)], voting='hard')  # 无权重投票

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=10)
eclf.fit(x_train,y_train)
y_pred = eclf.predict(x_test) 
f1 = round(f1_score(y_test, y_pred, average='weighted') * 100, 2)
acc = round(accuracy_score(y_test, y_pred) * 100, 2)
print("f1",f1,"  acc",acc)
eclf.fit(X,y)
test_y_pred = eclf.predict(test_input)
result = pd.DataFrame(data=test_y_pred, columns=["prediction"])
result["id"] = range(1, result.shape[0] + 1)
result.to_csv(r"RF_replaceQ_onehot_votingClf_submission.csv",index=False)
# 配合网格搜索
from sklearn.model_selection import GridSearchCV
params = {'lr__C': [1.0, 100.0], 'rf__n_estimators': [5, 200],}  # 搜索寻找最优的lr模型中的C参数和rf模型中的n_estimators
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
grid = grid.fit(X, y)
print('最优参数：',grid.best_params_)
