In [9]:
import pandas as pd
import numpy as np


#Load data
data_training=pd.read_csv('census-income-training.csv',header=0)

#Clean data
#replace invalid or missing entries(delete individual row)
data_training_clean=data_training.replace(regex=[r'\?'],value=np.nan)
data_training=data_training_clean.dropna(how='any')


# Split the data into features and target label
income_raw = data_training['income_morethan_50K']
features_raw = data_training.drop(['income_morethan_50K','Id','GRINREG','MIGMTR1','MIGMTR3','MIGSAME','PARENT','PEFNTVTY','PEMNTVTY','PENATVTY'],axis=1)


# Visualize skewed continuous features of original data
#vs.distribution(data)

# Log-transform the skewed features
skewed = ['AHRSPAY','CAPGAIN', 'CAPLOSS','DIVVAL']
features_log_transformed = pd.DataFrame(features_raw)
features_log_transformed[skewed] = features_raw[skewed].apply(lambda x: np.log(x + 1))

#Normalizing Numerical Features
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler() # default=(0, 1)
numerical = ['AAGE', 'ADTOCC', 'AHRSPAY', 'CAPGAIN','CAPLOSS', 'DIVVAL','HHDREL','NOEMP']
features_log_minmax_transform = pd.DataFrame(features_log_transformed)
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])

# Data Preprocessing: One-hot encode the 'features_log_minmax_transform' data using pandas.get_dummies()
features_final = pd.get_dummies(features_log_minmax_transform)
income_final= income_raw

# Print the number of features after one-hot encoding
#encoded = list(features_final.columns)
#print ' total features after one-hot encoding.'
#display (encoded)

#Split the Data into Training and Test Sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features_final,income_final,test_size = 0.2,random_state = 0)

#Light GTB
# from sklearn.datasets import make_hastie_10_2
# from sklearn.ensemble import GradientBoostingClassifier

import lightgbm as lgb
lgb_train = lgb.Dataset(x_train, y_train) # 将数据保存到LightGBM二进制文件将使加载更快
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)  # 创建验证数据

params = {
    'task': 'train',
    'boosting_type': 'gbdt',  # 设置提升类型
    'objective': 'regression', # 目标函数
    'metric': {'l2', 'auc'},  # 评估函数
    'num_leaves': 31,   # 叶子节点数
    'learning_rate': 0.05,  # 学习速率
    'feature_fraction': 0.9, # 建树的特征选择比例
    'bagging_fraction': 0.8, # 建树的样本采样比例
    'bagging_freq': 5,  # k 意味着每 k 次迭代执行bagging
    'verbose': 1 # <0 显示致命的, =0 显示错误 (警告), >0 显示信息
}

gbm = lgb.train(params,lgb_train,num_boost_round=20,valid_sets=lgb_eval,early_stopping_rounds=5)


gbm_y_predict=gbm.predict(x_test)




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1485
[LightGBM] [Info] Number of data points in the train set: 53856, number of used features: 154
[LightGBM] [Info] Start training from score 0.080288
[1]	valid_0's auc: 0.86488	valid_0's l2: 0.0687084
Training until validation scores don't improve for 5 rounds
[2]	valid_0's auc: 0.89255	valid_0's l2: 0.0666291
[3]	valid_0's auc: 0.892983	valid_0's l2: 0.0646955
[4]	valid_0's auc: 0.893603	valid_0's l2: 0.0629607
[5]	valid_0's auc: 0.894677	valid_0's l2: 0.0616121
[6]	valid_0's auc: 0.904556	valid_0's l2: 0.060149
[7]	valid_0's auc: 0.904509	valid_0's l2: 0.0590332
[8]	valid_0's auc: 0.906808	valid_0's l2: 0.0578026
[9]	valid_0's auc: 0.908183	valid_0's l2: 0.056635
[10]	valid_0's auc: 0.910775	valid_0's l2: 0.0555902
[11]	valid_0's auc: 0.910126	valid_0's l2: 0.0546648
[12]	valid_0's auc: 0.912847	valid_0's l2: 0.053816
[13]	valid_0's 