In [65]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV  # Perforing grid search
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

train_data = pd.read_csv('drive/MyDrive/Colab Notebooks/train_final.csv')   # 读取训练数据
test_data = pd.read_csv('drive/MyDrive/Colab Notebooks/test_final.csv') 


In [52]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [66]:
#通过粗调之后使用Gridsearch精调
parameters = {
              'max_depth': [5],
              'num_leaves':[31,32],
              'learning_rate': [0.1],
              'feature_fraction': [0.6],
              'bagging_fraction': [0.9],
              'n_estimators':[150,200],
              'lambda_l1':[0.6],
              'lambda_l2':[0],
              'bagging_freq':[2,4]           
}
gbm = lgb.LGBMClassifier(boosting_type='dart',
                         objective = 'binary',
                         metric = 'auc',
                         device_tpye='gpu',
                         n_jobs= -1)
gsearch = GridSearchCV(gbm, param_grid=parameters, scoring='accuracy', cv=5,verbose=5)
                                  

In [67]:
#原始数据划分x,y
def origin_data(data):
  y = data.pop('loan_status').values   # 用pop方式将训练数据中的标签值y取出来，作为训练目标，这里的‘30’是标签的列名
  col = data.columns
  x = data[col].values  # 剩下的列作为训练数据
  return [x,y]

In [68]:
#原始数据跑模型
def run_model_with_origin_data():
  original_data=origin_data(train_data)
  x=original_data[0]
  y=original_data[1]

  gsearch.fit(x,y)

  print("Best score: %0.6f" % gsearch.best_score_)
  print("Best parameters set:")
  best_parameters = gsearch.best_estimator_.get_params()
  for param_name in sorted(parameters.keys()):
      print("\t%s: %r" % (param_name, best_parameters[param_name]))
  best_model=gsearch.best_estimator_ 
  print(best_model)
  original_data_test=origin_data(test_data)
  x_test=original_data_test[0]
  y_test=original_data_test[1]
  predict_y=best_model.predict(x_test) 
  accuracy=accuracy_score(y_test, predict_y)
  print('accuracy:%.6f'% accuracy)
  return accuracy

In [69]:
#原始数据模型准确率
accuracy=run_model_with_origin_data()

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31, score=0.925, total=   1.7s
[CV] bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.8s remaining:    0.0s


[CV]  bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31, score=0.920, total=   1.8s
[CV] bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.5s remaining:    0.0s


[CV]  bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31, score=0.917, total=   1.7s
[CV] bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.3s remaining:    0.0s


[CV]  bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31, score=0.917, total=   1.8s
[CV] bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.1s remaining:    0.0s


[CV]  bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31, score=0.917, total=   1.8s
[CV] bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=32 
[CV]  bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=32, score=0.925, total=   1.9s
[CV] bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=32 
[CV]  bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=32, score=0.919, total=   1.9s
[CV] bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  1.5min finished


Best score: 0.920080
Best parameters set:
	bagging_fraction: 0.9
	bagging_freq: 4
	feature_fraction: 0.6
	lambda_l1: 0.6
	lambda_l2: 0
	learning_rate: 0.1
	max_depth: 5
	n_estimators: 200
	num_leaves: 31
LGBMClassifier(bagging_fraction=0.9, bagging_freq=4, boosting_type='dart',
               class_weight=None, colsample_bytree=1.0, device_tpye='gpu',
               feature_fraction=0.6, importance_type='split', lambda_l1=0.6,
               lambda_l2=0, learning_rate=0.1, max_depth=5, metric='auc',
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=200, n_jobs=-1, num_leaves=31, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
accuracy:0.917780


In [70]:
#再次读取数据
train_data = pd.read_csv('drive/MyDrive/Colab Notebooks/train_final.csv')  
test_data = pd.read_csv('drive/MyDrive/Colab Notebooks/test_final.csv') 

In [71]:
#构造衍生变量continuous_installment_income_ratio(ciir):
def add_ciir(data):

  monthly_income=data.continuous_annual_inc/12
  continuous_installment_income_ratio=data.continuous_installment/monthly_income
  continuous_installment_income_ratio=round(continuous_installment_income_ratio,2)
  try:
    data.insert(146,'continuous_installment_income_ratio',continuous_installment_income_ratio)#插入新增变量2数据
  except:
    print("already exists")
  data['continuous_installment_income_ratio']=data['continuous_installment_income_ratio'].astype('float64')

  y = data.pop('loan_status').values   # 用pop方式将训练数据中的标签值y取出来，作为训练目标，这里的‘30’是标签的列名
  col = data.columns
  x = data[col].values  # 剩下的列作为训练数据
  return [x,y]

In [72]:
#使用增加这个衍生变量后的数据跑模型
def run_model_with_added_variable():


  added_ciir_data=add_ciir(train_data)
  x=added_ciir_data[0]
  y=added_ciir_data[1]

  gsearch.fit(x,y)

  print("Best score: %0.6f" % gsearch.best_score_)
  print("Best parameters set:")
  best_parameters = gsearch.best_estimator_.get_params()
  for param_name in sorted(parameters.keys()):
      print("\t%s: %r" % (param_name, best_parameters[param_name]))
  best_model=gsearch.best_estimator_ 
  print(best_model)
  added_ciir_data_test=add_ciir(test_data)
  x_test=added_ciir_data_test[0]
  y_test=added_ciir_data_test[1]
  predict_y=best_model.predict(x_test) 
  accuracy_added=accuracy_score(y_test, predict_y)
  print('accuracy:%.6f'% accuracy_added)
  return accuracy_added

In [73]:
#增加衍生变量后的数据模型准确率
accuracy_added=run_model_with_added_variable()
#增加衍生变量后的数据模型准确率提升值
print(accuracy_added-accuracy)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31, score=0.925, total=   1.9s
[CV] bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s remaining:    0.0s


[CV]  bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31, score=0.919, total=   1.8s
[CV] bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.7s remaining:    0.0s


[CV]  bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31, score=0.916, total=   1.8s
[CV] bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.5s remaining:    0.0s


[CV]  bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31, score=0.917, total=   1.8s
[CV] bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    7.3s remaining:    0.0s


[CV]  bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=31, score=0.917, total=   1.8s
[CV] bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=32 
[CV]  bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=32, score=0.927, total=   1.8s
[CV] bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=32 
[CV]  bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n_estimators=150, num_leaves=32, score=0.920, total=   1.8s
[CV] bagging_fraction=0.9, bagging_freq=2, feature_fraction=0.6, lambda_l1=0.6, lambda_l2=0, learning_rate=0.1, max_depth=5, n

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  1.5min finished


Best score: 0.919700
Best parameters set:
	bagging_fraction: 0.9
	bagging_freq: 2
	feature_fraction: 0.6
	lambda_l1: 0.6
	lambda_l2: 0
	learning_rate: 0.1
	max_depth: 5
	n_estimators: 150
	num_leaves: 32
LGBMClassifier(bagging_fraction=0.9, bagging_freq=2, boosting_type='dart',
               class_weight=None, colsample_bytree=1.0, device_tpye='gpu',
               feature_fraction=0.6, importance_type='split', lambda_l1=0.6,
               lambda_l2=0, learning_rate=0.1, max_depth=5, metric='auc',
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=150, n_jobs=-1, num_leaves=32, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
accuracy:0.918140
0.00035999999999991594
