In [1]:
import os
import time
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from lightgbm import LGBMClassifier, LGBMRegressor

from Dataset_Construction import Balance_Ratio 
from Sampling import label_divide
from AdaClassifier import train_set, multiple_set, print_badC, bad_plot, line_chart, cf_matrix
from AdaRegressor import AUC, PR_curve, multiple_curve, PR_matrix, best_threshold 
from Aging_Score import score1

# Working directory that holds the `original_data/` CSV files read further down.
# The path used to be hard-coded to one user's machine; it can now be overridden
# through the DATA_AFTER_MID_DIR environment variable, and falls back to the
# original location when that variable is not set.
DATA_DIR = os.environ.get('DATA_AFTER_MID_DIR', 'C:/Users/Darui Yen/OneDrive/桌面/data_after_mid')
os.chdir(DATA_DIR)
os.getcwd()

'C:\\Users\\Darui Yen\\OneDrive\\桌面\\data_after_mid'

### Boosting model

In [3]:
def LightGBMC(train_x, test_x, train_y, test_y):
    """Fit a default LGBMClassifier and predict the test features.

    Parameters
    ----------
    train_x, train_y : training features and labels passed to ``fit``.
    test_x : features passed to ``predict``.
    test_y : ground-truth values stored next to the predictions.

    Returns
    -------
    pandas.DataFrame with columns ``'truth'`` (``test_y``) and ``'predict'``
    (the output of ``predict`` on ``test_x``).
    """
    model = LGBMClassifier().fit(train_x, train_y)
    predictions = model.predict(test_x)

    return pd.DataFrame({'truth': test_y, 'predict': predictions})


def LightGBMR(train_x, test_x, train_y, test_y):
    """Fit a default LGBMRegressor and predict the test features.

    Parameters
    ----------
    train_x, train_y : training features and targets passed to ``fit``.
    test_x : features passed to ``predict``.
    test_y : ground-truth values stored next to the predictions.

    Returns
    -------
    pandas.DataFrame with columns ``'truth'`` (``test_y``) and ``'predict'``
    (the output of ``predict`` on ``test_x``).
    """
    model = LGBMRegressor().fit(train_x, train_y)
    scores = model.predict(test_x)

    return pd.DataFrame({'truth': test_y, 'predict': scores})

### Run all datasets

In [4]:
def runall_LightGBMC(num_set, trainset_x, test_x, trainset_y, test_y, record_bad = True):
    """Run ``LightGBMC`` on every training set and collect the results.

    Parameters
    ----------
    num_set : int
        Number of training sets; keys ``'set0'`` .. ``f'set{num_set - 1}'``
        are looked up in ``trainset_x`` and ``trainset_y``.
    trainset_x, trainset_y : mapping
        Training features / labels, keyed by ``f'set{i}'``.
    test_x, test_y :
        Test features / labels shared by all runs.
    record_bad : bool, default True
        When true, ``print_badC`` is also called for each run and its tables
        are collected.

    Returns
    -------
    ``table_set`` when ``record_bad`` is false, otherwise
    ``(table_set, bad_set)``. Rows whose index label is ``0`` in the
    per-run tables are relabelled ``f'dataset {i}'``.

    Notes
    -----
    Reads the module-level ``Bad_Types`` mapping, which must be defined before
    this function is called with ``record_bad = True``.
    """
    # Collect the per-dataset pieces and concatenate once at the end instead of
    # growing a DataFrame inside the loop (repeated copying, and concatenating
    # onto an empty frame is deprecated in recent pandas).
    tables = []
    bad_tables = []

    for i in range(num_set):
        print('\n', f'Dataset {i}:')
        row_label = {0: f'dataset {i}'}

        result = LightGBMC(trainset_x[f'set{i}'], test_x, trainset_y[f'set{i}'], test_y)
        table = cf_matrix(result, trainset_y[f'set{i}'])
        tables.append(table.rename(index = row_label))

        if record_bad:
            bad_table = print_badC(result, test_x, Bad_Types)
            bad_tables.append(bad_table.rename(index = row_label))

    # Keep the original empty-frame result when there was nothing to run.
    table_set = pd.concat(tables) if tables else pd.DataFrame()
    bad_set = pd.concat(bad_tables) if bad_tables else pd.DataFrame()

    if record_bad:
        return table_set, bad_set
    else:
        return table_set
    
    
def runall_LightGBMR(num_set, trainset_x, test_x, trainset_y, test_y, thres_target = 'Recall', threshold = 0.8, 
                     record_bad = True):
    """Run ``LightGBMR`` on every training set and pick a threshold per set.

    Parameters
    ----------
    num_set : int
        Number of training sets; keys ``'set0'`` .. ``f'set{num_set - 1}'``
        are looked up in ``trainset_x`` and ``trainset_y``.
    trainset_x, trainset_y : mapping
        Training features / targets, keyed by ``f'set{i}'``.
    test_x, test_y :
        Test features / targets shared by all runs.
    thres_target, threshold :
        Forwarded to ``best_threshold`` as ``target`` and ``threshold``.
    record_bad : bool, default True
        When true, ``print_badC`` is also called for each run (with the
        threshold returned by ``best_threshold``) and its tables are collected.

    Returns
    -------
    ``(pr_dict, table_set)`` when ``record_bad`` is false, otherwise
    ``(pr_dict, table_set, bad_set)``. ``pr_dict`` maps ``f'set{i}'`` to the
    output of ``PR_matrix`` for that run.

    Notes
    -----
    Reads the module-level ``Bad_Types`` mapping, which must be defined before
    this function is called with ``record_bad = True``.
    """
    # Collect the per-dataset pieces and concatenate once at the end instead of
    # growing a DataFrame inside the loop (repeated copying, and concatenating
    # onto an empty frame is deprecated in recent pandas).
    tables = []
    bad_tables = []
    pr_dict = {}

    for i in range(num_set):
        print('\n', f'Dataset {i}:')

        predict = LightGBMR(trainset_x[f'set{i}'], test_x, trainset_y[f'set{i}'], test_y)
        pr_matrix = PR_matrix(predict, trainset_y[f'set{i}'])
        pr_dict[f'set{i}'] = pr_matrix

        best_data, best_thres = best_threshold(pr_matrix, target = thres_target, threshold = threshold)
        # The selected row keeps its original index label; relabel it with the
        # dataset name so the summary table has one named row per dataset.
        tables.append(best_data.rename(index = {best_data.index.values[0]: f'dataset {i}'}))

        if record_bad:
            bad_table = print_badC(predict, test_x, Bad_Types, threshold = best_thres)
            bad_tables.append(bad_table.rename(index = {0: f'dataset {i}'}))

    # Keep the original empty-frame result when there was nothing to run.
    table_set = pd.concat(tables) if tables else pd.DataFrame()
    bad_set = pd.concat(bad_tables) if bad_tables else pd.DataFrame()

    if record_bad:
        return pr_dict, table_set, bad_set
    else:
        return pr_dict, table_set

## Data Processing

In [2]:
### Bad types ###
# Read the bad-type table (dropping its first column) and map each value of
# its `cb` column to its row number. `Bad_Types` is read as a global by the
# runall_* functions.
bad = pd.read_csv('original_data/Bad_Types.csv').iloc[:, 1:]
Bad_Types = {bad.cb[i]:i for i in range (len(bad))}
print('Total bad types:', len(bad))

### Single dataset ###
# Read one test set and one training set, dropping the first two columns of each.
test = pd.read_csv('original_data/TestingSet_0.csv').iloc[:, 2:]
train = pd.read_csv('original_data/TrainingSet_new.csv').iloc[:, 2:]
print('\ntraining data:', train.shape, '\nBalance Ratio:', Balance_Ratio(train))
print('\ntesting data:', test.shape, '\nBalance Ratio:', Balance_Ratio(test))

# Split both frames into features and the 'GB' label column.
train_x, train_y, test_x, test_y = label_divide(train, test, 'GB')

### Multiple datasets ###
# Load the 9 training sets and split each into features / 'GB' label.
data_dict = multiple_set(num_set = 9)
trainset_x, trainset_y = train_set(data_dict, num_set = 9, label = 'GB')
# NOTE(review): this overwrites the test_x / test_y assigned a few lines above,
# again derived from the same `test` frame.
# NOTE(review): the recorded output shows `test` with 83 columns while the
# multiple datasets have 690, and the classifier run fails with a feature-count
# mismatch (689 vs 82). Presumably a test set prepared the same way as the
# multiple datasets should be loaded here - confirm which file.
test_x, test_y = label_divide(test, None, 'GB', train_only = True)

Total bad types: 62

training data: (77138, 83) 
Balance Ratio: 18.17902

testing data: (55903, 83) 
Balance Ratio: 3104.72222
Dimension of dataset 0 : (80395, 690)  balance ratio: 902.31461
Dimension of dataset 1 : (120449, 690)  balance ratio: 2.0005
Dimension of dataset 2 : (120503, 690)  balance ratio: 1.99781
Dimension of dataset 3 : (120503, 690)  balance ratio: 1.99781
Dimension of dataset 4 : (120459, 690)  balance ratio: 2.0
Dimension of dataset 5 : (654, 690)  balance ratio: 2.12919
Dimension of dataset 6 : (712, 690)  balance ratio: 1.66667
Dimension of dataset 7 : (712, 690)  balance ratio: 1.66667
Dimension of dataset 8 : (667, 690)  balance ratio: 2.0045

 9 datasets are loaded.

Labels of  9 datasets are divided.


In [5]:
# Run the LightGBM classifier on all 9 training sets, pass the results to
# line_chart and bad_plot, and report the elapsed wall-clock time.
start = time.time()

table_set1, bad_set1 = runall_LightGBMC(9, trainset_x, test_x, trainset_y, test_y)
# Title corrected: this notebook runs LightGBM, not CatBoost (copy-paste leftover).
line_chart(table_set1, title = 'LightGBM')
bad_plot(bad_set1)

end = time.time()
print("\nRun Time：%f seconds" % (end - start))


 Dataset 0:


ValueError: Number of features of the model must match the input. Model n_features_ is 689 and input n_features is 82 

In [None]:
# Display the table returned by runall_LightGBMC; only defined if the
# classifier run cell completed without error.
table_set1

In [None]:
# Regressor variant: run LightGBMR on all 9 training sets, letting
# best_threshold choose the cut-off per set for target 'Recall' at 0.8,
# then pass the results to line_chart and bad_plot and report elapsed time.
start = time.time()

pr_dict, table_set2, bad_set2 = runall_LightGBMR(
    num_set = 9,
    trainset_x = trainset_x,
    test_x = test_x,
    trainset_y = trainset_y,
    test_y = test_y,
    thres_target = 'Recall',
    threshold = 0.8,
)
line_chart(table_set2, title = 'LightGBM')
bad_plot(bad_set2)

end = time.time()
print("\nRun Time：%f seconds" % (end - start))

In [None]:
# Call multiple_curve once per target metric on the regressor results,
# then display the regressor summary table as the cell output.
for curve_target in ('Aging Rate', 'Precision'):
    multiple_curve(3, 3, pr_dict, table_set2, target = curve_target)
table_set2