In [1]:
from tqdm import tqdm
import resampy
import os
import sklearn.preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, Dense, MaxPool2D, Dropout
from tensorflow.keras.utils import to_categorical 
import tensorflow as tf
import tensorflow.keras
import lightgbm as lgb



## 读取数据

In [2]:
# Load the feature matrix (X) and the label vector (Y) stored as NumPy files.
X, Y = map(np.load, ("X.npy", "Y.npy"))

In [3]:
# Hold out 20% of the samples as the final test set. The split is stratified
# on the labels; random_state is 2024 for every split in this notebook.
X_train_valid, X_test, Y_train_valid, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2024,
)

In [4]:
# Split the remaining 80% again: 25% of it becomes the validation set, which
# yields a 60/20/20 train/validation/test partition, still stratified by label.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X_train_valid,
    Y_train_valid,
    test_size=0.25,
    stratify=Y_train_valid,
    random_state=2024,
)

In [5]:
# Sanity check: sizes of the training features and labels.
np.shape(X_train), np.shape(Y_train)

((6684, 128), (6684,))

In [6]:
# Sanity check: sizes of the validation features and labels.
np.shape(X_valid), np.shape(Y_valid)

((2228, 128), (2228,))

In [7]:
# Sanity check: sizes of the test features and labels.
np.shape(X_test), np.shape(Y_test)

((2228, 128), (2228,))

## LightGBM

In [8]:
# LightGBM multiclass classifier trained with stratified K-fold cross-validation.
# Returns the fold-averaged class probabilities for test_x and the list of
# accuracies each fold's model reached on its own held-out fold.
def lgb_model(train_x, train_y, test_x, num_class=None, n_splits=10, seed=2023,
              num_threads=24, best_model_path="lgb_best_config.txt"):
    """Train one LightGBM model per CV fold and average their predictions.

    Parameters
    ----------
    train_x : array-like
        Training features; converted with ``np.asarray`` and indexed by the
        fold index arrays.
    train_y : array-like
        Integer class labels for ``train_x``. Assumed to be encoded as
        ``0 .. num_class - 1`` (required by the ``multiclass`` objective).
    test_x : numpy.ndarray
        Samples to predict; only ``test_x.shape[0]`` and ``model.predict``
        are applied to it.
    num_class : int, optional
        Number of classes. Defaults to ``max(train_y) + 1``.
    n_splits : int, default 10
        Number of stratified folds.
    seed : int, default 2023
        Seed used both for the fold shuffling and for LightGBM itself.
    num_threads : int, default 24
        Number of threads LightGBM may use.
    best_model_path : str or None, default "lgb_best_config.txt"
        Where the model of the best-scoring fold *of this call* is saved.
        Pass ``None`` to skip saving.

    Returns
    -------
    predictions : numpy.ndarray of shape (len(test_x), num_class)
        Mean of the per-fold class-probability predictions for ``test_x``.
    cv_scores : list of float
        Accuracy of each fold's model on that fold's held-out part of the
        training data (not on ``test_x``).
    """
    train_x = np.asarray(train_x)
    train_y = np.asarray(train_y)
    if num_class is None:
        num_class = int(train_y.max()) + 1

    # Model hyperparameters; identical for every fold, so built once.
    # 'silent' is intentionally absent: it is a Dataset constructor argument
    # and only produced a warning here; 'verbose': -1 already silences logs.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'metric': 'multi_error',
        'min_child_weight': 5,
        'num_leaves': 2 ** 4,
        'lambda_l2': 13,
        'feature_fraction': 0.6,
        'bagging_fraction': 0.7,
        'bagging_freq': 2,
        'learning_rate': 0.1,
        'seed': seed,
        'num_threads': num_threads,
        'verbose': -1,
        'num_class': num_class,
    }

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    predictions = np.zeros((test_x.shape[0], num_class))
    cv_scores = []
    best_score = float("-inf")

    for train_index, valid_index in kf.split(train_x, train_y):
        train_sub_x, train_sub_y = train_x[train_index], train_y[train_index]
        valid_sub_x, valid_sub_y = train_x[valid_index], train_y[valid_index]
        train_matrix = lgb.Dataset(train_sub_x, label=train_sub_y)
        valid_matrix = lgb.Dataset(valid_sub_x, label=valid_sub_y)

        # Early stopping (200 rounds) and logging (every 200 rounds).
        # Newer LightGBM releases expose these as callbacks and no longer
        # accept the keyword arguments; older releases only have the keywords.
        if hasattr(lgb, "log_evaluation"):
            stop_and_log = {
                "callbacks": [
                    lgb.early_stopping(stopping_rounds=200),
                    lgb.log_evaluation(period=200),
                ]
            }
        else:
            stop_and_log = {"verbose_eval": 200, "early_stopping_rounds": 200}

        model = lgb.train(
            dict(params),
            train_matrix,
            50000,
            valid_sets=[train_matrix, valid_matrix],
            **stop_and_log
        )

        valid_pred = model.predict(valid_sub_x, num_iteration=model.best_iteration)
        test_pred = model.predict(test_x, num_iteration=model.best_iteration)
        # Average the predicted probabilities over all folds.
        predictions += test_pred / kf.n_splits

        valid_score = accuracy_score(valid_sub_y, np.argmax(valid_pred, axis=1))
        cv_scores.append(valid_score)

        # Keep the model of the best fold seen so far in this call. Comparing
        # against the running best (rather than the previous fold) ensures the
        # file really holds the best model and replaces one from an earlier call.
        if best_model_path is not None and valid_score > best_score:
            model.save_model(best_model_path)
        best_score = max(best_score, valid_score)

    return predictions, cv_scores

In [9]:
# 可视化函数 可以选择使用或不使用
from pyecharts.charts import Line
import pyecharts.options as opts
def show(scores):
    """Render the given accuracies as a pyecharts line chart in ``LGB.html``.

    Each entry of ``scores`` becomes one point, labelled by its 1-based
    position on the x axis and formatted to four decimals on the y axis.
    A mark line shows the average, and the y axis starts at the data minimum.
    Nothing is returned; the chart is written to ``LGB.html``.
    """
    fold_labels = []
    for fold_number in range(1, len(scores) + 1):
        fold_labels.append("第{}次".format(fold_number))
    accuracy_labels = ['{:.4f}'.format(value) for value in scores]

    average_line = opts.MarkLineOpts(data=[opts.MarkLineItem(type_='average')])

    chart = Line()
    chart.add_xaxis(fold_labels)
    chart.add_yaxis("准确度", accuracy_labels, markline_opts=average_line)
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="LGB模型在训练集上10折验证的表现"),
        yaxis_opts=opts.AxisOpts(min_='dataMin'),
    )
    chart.render("LGB.html")

In [10]:
%%time

# Tuning run: cross-validate on the training split and predict class
# probabilities for the validation split.
Y_pred, scores = lgb_model(train_x=X_train, train_y=Y_train, test_x=X_valid)

Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's multi_error: 0.00315877	valid_1's multi_error: 0.258595
[400]	training's multi_error: 0	valid_1's multi_error: 0.248132
Early stopping, best iteration is:
[261]	training's multi_error: 0.00133001	valid_1's multi_error: 0.245142


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's multi_error: 0.00548628	valid_1's multi_error: 0.295964
[400]	training's multi_error: 0.000997506	valid_1's multi_error: 0.278027
Early stopping, best iteration is:
[310]	training's multi_error: 0.00149626	valid_1's multi_error: 0.275037


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's multi_error: 0.00382377	valid_1's multi_error: 0.2571
[400]	training's multi_error: 0.000665004	valid_1's multi_error: 0.249626
[600]	training's multi_error: 0	valid_1's multi_error: 0.251121
Early stopping, best iteration is:
[514]	training's multi_error: 0.000166251	valid_1's multi_error: 0.239163


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's multi_error: 0.00465503	valid_1's multi_error: 0.261584
[400]	training's multi_error: 0.000498753	valid_1's multi_error: 0.264574
Early stopping, best iteration is:
[300]	training's multi_error: 0.000997506	valid_1's multi_error: 0.251121


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's multi_error: 0.00415559	valid_1's multi_error: 0.281437
[400]	training's multi_error: 0.000332447	valid_1's multi_error: 0.27994
Early stopping, best iteration is:
[342]	training's multi_error: 0.000831117	valid_1's multi_error: 0.272455


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's multi_error: 0.00415559	valid_1's multi_error: 0.278443
[400]	training's multi_error: 0.00049867	valid_1's multi_error: 0.261976
Early stopping, best iteration is:
[299]	training's multi_error: 0.00116356	valid_1's multi_error: 0.252994


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's multi_error: 0.00365691	valid_1's multi_error: 0.285928
[400]	training's multi_error: 0	valid_1's multi_error: 0.267964
[600]	training's multi_error: 0	valid_1's multi_error: 0.263473
Early stopping, best iteration is:
[482]	training's multi_error: 0	valid_1's multi_error: 0.258982


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's multi_error: 0.00432181	valid_1's multi_error: 0.252994
Early stopping, best iteration is:
[146]	training's multi_error: 0.014129	valid_1's multi_error: 0.245509


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's multi_error: 0.00465426	valid_1's multi_error: 0.287425
[400]	training's multi_error: 0.000166223	valid_1's multi_error: 0.276946
Early stopping, best iteration is:
[280]	training's multi_error: 0.00166223	valid_1's multi_error: 0.267964


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's multi_error: 0.00398936	valid_1's multi_error: 0.281437
[400]	training's multi_error: 0.000332447	valid_1's multi_error: 0.261976
Early stopping, best iteration is:
[379]	training's multi_error: 0.000831117	valid_1's multi_error: 0.257485
CPU times: total: 27min 51s
Wall time: 1min 55s


In [11]:
# Accuracy on the validation split; this figure is used to tune hyperparameters.
pred = Y_pred.argmax(axis=1)
print(accuracy_score(Y_valid, pred))

0.7248653500897666


In [12]:
%%time

# Final run: cross-validate on train + validation combined and predict class
# probabilities for the test split.
lgb_pred, scores_2 = lgb_model(
    train_x=X_train_valid,
    train_y=Y_train_valid,
    test_x=X_test,
)

Training until validation scores don't improve for 200 rounds


Please use silent argument of the Dataset constructor to pass this parameter.


[200]	training's multi_error: 0.00361596	valid_1's multi_error: 0.253363
[400]	training's multi_error: 0.00074813	valid_1's multi_error: 0.238789
[600]	training's multi_error: 0.000374065	valid_1's multi_error: 0.234305
[800]	training's multi_error: 0.000124688	valid_1's multi_error: 0.230942
[1000]	training's multi_error: 0.000249377	valid_1's multi_error: 0.229821
Early stopping, best iteration is:
[853]	training's multi_error: 0.000249377	valid_1's multi_error: 0.226457


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's multi_error: 0.00411471	valid_1's multi_error: 0.238789
[400]	training's multi_error: 0.000872818	valid_1's multi_error: 0.2287
[600]	training's multi_error: 0.000623441	valid_1's multi_error: 0.224215
Early stopping, best iteration is:
[588]	training's multi_error: 0.000623441	valid_1's multi_error: 0.223094


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's multi_error: 0.00349084	valid_1's multi_error: 0.267116
[400]	training's multi_error: 0.000623364	valid_1's multi_error: 0.236813
[600]	training's multi_error: 0.000498691	valid_1's multi_error: 0.24018
Early stopping, best iteration is:
[529]	training's multi_error: 0.000498691	valid_1's multi_error: 0.232323


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's multi_error: 0.00336616	valid_1's multi_error: 0.264871
[400]	training's multi_error: 0.000997382	valid_1's multi_error: 0.250281
[600]	training's multi_error: 0.000374018	valid_1's multi_error: 0.244669
[800]	training's multi_error: 0.000249345	valid_1's multi_error: 0.241302
Early stopping, best iteration is:
[688]	training's multi_error: 0.000249345	valid_1's multi_error: 0.24018


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's multi_error: 0.00361551	valid_1's multi_error: 0.244669
[400]	training's multi_error: 0.000872709	valid_1's multi_error: 0.239057
[600]	training's multi_error: 0.000249345	valid_1's multi_error: 0.234568
Early stopping, best iteration is:
[447]	training's multi_error: 0.000872709	valid_1's multi_error: 0.230079


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's multi_error: 0.00349084	valid_1's multi_error: 0.242424
[400]	training's multi_error: 0.000872709	valid_1's multi_error: 0.224467
[600]	training's multi_error: 0.000374018	valid_1's multi_error: 0.225589
Early stopping, best iteration is:
[451]	training's multi_error: 0.000623364	valid_1's multi_error: 0.216611


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's multi_error: 0.00349084	valid_1's multi_error: 0.250281
[400]	training's multi_error: 0.000997382	valid_1's multi_error: 0.231201
[600]	training's multi_error: 0.000498691	valid_1's multi_error: 0.231201
Early stopping, best iteration is:
[425]	training's multi_error: 0.000872709	valid_1's multi_error: 0.225589


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's multi_error: 0.0027428	valid_1's multi_error: 0.253648
[400]	training's multi_error: 0.000623364	valid_1's multi_error: 0.246914
[600]	training's multi_error: 0.000374018	valid_1's multi_error: 0.244669
Early stopping, best iteration is:
[449]	training's multi_error: 0.000498691	valid_1's multi_error: 0.242424


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's multi_error: 0.0041142	valid_1's multi_error: 0.24018
[400]	training's multi_error: 0.000748036	valid_1's multi_error: 0.230079
[600]	training's multi_error: 0.000374018	valid_1's multi_error: 0.224467
[800]	training's multi_error: 0.000249345	valid_1's multi_error: 0.223345
Early stopping, best iteration is:
[660]	training's multi_error: 0.000249345	valid_1's multi_error: 0.2211


Please use silent argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[200]	training's multi_error: 0.00336616	valid_1's multi_error: 0.255892
[400]	training's multi_error: 0.000748036	valid_1's multi_error: 0.243547
[600]	training's multi_error: 0.000249345	valid_1's multi_error: 0.24018
[800]	training's multi_error: 0.000249345	valid_1's multi_error: 0.231201
Early stopping, best iteration is:
[760]	training's multi_error: 0.000249345	valid_1's multi_error: 0.231201
CPU times: total: 45min 17s
Wall time: 3min 4s


In [13]:
# The training and validation sets were merged into the new training set;
# the accuracy on the test set is reported for the final model and predictions.
lgb_score = accuracy_score(Y_test, lgb_pred.argmax(axis=1))
print(lgb_score)

0.7921903052064632


In [14]:
# Plot the per-fold accuracies of the final LightGBM run (written to LGB.html).
show(scores=scores_2)

In [15]:
# Per-fold held-out accuracies returned by the final lgb_model run.
scores_2

[0.773542600896861,
 0.7769058295964125,
 0.7676767676767676,
 0.7598204264870931,
 0.7699214365881033,
 0.7833894500561167,
 0.7744107744107744,
 0.7575757575757576,
 0.7789001122334456,
 0.7687991021324355]

In [16]:
# Classification report for the LightGBM test predictions.
# NOTE(review): assumes class index i corresponds to labels[i] — confirm
# against the label encoding used to produce Y.npy.
labels = [
    'aloe', 'burger', 'cabbage', 'candied_fruits', 'carrots',
    'chips', 'chocolate', 'drinks', 'fries', 'grapes',
    'gummies', 'ice-cream', 'jelly', 'noodles', 'pickles',
    'pizza', 'ribs', 'salmon', 'soup', 'wings',
]
print(
    classification_report(
        Y_test,
        lgb_pred.argmax(axis=1),
        target_names=labels,
        digits=4,
    )
)

                precision    recall  f1-score   support

          aloe     0.7064    0.7064    0.7064       109
        burger     0.7350    0.7227    0.7288       119
       cabbage     0.8269    0.8600    0.8431       100
candied_fruits     0.8261    0.9441    0.8812       161
       carrots     0.8525    0.7879    0.8189       132
         chips     0.8408    0.9167    0.8771       144
     chocolate     0.7045    0.5345    0.6078        58
        drinks     0.8868    0.8103    0.8468        58
         fries     0.8333    0.8140    0.8235       129
        grapes     0.7857    0.8534    0.8182       116
       gummies     0.7378    0.8897    0.8067       136
     ice-cream     0.8456    0.8690    0.8571       145
         jelly     0.6623    0.5795    0.6182        88
       noodles     0.6582    0.6341    0.6460        82
       pickles     0.8663    0.8563    0.8613       174
         pizza     0.8286    0.7131    0.7665       122
          ribs     0.7640    0.6939    0.7273  

## GaussianNB

In [17]:
# Gaussian naive Bayes baseline: fit on train + validation, report test accuracy.
gnb = GaussianNB()
gnb_model = gnb.fit(X_train_valid, Y_train_valid)
gnb_score = accuracy_score(Y_test, gnb_model.predict(X_test))
gnb_score

0.16741472172351884

## RandomForest

In [18]:
# Random forest baseline: fit on train + validation, report test accuracy.
# random_state is fixed (2024, as for the data splits) so that the reported
# score and the probabilities saved later are reproducible across runs;
# n_jobs=-1 trains the 1200 trees on all available cores.
rfc = RandomForestClassifier(n_estimators=1200, random_state=2024, n_jobs=-1)
rfc_model = rfc.fit(X_train_valid, Y_train_valid)
rfc_score = rfc_model.score(X_test, Y_test)
rfc_score

0.7477558348294434

## AdaBoost

In [19]:
# AdaBoost baseline: fit on train + validation, report test accuracy.
# random_state is fixed (2024, as for the data splits) so the score is
# reproducible across runs.
adc = AdaBoostClassifier(n_estimators=50, random_state=2024)
adc_model = adc.fit(X_train_valid, Y_train_valid)
adc_score = adc_model.score(X_test, Y_test)
adc_score

0.20062836624775585

## SVC

In [20]:
# Support vector machine baseline (default settings): fit on train + validation,
# report test accuracy.
svc = SVC()
svc_model = svc.fit(X_train_valid, Y_train_valid)
svc_score = accuracy_score(Y_test, svc_model.predict(X_test))
svc_score

0.3779174147217235

## KNN

In [21]:
# k-nearest-neighbours baseline (k = 5): fit on train + validation,
# report test accuracy.
knc = KNeighborsClassifier(n_neighbors=5)
knc_model = knc.fit(X_train_valid, Y_train_valid)
knc_score = accuracy_score(Y_test, knc_model.predict(X_test))
knc_score

0.5264811490125674

In [22]:
# Linear discriminant analysis baseline (SVD solver): fit on train + validation,
# report test accuracy.
lda = LinearDiscriminantAnalysis(solver='svd')
lda_model = lda.fit(X_train_valid, Y_train_valid)
lda_score = accuracy_score(Y_test, lda_model.predict(X_test))
lda_score

0.31867145421903054

## 保存结果

In [23]:
# Judging by accuracy, the random forest's predictions are worth saving as
# well: compute its class probabilities on the test set.
rfc_pred = rfc_model.predict_proba(X_test)

# Test accuracies in a fixed order:
# LightGBM, GaussianNB, RandomForest, AdaBoost, SVC, KNN, LDA.
scores = [
    lgb_score,
    gnb_score,
    rfc_score,
    adc_score,
    svc_score,
    knc_score,
    lda_score,
]

In [24]:
# Persist the test-set class probabilities of LightGBM and the random forest.
np.save(file="lgb_pred.npy", arr=lgb_pred)
np.save(file="rfc_pred.npy", arr=rfc_pred)

In [25]:
# Write all accuracies on a single line; every value is followed by a tab
# (including the last one).
with open("model_scores.txt", "w", encoding="utf-8") as f:
    f.write("".join(str(score) + "\t" for score in scores))