# 1.引包

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import time
import gc
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import lightgbm as lgb

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Directory fragments; a full path is BASE_PATH + INPUT_PATH (or OUTPUT_PATH) + file name.
BASE_PATH = '/kaggle/'
INPUT_PATH = 'input/tabular-playground-series-apr-2022/'
OUTPUT_PATH = 'working/'

# File names inside the competition input directory.
TRAIN_NAME = 'train.csv'
TRAIN_RESULTS_NAME = 'train_labels.csv'
TEST_NAME = 'test.csv'
SAMPLE_SUBMISSION = 'sample_submission.csv'

# Name of the submission file.
SUBMISSION = 'submission.csv'

In [None]:
!ls /kaggle/input/tabular-playground-series-apr-2022

# 2.处理特征

1. 相同sensor在60秒之间值变化的简单pattern，max、min、mean、median、std（标准差）--准确率约79.2%
2. 相同sensor在60秒之间值变化的更多pattern，max-min、25%位数、75%位数、max所在秒数、min所在秒数--准确率约80.4%
3. 改善模型，使用kfold+lgbm以及调优参数--准确率约89.6%
4. todo:改善模型，不使用kfold只用上面的超参数--准确率约88.1%
5. todo:改善模型，增加GridSearchCV（Grid Search with Cross Validation）进行调参
6. todo:考虑相同sensor在60秒之间值变化的时序
7. todo:考虑不同sensor在60秒之间值变化的关联
8. todo:考虑subject之间的差异


# 2.1 观察数据

In [None]:
# Load train.csv from the competition input directory and preview the first rows.
train_csv_path = f"{BASE_PATH}{INPUT_PATH}{TRAIN_NAME}"
df_train = pd.read_csv(train_csv_path)
df_train.head()

In [None]:
# Report the row count, the length of the `subject` column, and the number of
# distinct `subject` and `sequence` values.
n_rows = len(df_train)
n_subject_rows = len(df_train["subject"])
n_unique_subjects = len(df_train["subject"].drop_duplicates())
n_unique_sequences = len(df_train["sequence"].drop_duplicates())
print(f"数据集：{n_rows}，subject列：{n_subject_rows}"
      f"，去重subject列：{n_unique_subjects}"
      f"，去重sequence列：{n_unique_sequences}")

In [None]:
# Print a "max/min/mean" summary string for every column of the training frame.
# `to_numpy` must be *called*: without the parentheses the bound method object
# is printed instead of the array of summaries. The Series methods max()/min()
# are used instead of the builtins so the reduction is vectorised rather than
# iterating over every row in Python.
column_summary = df_train.apply(
    lambda col: f"max:{round(col.max(), 2)},min:{round(col.min(), 2)},mean:{round(col.mean(), 5)}",
    axis=0,
)
print(column_summary.to_numpy())

In [None]:
# Mean of the sensor_01 column, computed on its NumPy array.
sensor_01_values = df_train["sensor_01"].to_numpy()
sensor_01_values.mean()

In [None]:
# Display pandas' descriptive statistics for the training frame.
df_train.describe()

# 2.2 聚合处理数据

In [None]:
# Dry run of the feature logic used by the custom per-sequence function,
# applied here to the whole training frame at once.
col_org = [f"sensor_{idx:02d}" for idx in range(13)]  # sensor_00 .. sensor_12

# Aggregations are named by string: passing NumPy callables (np.max, np.std, ...)
# in a list is deprecated in recent pandas, and np.std would eventually switch
# from the sample standard deviation (ddof=1) to the population one (ddof=0).
x_static1 = df_train[col_org].agg(["max", "min", "mean", "median", "std"])
x_static2 = df_train[col_org].quantile(0.25)
x_static3 = df_train[col_org].quantile(0.75)
step_values = df_train["step"].to_numpy()

# Collect all features in a dict and build the frame once, instead of inserting
# 130 columns one by one into an empty DataFrame.
feature_row = {}
for col_temp in col_org:
    sensor_values = df_train[col_temp].to_numpy()
    max_value = x_static1.at["max", col_temp]
    min_value = x_static1.at["min", col_temp]
    feature_row["max_" + col_temp] = max_value
    feature_row["min_" + col_temp] = min_value
    feature_row["mean_" + col_temp] = x_static1.at["mean", col_temp]
    feature_row["median_" + col_temp] = x_static1.at["median", col_temp]
    feature_row["std_" + col_temp] = x_static1.at["std", col_temp]
    feature_row["high_" + col_temp] = max_value - min_value  # value range
    feature_row["q25_" + col_temp] = x_static2[col_temp]
    feature_row["q75_" + col_temp] = x_static3[col_temp]
    # `step` of the first row at which the maximum / minimum is reached.
    feature_row["max_sec_" + col_temp] = step_values[np.flatnonzero(sensor_values == max_value)[0]]
    feature_row["min_sec_" + col_temp] = step_values[np.flatnonzero(sensor_values == min_value)[0]]
df = pd.DataFrame([feature_row])
df



In [None]:
# Names of the 13 sensor columns: sensor_00 .. sensor_12.
col_org = [f"sensor_{idx:02d}" for idx in range(13)]


def compute_feature_step1(x):
    """Summarise one group of rows into a single-row feature DataFrame.

    Intended for ``DataFrame.groupby(...).apply``: ``x`` is the sub-DataFrame of
    one group and must contain a ``step`` column plus every column in
    ``col_org``.

    For each sensor column the returned row holds, in this order: ``max_``,
    ``min_``, ``mean_``, ``median_``, ``std_`` (sample standard deviation,
    ddof=1), ``high_`` (max - min), ``q25_``, ``q75_``, ``max_sec_`` and
    ``min_sec_`` (the ``step`` value of the first row where the maximum /
    minimum occurs).

    Returns:
        A DataFrame with exactly one row (index 0) and 10 columns per sensor.
    """
    sensors = x[col_org]
    steps = x["step"].to_numpy()

    # String aggregation names instead of NumPy callables: passing np.max /
    # np.std in a list is deprecated in recent pandas, and np.std would
    # eventually change the result from ddof=1 to ddof=0.
    stats = sensors.agg(["max", "min", "mean", "median", "std"])
    q25 = sensors.quantile(0.25)
    q75 = sensors.quantile(0.75)

    # Build the row as a dict and create the frame once; inserting 130 columns
    # one at a time into an empty DataFrame is slow and fragments the frame.
    row = {}
    for col in col_org:
        values = sensors[col].to_numpy()
        col_max = stats.at["max", col]
        col_min = stats.at["min", col]
        row["max_" + col] = col_max
        row["min_" + col] = col_min
        row["mean_" + col] = stats.at["mean", col]
        row["median_" + col] = stats.at["median", col]
        row["std_" + col] = stats.at["std", col]
        row["high_" + col] = col_max - col_min
        row["q25_" + col] = q25[col]
        row["q75_" + col] = q75[col]
        # First position where the extreme value is attained.
        row["max_sec_" + col] = steps[np.flatnonzero(values == col_max)[0]]
        row["min_sec_" + col] = steps[np.flatnonzero(values == col_min)[0]]
    return pd.DataFrame([row])


In [None]:
# Build one feature row per training sequence (slow: the author's note reports
# roughly 20-30 minutes).
df_train_step1 = df_train.groupby("sequence").apply(compute_feature_step1)
# Cache the features. The `sequence` index level is moved into a column first so
# the cached file keeps the key needed to join it back to labels; the remaining
# (per-group) index level carries no information and is not written.
df_train_step1.reset_index(level="sequence").to_csv("df_train_step1.csv", index=False)
df_train_step1.shape

In [None]:
# Build one feature row per test sequence (slow: the author's note reports
# roughly 10-15 minutes).
df_test = pd.read_csv(BASE_PATH + INPUT_PATH + TEST_NAME)
df_test_step1 = df_test.groupby("sequence").apply(compute_feature_step1)
# Cache the features. The `sequence` index level is moved into a column first so
# the cached file keeps the key needed to match rows to submission entries; the
# remaining (per-group) index level carries no information and is not written.
df_test_step1.reset_index(level="sequence").to_csv("df_test_step1.csv", index=False)
df_test_step1.shape

In [None]:
# Join the per-sequence training features with the contents of train_labels.csv
# on `sequence` (presumably this file supplies the `state` column used as the
# target later - confirm against the data).
# reset_index() afterwards adds an `index` column, which the training cells drop.
df_labels = pd.read_csv(f"{BASE_PATH}{INPUT_PATH}{TRAIN_RESULTS_NAME}")
df_train_result = df_train_step1.merge(df_labels, on=["sequence"], how="inner").reset_index()
df_train_result.head()

# 3.训练模型

# 3.1.简单使用LGBM

In [None]:
# Baseline: LightGBM's scikit-learn classifier on a single 80/20 hold-out split.
data = df_train_result.drop(columns=['sequence', 'state', 'index']).to_numpy()
# 1-D label vector: scikit-learn style estimators expect y of shape (n_samples,);
# an (n_samples, 1) column vector triggers a conversion warning.
target = df_train_result['state'].to_numpy()
# Fixed seed so the split, and therefore the reported metrics, are reproducible.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=2022)

# Earlier experiment kept for reference:
#gbm = lgb.LGBMRegressor(objective='regression', num_leaves=50, learning_rate=0.05, n_estimators=30)
gbm = lgb.LGBMClassifier(objective='binary', num_leaves=110, learning_rate=0.1, n_estimators=125)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='l1')

# Result recorded by the author for a previous configuration:
# gbm = lgb.LGBMClassifier(objective='binary',num_leaves=120,learning_rate=0.1,n_estimators=90)
# valid_0's l1: 0.229509	valid_0's binary_logloss: 0.365004

# 3.2.使用详细调参的LGBM

In [None]:
# Hand-tuned LightGBM parameters, passed to lgb.train below.
# NOTE(review): the LightGBM parameter docs say `bagging_fraction` only takes
# effect when `bagging_freq` is non-zero; with `bagging_freq` commented out the
# row subsampling is presumably inactive - confirm before relying on it.
paras = {
    "boosting_type": "gbdt",  # boosting algorithm
    "objective": "binary",  # objective function (regression / binary)
    # "num_leaves": 31,  # number of leaves per tree
    "learning_rate": 0.1,  # learning rate
    "max_depth": 5,  # maximum depth of each tree
    "random_state": 2022,
    "bagging_fraction": 0.8,  # fraction of rows sampled when building trees
    "feature_fraction": 0.8,  # fraction of features sampled when building trees
    # "bagging_freq": 5,  # k means bagging is performed every k iterations
    "metric": "auc",  # evaluation metric; several may be given, e.g. {'l2', 'auc'}
    # "verbose": 1,  # <0 fatal only, =0 errors (warnings), >0 info
}

In [None]:
# Train a booster with the tuned parameters on a single 80/20 hold-out split.
data = df_train_result.drop(columns=['sequence', 'state', 'index'])
target = df_train_result[['state']]
# train_test_split works directly on pandas objects.
# Fixed seed so the split, and therefore the early-stopping point, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=2022)

data_train = lgb.Dataset(X_train, label=y_train)
data_val = lgb.Dataset(X_test, label=y_test)
lgb_results = {}  # filled with the per-iteration metric history by record_evaluation
model = lgb.train(
    params=paras,
    train_set=data_train,
    valid_sets=[data_val, data_train],
    valid_names=['eval', 'train'],
    num_boost_round=1000,
    # Early stopping, logging and metric recording are passed as callbacks: the
    # early_stopping_rounds / verbose_eval / evals_result keyword arguments were
    # removed from lgb.train in LightGBM 4.0.
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100),
        lgb.record_evaluation(lgb_results),
    ],
)

# 3.3.使用KFOLD+LGBM（参数复用上面）

In [None]:
# Train one booster per fold of a 5-fold split, reusing the tuned parameters.
print(df_train_result.columns)
x_train = df_train_result.drop(columns=['sequence', 'state', 'index'])
y_train = df_train_result[['state']]

from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=70)
models = []  # one trained booster per fold, averaged at prediction time
for fold, (tr_idx, va_idx) in enumerate(kf.split(x_train), start=1):
    print(f'--------fold:{fold}--------')
    tr_x, va_x = x_train.iloc[tr_idx], x_train.iloc[va_idx]
    tr_y, va_y = y_train.iloc[tr_idx], y_train.iloc[va_idx]
    data_train_temp = lgb.Dataset(tr_x, tr_y)
    data_val_temp = lgb.Dataset(va_x, va_y)
    lgb_results_temp = {}  # metric history of this fold
    model_temp = lgb.train(
        params=paras,
        train_set=data_train_temp,
        valid_sets=[data_val_temp, data_train_temp],
        valid_names=['eval', 'train'],
        num_boost_round=1000,
        # Callbacks replace the early_stopping_rounds / verbose_eval /
        # evals_result keyword arguments removed in LightGBM 4.0.
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=100),
            lgb.record_evaluation(lgb_results_temp),
        ],
    )
    models.append(model_temp)

# 4.评估模型

# 5.提交

# 5.1.简单使用LGBM

In [None]:
# Predict with the baseline classifier and assemble a submission frame.
# The frame is only built here; writing it to disk is left commented out below.
df_test_temp = df_test_step1.copy()
X_pred = df_test_temp.to_numpy()

Y_pred = gbm.predict(X_pred, num_iteration=gbm.best_iteration_)
df_test_temp["state"] = Y_pred

# reset_index() turns the index levels into columns so `sequence` can be selected.
df_pred1 = df_test_temp.reset_index()[["sequence", "state"]]

df_sample_submission = pd.read_csv(f"{BASE_PATH}{INPUT_PATH}{SAMPLE_SUBMISSION}")

# Both frames carry a `state` column, so the merge suffixes them; `state_y` is
# the predicted one coming from df_pred1.
merged_submission = df_sample_submission.merge(df_pred1, on=["sequence"], how="inner").reset_index()
df_submission_1 = merged_submission[["sequence", "state_y"]].rename(columns={"state_y": "state"})

#df_submission_1.to_csv("submission.csv", index=False)

# 5.2.使用详细调参的LGBM

In [None]:
# Predict with the single-split booster and write submission.csv.
x_test = df_test_step1.copy()
prediction = model.predict(x_test)

# Key every prediction by its sequence id (taken from the `sequence` index level
# of the feature frame) so the submission is filled by id rather than by row
# position, which would silently mis-assign values if the row orders differed.
prediction_by_sequence = pd.Series(prediction, index=x_test.index.get_level_values("sequence"))

df_for_submit_2 = pd.read_csv(BASE_PATH + INPUT_PATH + SAMPLE_SUBMISSION)
df_for_submit_2["state"] = df_for_submit_2["sequence"].map(prediction_by_sequence)
if df_for_submit_2["state"].isna().any():
    raise ValueError("some sequences in the sample submission have no prediction")
df_for_submit_2.to_csv('submission.csv', index=False)
df_for_submit_2

# 5.3.使用KFOLD+LGBM

In [None]:
# Work on a copy of the per-sequence test features and preview the first rows.
x_test = df_test_step1.copy()
x_test.head()

In [None]:
# Average the predictions of the per-fold boosters.
# The iteration variable is deliberately not named `model`: a `for ... model in
# models` loop would rebind the module-level `model` (the single-split booster)
# to the last fold's booster.
if not models:
    raise ValueError("no fold models available; run the KFold training cell first")
fold_predictions = [fold_model.predict(x_test) for fold_model in models]
prediction = np.mean(fold_predictions, axis=0)

In [None]:
# Assemble the submission frame for the fold-averaged predictions.
# Predictions are keyed by the `sequence` index level of x_test and mapped onto
# the sample submission by id, instead of relying on both having the same row order.
prediction_by_sequence = pd.Series(prediction, index=x_test.index.get_level_values("sequence"))
df_for_submit_3 = pd.read_csv(BASE_PATH + INPUT_PATH + SAMPLE_SUBMISSION)
df_for_submit_3["state"] = df_for_submit_3["sequence"].map(prediction_by_sequence)
if df_for_submit_3["state"].isna().any():
    raise ValueError("some sequences in the sample submission have no prediction")

# Optional hard 0/1 labels, thresholded at 0.5 (disabled):
## df_for_submit_3.loc[df_for_submit_3["state"] > 0.5, "state"] = 1


# Writing this submission to disk is currently disabled:
#df_for_submit_3.to_csv('submission.csv', index=False)
#df_for_submit_3