In [11]:
#https://www.kaggle.com/vrtjso/lgbm-one-step-ahead
"""
This is an upgraded version of Ceshine's LGBM starter script, simply adding more
average features and weekly average features on it.
"""
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from scipy import sparse as ssp

In [12]:
# Load the tail of the training file.
# usecols skips the leading id column (the original note says it duplicates the row index).
df_train = pd.read_csv(
    '../input/train.csv', usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},  # promotion flag
    # Model log1p(sales); non-positive values are mapped to 0.
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0},
    parse_dates=['date'],
    # Author's note: skipping these rows keeps only 2016-01-01 .. 2017-08-15
    # (the full training file starts at 2013-01-01).
    skiprows=range(1, 66458909)
)

# The test frame is indexed by (store_nbr, item_nbr, date) right away so that it
# can be unstacked by date and joined with predictions later on.
df_test = pd.read_csv(
    '../input/test.csv', usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=['date']  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)

items = pd.read_csv(
    '../input/items.csv',
).set_index('item_nbr')

# Keep only the 2017 records. pd.Timestamp is used because the pd.datetime alias
# was deprecated in pandas 1.0 and removed in pandas 2.0.
df_2017 = df_train.loc[df_train.date >= pd.Timestamp('2017-01-01')]
del df_train

In [13]:
# Wide promotion table for the training period: rows are (store_nbr, item_nbr),
# columns are dates. Unstacking moves the innermost index level (date) into the
# columns; combinations without a record on a given day are filled with False.
promo_2017_train = (
    df_2017
    .set_index(['store_nbr', 'item_nbr', 'date'])[['onpromotion']]
    .unstack(level=-1)
    .fillna(False)
)
# After unstacking the columns are a two-level index (level 0 is always
# 'onpromotion', level 1 is the timestamp); keep only the timestamp level.
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)

# df_test already carries the (store_nbr, item_nbr, date) index from loading.
promo_2017_test = (
    df_test[['onpromotion']]
    .unstack(level=-1)
    .fillna(False)
)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)

In [14]:
# Align the test-period promotions to the (store, item) rows of the training
# table: test rows without training data are dropped and training rows missing
# from the test set become all-False (author's note: 210654 -> 167515 rows).
# The two periods are then placed side by side, one column per date.
promo_2017 = pd.concat(
    [
        promo_2017_train,
        promo_2017_test.reindex(promo_2017_train.index).fillna(False),
    ],
    axis=1
)
del promo_2017_test, promo_2017_train

In [15]:
# Wide sales table: rows are (store_nbr, item_nbr), columns are dates, values are
# the (log1p-transformed) unit sales; days without a record count as 0.
df_2017 = (
    df_2017
    .set_index(['store_nbr', 'item_nbr', 'date'])[['unit_sales']]
    .unstack(level=-1)
    .fillna(0)
)
df_2017.columns = df_2017.columns.get_level_values(1)

# Re-index the item metadata by the item_nbr of every df_2017 row, so `items`
# lines up row-for-row with the sales table; items absent from it are dropped.
items = items.reindex(df_2017.index.get_level_values(1))

In [16]:
def get_timespan(df, dt, minus, periods, freq='D'):
    """Select the date columns of ``df`` for a window positioned relative to ``dt``.

    Args:
        df: frame whose column labels are dates.
        dt: reference date.
        minus: the first selected date is ``dt`` minus this many days.
        periods: number of dates to select.
        freq: spacing between the selected dates; ``'7D'`` picks one date per week.

    Returns:
        The sub-frame of ``df`` restricted to the selected date columns.
    """
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]


def timeSeriesProcess(df_2017, t2017, label):
    """Build look-back statistics of ``df_2017`` for the reference date ``t2017``.

    Args:
        df_2017: wide frame with one column per date.
        t2017: reference date; every window ends before this date.
        label: prefix used in all generated column names.

    Returns:
        A DataFrame with a fresh 0..n-1 index and one row per row of ``df_2017``.
        Its columns are, in order: per-window mean (plus max/min/var/skew/kurt
        and an exponentially weighted sum for windows longer than one day),
        differences between the means of consecutive windows, and
        weekday-aligned means.
    """
    duration_list = [1, 3, 7, 14, 28, 60, 140]
    names = []     # column names in creation order
    columns = {}   # column name -> 1-D array with one value per input row

    def add(name, values):
        # Collect the column; the frame is assembled once at the end instead of
        # inserting well over a hundred columns one by one.
        names.append(name)
        columns[name] = values

    for d in duration_list:
        timespan = get_timespan(df_2017, t2017, d, d)
        add('{}_day_{}_2017'.format(label, d), timespan.mean(axis=1).values)
        if d != 1:
            add('{}_day_{}_2017_max'.format(label, d), timespan.max(axis=1).values)
            add('{}_day_{}_2017_min'.format(label, d), timespan.min(axis=1).values)
            add('{}_day_{}_2017_var'.format(label, d), timespan.var(axis=1).values)
            # skewness (third moment) and kurtosis (fourth moment) of the window
            add('{}_day_{}_2017_skew'.format(label, d), timespan.skew(axis=1).values)
            add('{}_day_{}_2017_kurt'.format(label, d), timespan.kurt(axis=1).values)

            # Exponentially weighted sum: column i is weighted by exp(-i/5).
            # Computed as one dot product so the result is always an ndarray;
            # the previous ndarray `+=` Series accumulation followed by
            # `.values` fails when the accumulator stays an ndarray.
            # NOTE(review): column 0 is the oldest day of the window, so the
            # largest weight goes to the oldest observation - confirm intended.
            weights = np.exp(-np.arange(timespan.shape[1]) / 5.0)
            add('{}_exp_moving_sum_{}'.format(label, d),
                timespan.values.astype(float).dot(weights))

    # First-order differences between the means of consecutive window lengths.
    for a, b in zip(duration_list[:-1], duration_list[1:]):
        add('{}_day_{}sub{}_2017'.format(label, a, b),
            columns['{}_day_{}_2017'.format(label, a)]
            - columns['{}_day_{}_2017'.format(label, b)])

    for i in range(7):  # i-th day after t2017, i.e. one weekday per iteration
        for j in [4, 10, 20]:
            # Mean over the same weekday in each of the previous j weeks.
            timespan = get_timespan(df_2017, t2017, j*7-i, j, freq='7D')
            add('{}_mean_{}_dow{}_2017'.format(label, j, i), timespan.mean(axis=1).values)

        # Same weekday as (t2017 + i days), one week earlier. Named `anchor`
        # so the imported `date` class is not shadowed.
        anchor = t2017 - timedelta(7 - i)
#         ahead = 7-i
#         # mean of the 7-i days starting at anchor
#         X['{}_ahead0_{}'.format(label, i)] = get_timespan(df_2017, anchor, 0, ahead).mean(axis=1).values
# #         # mean of the 14-i days starting a week before anchor
#         X['{}_ahead7_{}'.format(label, i)] = get_timespan(df_2017, anchor, 7, ahead+7).mean(axis=1).values
# #         # value on the day before last week's same weekday
#         X['{}_day_1_2017_{}_1'.format(label, i)]= get_timespan(df_2017, anchor, 1, 1).values.ravel()
# #         # value on the day before the same weekday two weeks ago
#         X['{}_day_1_2017_{}_2'.format(label, i)]= get_timespan(df_2017, anchor-timedelta(7), 1, 1).values.ravel()
        for m in [3, 7, 14, 28, 60, 130]:
            # Mean of the m days before the anchor, and before the anchor one week earlier.
            add('{}_mean_{}_2017_{}_1'.format(label, m, i),
                get_timespan(df_2017, anchor, m, m).mean(axis=1).values)
            add('{}_mean_{}_2017_{}_2'.format(label, m, i),
                get_timespan(df_2017, anchor - timedelta(7), m, m).mean(axis=1).values)

    return pd.DataFrame(columns, columns=names)
    
def prepare_dataset(df_2017, t2017, promo_2017, store_2017, is_train=True):
    """Assemble the feature frame (and optionally the targets) for one window.

    Args:
        df_2017: wide sales table, rows (store_nbr, item_nbr), one column per date.
        t2017: first day of the 16-day forecast window.
        promo_2017: wide promotion table with the same rows as ``df_2017``.
        store_2017: not used by the active code (only by the disabled store
            features below).
        is_train: when true, also return the 16 days of actual values.

    Returns:
        ``(X, y)`` when ``is_train`` is true, otherwise only ``X``.
    """
    store_ids = df_2017.index.get_level_values(0)
    item_ids = df_2017.index.get_level_values(1)
    # Number of non-promoted days among the 16 days starting at t2017.
    upcoming_promo = get_timespan(promo_2017, t2017, 0, 16)
    X = pd.DataFrame({
        'store_nbr': store_ids,
        'item_nbr': item_ids,
        'unpromo_16aftsum_2017': (1 - upcoming_promo).sum(axis=1).values
    })

    # Promotion-day counts over look-back windows of increasing length.
    for d in [1, 3, 7, 14, 28, 60, 140]:
        past_promo = get_timespan(promo_2017, t2017, d, d)
        X['promo_{}_2017'.format(d)] = past_promo.sum(axis=1).values

    # The test period is 2017-08-16 .. 2017-08-31 (16 days): one promotion flag
    # for t2017 and each of the following 15 days.
    for i in range(16):
        day = t2017 + timedelta(days=i)
        X['promo_{}'.format(i)] = promo_2017[day].values.astype(np.uint8)
        # Disabled interaction feature: promotion on day i x promotion count of the last j days.
#         for j in [7,14,60,140]:
#             X['aft_promo_{}_{}'.format(i,j)] = X['promo_{}'.format(i)] * X['promo_{}_2017'.format(j)]

    item_features = timeSeriesProcess(df_2017, t2017, 'item')
    X = pd.concat([X, item_features], axis=1)

    # Disabled store-level features.
#     X_shop = storeProcess(store_2017, t2017, 'store')
#     X = pd.merge(X, X_shop, on='store_nbr', how='left')

    if not is_train:
        return X
    # Targets: actual values for the 16 days starting at t2017.
    y = df_2017[pd.date_range(t2017, periods=16)].values
    return X, y

In [17]:
num_training_weeks = 8
print('Preparing dataset...')
t2017 = date(2017, 5, 31)  # start of the first training window
X_l, y_l = [], []
# Sliding training windows, one per week: 05-31..06-15, 06-07..06-22, ...
# NOTE(review): the last two windows start on 07-12 and 07-19, so their 16-day
# targets overlap the validation targets (07-26..08-10) - confirm this is acceptable.
# prepare_dataset does not read its store_2017 argument and no store_2017 is
# defined in this notebook, so None is passed; the old call raised NameError
# on a fresh kernel.
for i in range(num_training_weeks):
    print('training set ' + str(i) + ':')
    delta = timedelta(days = 7 * i)  # offset of this window from the first one
    X_tmp, y_tmp = prepare_dataset(
        df_2017, t2017 + delta, promo_2017, None
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0, ignore_index=True)
y_train = np.concatenate(y_l, axis=0)  # actual values of the 16 days of each window
del X_l, y_l

# Validation window: 07-26 .. 08-10 (16 days).
print('validation set:')
X_val, y_val = prepare_dataset(df_2017, date(2017, 7, 26), promo_2017, None)
print('testing set:')
X_test = prepare_dataset(df_2017, date(2017, 8, 16), promo_2017, None, is_train=False)

Preparing dataset...
training set 0:
training set 1:
training set 2:
training set 3:
training set 4:
training set 5:
training set 6:
training set 7:
validation set:
testing set:


In [18]:
# Extra feature read from ../feature/sjh.csv (column 'cnt'); negative counts are
# replaced with NaN, i.e. treated as missing.
sjh_df = pd.read_csv('../feature/sjh.csv')
sjh_df.loc[sjh_df['cnt']<0, 'cnt'] = np.nan
# NOTE(review): the same Series is assigned to all three frames and pandas aligns
# it on the row index, so rows whose index label is absent from sjh_df get NaN and
# train/val/test share identical values per index label - confirm this matches
# how sjh.csv was built.
X_train['sjh'] = sjh_df['cnt']
X_val['sjh'] = sjh_df['cnt']
X_test['sjh'] = sjh_df['cnt']

In [25]:
# Remove the raw item identifier from every feature frame (in place).
X_train.drop('item_nbr', axis=1, inplace=True)
X_val.drop('item_nbr', axis=1, inplace=True)
X_test.drop('item_nbr', axis=1, inplace=True)

In [26]:
# Remove the raw store identifier from every feature frame (in place).
# NOTE(review): a later cell merges the store metadata on 'store_nbr' and lists
# it among the categorical features; running this cell first makes that merge
# fail - confirm which of the two cells belongs to the final pipeline.
X_train.drop('store_nbr', axis=1, inplace=True)
X_val.drop('store_nbr', axis=1, inplace=True)
X_test.drop('store_nbr', axis=1, inplace=True)

In [282]:
# Item metadata features. `items` has one row per row of the sales table, so its
# columns are copied positionally; the training frame stacks num_training_weeks
# windows, hence the values are repeated that many times.
le = LabelEncoder()
items['family'] = le.fit_transform(items['family'])

X_train['family'] = np.tile(items['family'].values, num_training_weeks)
X_train['class'] = np.tile(items['class'].values, num_training_weeks)

X_val['family'] = items['family'].values
X_val['class'] = items['class'].values

X_test['family'] = items['family'].values
X_test['class'] = items['class'].values

In [283]:
# Store category information.
# Only columns 0, 3 and 4 of stores.csv are read; the code below uses them as
# 'store_nbr', 'type' and 'cluster' (presumably the file's column order - verify).
store_info = pd.read_csv('../input/stores.csv', usecols=[0, 3, 4])
# Left-join the store attributes onto every feature frame; requires a 'store_nbr' column.
X_train = pd.merge(X_train, store_info, on='store_nbr', how='left')
X_val = pd.merge(X_val, store_info, on='store_nbr', how='left')
X_test = pd.merge(X_test, store_info, on='store_nbr', how='left')

cat_features = ['store_nbr','type','cluster','family','class']
# Every remaining column is treated as numeric (this includes 'item_nbr' if it
# has not been removed beforehand).
num_features = [i for i in X_train.columns if i not in cat_features]
# Integer-encode each categorical column with an encoder fitted on the values
# seen in train, validation and test together.
for col in cat_features:
    le = LabelEncoder()
    le.fit(pd.concat([X_train[col].drop_duplicates(), X_val[col].drop_duplicates(), X_test[col].drop_duplicates()]))
    X_train[col] = le.transform(X_train[col])
    X_val[col] = le.transform(X_val[col])
    X_test[col] = le.transform(X_test[col])
    
# One-hot encode the integer codes; the encoder is fitted on all three frames.
enc = OneHotEncoder()
enc.fit(pd.concat([X_train[cat_features],X_val[cat_features],X_test[cat_features]]))
X_train_cat = enc.transform(X_train[cat_features])
X_val_cat = enc.transform(X_val[cat_features])
X_test_cat = enc.transform(X_test[cat_features])

# Frequency features: how often each category value occurs across train + val + test.
cat_count_features = []
for col in cat_features:
    d = pd.concat([X_train[col],X_val[col],X_test[col]]).value_counts().to_dict()
    X_train['%s_count'%col] = X_train[col].apply(lambda x:d.get(x,0))
    X_val['%s_count'%col] = X_val[col].apply(lambda x:d.get(x,0))   
    X_test['%s_count'%col] = X_test[col].apply(lambda x:d.get(x,0))
    cat_count_features.append('%s_count'%col)

# Final design matrices: numeric + count columns followed by the one-hot block.
# From here on X_train / X_val / X_test are scipy CSR matrices, not DataFrames,
# so this cell cannot be re-run without rebuilding the frames first.
X_train = ssp.hstack([X_train[num_features+cat_count_features].values,X_train_cat,]).tocsr()
X_val = ssp.hstack([X_val[num_features+cat_count_features].values,X_val_cat,]).tocsr()
X_test = ssp.hstack([X_test[num_features+cat_count_features].values,X_test_cat,]).tocsr()

In [293]:
import pickle

# Cache the prepared matrices. Each file is opened in a `with` block so the
# handle is flushed and closed as soon as the dump finishes.
with open("../feature/train.pk", 'wb') as fout:
    pickle.dump([X_train, y_train], fout)
with open("../feature/val.pk", 'wb') as fout:
    pickle.dump([X_val, y_val], fout)
with open("../feature/test.pk", 'wb') as fout:
    pickle.dump(X_test, fout)

In [298]:
# Reload the cached training matrices; the file handle is closed after loading.
# Only load pickle files produced by this notebook - unpickling untrusted data
# can execute arbitrary code.
with open("../feature/train.pk", 'rb') as fin:
    X_train, y_train = pickle.load(fin)
X_train.shape, y_train.shape

((1340120, 626), (1340120, 16))

In [27]:
# Disabled helper for dropping a hand-picked list of columns `t` from all frames.
# t = [  
# ]
# X_train.drop(t, axis=1, inplace=True)
# X_val.drop(t, axis=1, inplace=True)
# X_test.drop(t, axis=1, inplace=True)
# Show the feature names and the column count of each frame.
# NOTE(review): this needs X_train to still be a DataFrame (it reads .columns);
# it fails once the frames have been converted to sparse matrices.
np.array(X_train.columns), X_train.shape[1],X_val.shape[1],X_test.shape[1]

(array(['unpromo_16aftsum_2017', 'promo_1_2017', 'promo_3_2017',
        'promo_7_2017', 'promo_14_2017', 'promo_28_2017', 'promo_60_2017',
        'promo_140_2017', 'promo_0', 'promo_1', 'promo_2', 'promo_3',
        'promo_4', 'promo_5', 'promo_6', 'promo_7', 'promo_8', 'promo_9',
        'promo_10', 'promo_11', 'promo_12', 'promo_13', 'promo_14',
        'promo_15', 'item_day_1_2017', 'item_day_3_2017',
        'item_day_3_2017_max', 'item_day_3_2017_min', 'item_day_3_2017_var',
        'item_day_3_2017_skew', 'item_day_3_2017_kurt',
        'item_exp_moving_sum_3', 'item_day_7_2017', 'item_day_7_2017_max',
        'item_day_7_2017_min', 'item_day_7_2017_var',
        'item_day_7_2017_skew', 'item_day_7_2017_kurt',
        'item_exp_moving_sum_7', 'item_day_14_2017', 'item_day_14_2017_max',
        'item_day_14_2017_min', 'item_day_14_2017_var',
        'item_day_14_2017_skew', 'item_day_14_2017_kurt',
        'item_exp_moving_sum_14', 'item_day_28_2017',
        'item_day_28_2017_m

In [28]:
# Quick experiment: train one model for the first forecast day only.
# NOTE(review): per the LightGBM parameter docs 'min_child_samples' is an alias of
# 'min_data_in_leaf' (two different values are given), 'subsample' is an alias of
# 'bagging_fraction' (no 'bagging_freq' is set), and 'drop_rate' / 'max_drop'
# apply to dart boosting only - confirm which settings are really intended.
params = {
    'num_leaves': 33,
    'objective': 'regression',
    'min_data_in_leaf': 1500,
    'learning_rate': 0.1,
    'feature_fraction': 0.7,
    'min_split_gain': 0,
    'metric': 'l2',
    'subsample': 0.9,
    'drop_rate': 0.1,
    'min_child_samples': 10,
    'min_child_weight': 150,
    'max_drop': 50,
    'boosting':'gbdt'
}

MAX_ROUNDS = 10000
i=0
print('Step %d' % (i+1))
# Sample weights are 1.25 where items['perishable'] is 1 and 1.0 where it is 0;
# the training weights repeat once per training window.
dtrain = lgb.Dataset(
    X_train,
    label=y_train[:, i],
    weight=pd.concat([items['perishable']] * num_training_weeks) * 0.25 + 1
)
dval = lgb.Dataset(
    X_val,
    label=y_val[:, i],
    reference=dtrain,
    weight=items['perishable'] * 0.25 + 1
)
bst = lgb.train(
    params,
    dtrain,
    num_boost_round=MAX_ROUNDS,
    valid_sets=dval,
    early_stopping_rounds=100,
    verbose_eval=100
)
# 1: without cat features: 0.286587(178), 0.286753(264), 0.286683(270)

Step 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 0.298849
[200]	valid_0's l2: 0.296806
[300]	valid_0's l2: 0.296227
[400]	valid_0's l2: 0.295871


KeyboardInterrupt: 

In [69]:
import operator
import matplotlib.pyplot as plt
%matplotlib inline
names = bst.feature_name()
values = bst.feature_importance()
importance = {}
for i in range(len(names)):
    importance[names[i]] = values[i]
importance = sorted(importance.items(), key=operator.itemgetter(1))  

df = pd.DataFrame(importance, columns=['feature', 'fscore'])  
df['fscore'] = df['fscore'] / df['fscore'].sum()
# plt.figure()
# df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(12, 17))  
# plt.title('LightGBM Feature Importance')
# plt.xlabel('relative importance')
# plt.show()

In [70]:
# Features whose relative importance is at most 0.00005.
df.loc[df['fscore'].le(0.00005), 'feature'].values

array(['item_day_3_2017_kurt', 'aft_promo_5_14', 'promo_8', 'promo_12'], dtype=object)

In [284]:
print('Training and predicting models...')
# NOTE(review): per the LightGBM parameter docs 'min_child_samples' is an alias of
# 'min_data_in_leaf', 'subsample' needs 'bagging_freq' to take effect, and
# 'drop_rate' / 'max_drop' apply to dart boosting only - confirm the intent.
params = dict(
    num_leaves=33,
    objective='regression',
    min_data_in_leaf=1500,
    learning_rate=0.02,
    feature_fraction=0.7,
    min_split_gain=0,
    metric='l2',
    subsample=0.9,
    drop_rate=0.1,
    min_child_samples=10,
    min_child_weight=150,
    max_drop=50,
    boosting='gbdt'
)

MAX_ROUNDS = 10000
val_pred = []
test_pred = []
seed_list = [1, 3, 5, 7]
# One model per forecast day: the label column changes with i, the features do not.
for i in range(16):
    print('Step %d' % (i+1))
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        weight=pd.concat([items['perishable']] * num_training_weeks) * 0.25 + 1
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, i], reference=dtrain,
        weight=items['perishable'] * 0.25 + 1
    )

    # Predictions for this day, averaged over the seeds.
    val_res = np.zeros(X_val.shape[0])
    test_res = np.zeros(X_test.shape[0])
    for seed in seed_list:
        print('seed: %d' % seed)
        params['seed'] = seed
        bst = lgb.train(
            params, dtrain, num_boost_round=MAX_ROUNDS,
            valid_sets=dval, early_stopping_rounds=100, verbose_eval=100
        )
        val_res += bst.predict(X_val, num_iteration=bst.best_iteration) / len(seed_list)
        test_res += bst.predict(X_test, num_iteration=bst.best_iteration) / len(seed_list)

    val_pred.append(val_res)
    test_pred.append(test_res)

# Unweighted MSE first, then the MSE weighted like the training samples.
print('old Validation mse:', mean_squared_error(y_val, np.array(val_pred).transpose()))
print('Validation mse:', mean_squared_error(
    y_val, np.array(val_pred).transpose(),
    sample_weight=items['perishable'] * 0.25 + 1))
# 0.1:0.285485,0.319699,0.333899,0.344843, 0.344057, 0.354038

Training and predicting models...
Step 1
seed: 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 0.31685
[200]	valid_0's l2: 0.293052
[300]	valid_0's l2: 0.2888
[400]	valid_0's l2: 0.287346
[500]	valid_0's l2: 0.286554
[600]	valid_0's l2: 0.285994
[700]	valid_0's l2: 0.285593
[800]	valid_0's l2: 0.285259
[900]	valid_0's l2: 0.284982
[1000]	valid_0's l2: 0.284744
[1100]	valid_0's l2: 0.284568
[1200]	valid_0's l2: 0.28439
[1300]	valid_0's l2: 0.284247
[1400]	valid_0's l2: 0.284112
[1500]	valid_0's l2: 0.28402
[1600]	valid_0's l2: 0.2839
[1700]	valid_0's l2: 0.28381
[1800]	valid_0's l2: 0.283707
[1900]	valid_0's l2: 0.283623
[2000]	valid_0's l2: 0.28357
[2100]	valid_0's l2: 0.28351
[2200]	valid_0's l2: 0.283439
[2300]	valid_0's l2: 0.283371
[2400]	valid_0's l2: 0.283304
[2500]	valid_0's l2: 0.283259
[2600]	valid_0's l2: 0.283213
[2700]	valid_0's l2: 0.283161
[2800]	valid_0's l2: 0.283112
[2900]	valid_0's l2: 0.283061
[3000]	valid_0's l2: 0.283038
[3100]	

In [None]:
recover_store_cat:0.355868309398, fix:0.355397273317->0.510
        0.02:0.355059895247, fix:0.354562963844->0.510
item_exp:0.1:0.355630276039, fix:0.355160512964->0.510
item_exp_few:0.1:0.355803713767, fix:0.35532155446->0.511
item_exp_few_3fold: 0.354763443602,0.3542717483->0.510
item_cat:0.351795385334,0.35128192006->0.509
item_cat_4fold_0.02:0.351661289325,0.351142671762

In [None]:
baseline: 0.362437206173->0.515
param_fix:0.361677049762->0.514
params = {
    'num_leaves': 33, 'objective': 'regression', 'min_data_in_leaf': 250, 'learning_rate': 0.02,
    'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'bagging_freq': 2, 'metric': 'l2',}
expand: 0.359787992849(0.1:0.360486931421)->0.513
1-8fe: (0.1:
        lgbahead,0.358851899136, 
        added max, min, third- and fourth-order moments (250 dims), 0.358836353721, implementation had a bug)->0.512
    0.01:0.357909454736->0.511
        
1-8fe_few：0.1:0.357231820197(65维)->0.512
    fe_few_shop0.01:0.35687875926->0.517
1-9fix:0.357625751045
    (using only 4 weeks) 0.357720538403
    (using only 4 weeks, adding item info, 138 dims) 0.355534132174->0.514
    (using 8 weeks) 0.1:0.353922676054->0.513
        
restore: only kurt used, store fluctuation info not used, 0.1: 0.358851609814, fix:0.35837375683
# bug fixed, using item info, 0.1: 0.35514947207, fix:0.354666562459 (0.01:0.35375626153, fix:0.353261266407->0.512)
#     bagging parameters removed: 0.1: 0.354829531037, 0.354325095234
#     new parameters: 0.1: 0.354090609365, 0.353654615852
# params = {'num_leaves': 33,'objective': 'regression','min_data_in_leaf': 1500,'learning_rate': 0.1,
# 'feature_fraction': 0.6,'min_split_gain': 0,'metric': 'l2','subsample': 0.9,'drop_rate': 0.1,
# 'min_child_samples': 10,'min_child_weight': 150,'max_drop': 50,'boosting':'gbdt'}
recover: 0.1:0.358387893938, fix:0.357862016198->0.511
recover_cat:0.356011999393, fix:0.355533224308->0.511
recover_store_cat:0.355868309398, fix:0.355397273317->0.510
        0.02:0.355059895247, fix:0.354562963844->0.510

In [264]:
# Export the 2017 sales history side by side with the 16 predicted test days.
# y_test is derived here from test_pred (one prediction vector per forecast day)
# so this cell no longer depends on the submission cell further down having run.
y_test = np.array(test_pred).transpose()
full_2017 = pd.concat([df_2017.reset_index(drop=True), pd.DataFrame(y_test)], ignore_index=True, axis=1)
full_2017.to_csv('../feature/data.csv.gz', float_format='%.4f', index=False, compression='gzip')

In [290]:
# Shape of the test predictions, computed from test_pred directly so the check
# works even when the cell that assigns y_test has not been executed yet.
np.array(test_pred).transpose().shape

(167515, 16)

In [285]:
print('Making submission...')
# test_pred holds one prediction vector per forecast day; after the transpose
# each row is one (store, item) pair and each column one day.
y_test = np.array(test_pred).transpose()
forecast_dates = pd.date_range('2017-08-16', periods=16)
df_preds = pd.DataFrame(y_test, index=df_2017.index, columns=forecast_dates)
# stack() moves the date columns into the innermost index level.
df_preds = df_preds.stack().to_frame('unit_sales')
df_preds.index.set_names(['store_nbr', 'item_nbr', 'date'], inplace=True)

# Test rows without any training history have no prediction and are filled with 0.
submission = df_test[['id']].join(df_preds, how='left').fillna(0)
# Undo the log1p transform and limit the result to the range [0, 1000].
submission['unit_sales'] = np.expm1(submission['unit_sales']).clip(lower=0, upper=1000)
submission.to_csv(
    '../submit/1-14lgb_item_cat_4fold_0.02.csv.gz',
    float_format='%.4f', index=None, compression='gzip'
)

Making submission...


In [None]:
# "Toxic" (harmful) features:
# X_train['perishable'] = items['perishable']
# X_val['perishable'] = items['perishable']
# X_test['perishable'] = items['perishable']

# perishable * on-promotion interaction
# for i in range(16):  
#     X_train['promo_{}*perishable'.format(i)] = X_train['promo_{}'.format(i)] * X_train['perishable']
#     X_val['promo_{}*perishable'.format(i)] = X_val['promo_{}'.format(i)] * X_val['perishable']
#     X_test['promo_{}*perishable'.format(i)] = X_test['promo_{}'.format(i)] * X_test['perishable']

# # item information
# le = LabelEncoder()
# items['family'] = le.fit_transform(items['family'])
# X_train['family'] = items['family']
# X_val['family'] = items['family']
# X_test['family'] = items['family']
# X_train['class'] = items['class']
# X_val['class'] = items['class']
# X_test['class'] = items['class']

# # same kind of item within the same store
# X_train['store_nbr_family'] = X_train['store_nbr'].astype(str) + '_' + X_train['family'].astype(str)
# X_val['store_nbr_family'] = X_val['store_nbr'].astype(str) + '_' + X_val['family'].astype(str)
# X_test['store_nbr_family'] = X_test['store_nbr'].astype(str) + '_' + X_test['family'].astype(str)
# X_train['store_nbr_class'] = X_train['store_nbr'].astype(str) + '_' + X_train['class'].astype(str)
# X_val['store_nbr_class'] = X_val['store_nbr'].astype(str) + '_' + X_val['class'].astype(str)
# X_test['store_nbr_class'] = X_test['store_nbr'].astype(str) + '_' + X_test['class'].astype(str)

# cat_list = ['store_nbr_family','store_nbr_class']
# for col in cat_list:
#     le = LabelEncoder()
#     le.fit(pd.concat([X_train[col].drop_duplicates(), X_val[col].drop_duplicates(), X_test[col].drop_duplicates()],
#                      ignore_index=True))
#     X_train[col] = le.transform(X_train[col])
#     X_val[col] = le.transform(X_val[col])
#     X_test[col] = le.transform(X_test[col])
    
# X_train.drop(['family','class'], axis=1, inplace=True)
# X_val.drop(['family','class'], axis=1, inplace=True)
# X_test.drop(['family','class'], axis=1, inplace=True)