In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
# Columns handed to LightGBM as categorical features.
categoricals = [
    "site_id",
    "building_id",
    "primary_use",
    "hour",
    "weekday",
    "meter",
    "wind_direction",
]

# Columns used as plain numeric features.
numericals = [
    "square_feet",
    "year_built",
    "air_temperature",
    "cloud_coverage",
    "dew_temperature",
    "precip_depth_1_hr",
    "floor_count",
    "beaufort_scale",
]

# Full model input: categorical columns first, then numeric ones. The same
# list is used to select features from both the train and the test frames.
feat_cols = [*categoricals, *numericals]

# Carregamento dos dados de treinamento
Utiliza arquivos pickle criados em uma versão anterior do notebook para acelerar o tempo de carregamento

In [None]:
# Pre-processed training features and target, stored as pickles.
# NOTE(review): unpickling executes arbitrary code — only load files from a trusted dataset.
train_pickle_path = '/kaggle/input/ashrae-pickle/train.pickle'
target_pickle_path = '/kaggle/input/ashrae-pickle/target.pickle'

train = pd.read_pickle(train_pickle_path)
target = pd.read_pickle(target_pickle_path)

# Treinamento do modelo (não precisa rodar)
Utiliza Stratified K-Fold (estratificado por `building_id`) para Cross Validation, treinando 4 modelos diferentes, um por fold

Este passo é opcional, visto que já foi executado e os modelos foram salvos nos arquivos lgb_i.txt (ver Seção Carregamento do modelo salvo)

In [None]:
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm import tqdm


# LightGBM hyper-parameters shared by every fold.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'rmse'},
    # Row bagging settings.
    'subsample': 0.25,
    'subsample_freq': 1,
    'learning_rate': 0.4,
    'num_leaves': 20,
    'feature_fraction': 0.9,
    # L1 / L2 regularisation strengths.
    'lambda_l1': 1,
    'lambda_l2': 1,
}

folds = 4
seed = 666

# Folds are stratified on building_id and shuffled with a fixed seed.
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

# One booster per fold, kept in fold order.
models = []
for train_index, val_index in kf.split(train, train['building_id']):
    # Features / target for the training part of this fold.
    train_X = train[feat_cols].iloc[train_index]
    train_y = target.iloc[train_index]
    # Features / target for the held-out part of this fold.
    val_X = train[feat_cols].iloc[val_index]
    val_y = target.iloc[val_index]

    lgb_train = lgb.Dataset(train_X, train_y, categorical_feature=categoricals)
    lgb_eval = lgb.Dataset(val_X, val_y, categorical_feature=categoricals)

    # NOTE(review): recent LightGBM releases moved early stopping and logging
    # to callbacks — confirm the installed version still accepts
    # `early_stopping_rounds` / `verbose_eval` as keyword arguments.
    gbm = lgb.train(
        params,
        lgb_train,
        num_boost_round=500,
        valid_sets=(lgb_train, lgb_eval),
        early_stopping_rounds=100,
        verbose_eval=100,
    )
    models.append(gbm)

# Save each trained model, truncated at its best iteration.
for i, booster in enumerate(models):
    booster.save_model('lgb_{}.txt'.format(i), num_iteration=booster.best_iteration)

# Carregamento dos dados de teste (não precisa rodar)
A fazer: criar arquivo pickle para acelerar tempo de carregamento

Este passo é opcional, apenas caso queira enviar submissão para a competição, visto que os dados de teste não possuem os valores do target

In [None]:
import gc

# Release the training artefacts before the test data is loaded.
# The training cell is optional, so some of these names may not exist; drop
# whichever are present instead of failing with NameError on the first
# missing one (a plain `del a, b, ...` aborts part-way through).
for _name in ('train', 'train_X', 'val_X', 'lgb_train', 'lgb_eval',
              'train_y', 'val_y', 'target'):
    globals().pop(_name, None)
gc.collect()

In [None]:
def degToCompass(num):
    """Map a wind direction in degrees to one of 16 compass-sector indices.

    Sectors are 22.5 degrees wide. For non-negative angles, ``num / 22.5`` is
    rounded to the nearest sector (halves round up) and the result wraps
    around after sector 15. Negative inputs are truncated toward zero by
    ``int()`` before wrapping.

    Args:
        num: Direction in degrees; any value accepted by ``int()``.

    Returns:
        int: Sector index in the range 0..15.
    """
    sector = int(num / 22.5 + 0.5)
    # The modulo of a Python int by 16 is always in 0..15, so no lookup table is needed.
    return sector % 16

def average_imputation(df, column_name):
    """Fill missing values of a column with the mean of that column at the same timestamp.

    For every row where ``column_name`` is null, the value is replaced by the
    mean of the non-null values of ``column_name`` over all rows sharing that
    row's ``timestamp``. If every value at a timestamp is missing, the mean is
    NaN and the rows stay missing.

    Args:
        df: DataFrame with a ``timestamp`` column and the column to impute.
            It is modified in place.
        column_name: Name of the column whose missing values are filled.

    Returns:
        The same ``df`` object, for call-site convenience.
    """
    missing = df[column_name].isnull()
    # Nothing to impute: leave the frame untouched.
    if not missing.any():
        return df

    # transform('mean') broadcasts each timestamp's mean back onto its rows,
    # so the result is index-aligned with df and needs no per-row lookup.
    timestamp_mean = df.groupby('timestamp')[column_name].transform('mean')
    df.loc[missing, column_name] = timestamp_mean[missing]
    return df

In [None]:
from sklearn.preprocessing import LabelEncoder

# Attach the building metadata to every test row (left join keeps all test rows).
building_df = pd.read_csv("../input/ashrae-energy-prediction/building_metadata.csv")
test = pd.read_csv("../input/ashrae-energy-prediction/test.csv")
test = test.merge(building_df, on="building_id", how="left")
del building_df
gc.collect()

# TODO: this might go wrong — the encoder is fitted on the test frame here;
# verify the resulting codes match the ones used for the training data.
le = LabelEncoder()
test["primary_use"] = le.fit_transform(test["primary_use"])

# Attach the weather readings for each (site, timestamp) pair.
weather_test = pd.read_csv("../input/ashrae-energy-prediction/weather_test.csv")
test = test.merge(weather_test, on=["site_id", "timestamp"], how="left")
del weather_test
gc.collect()

In [None]:
# Calendar features derived from the timestamp.
test["timestamp"] = pd.to_datetime(test["timestamp"])
test["hour"] = test["timestamp"].dt.hour.astype(np.uint8)
test["weekday"] = test["timestamp"].dt.weekday.astype(np.uint8)

# Building features: years since 1900 and log-scaled floor area.
test['year_built'] = test['year_built']-1900
test['square_feet'] = np.log(test['square_feet'])

# Fill missing wind readings with the mean over all rows at the same timestamp.
test = average_imputation(test, 'wind_speed')
test = average_imputation(test, 'wind_direction')

# (scale, lower bound inclusive, upper bound exclusive) on wind_speed.
beaufort = [(0, 0, 0.3), (1, 0.3, 1.6), (2, 1.6, 3.4), (3, 3.4, 5.5), (4, 5.5, 8), (5, 8, 10.8), (6, 10.8, 13.9), 
          (7, 13.9, 17.2), (8, 17.2, 20.8), (9, 20.8, 24.5), (10, 24.5, 28.5), (11, 28.5, 33), (12, 33, 200)]

for scale, low, high in beaufort:
    test.loc[(test['wind_speed']>=low) & (test['wind_speed']<high), 'beaufort_scale'] = scale

# Convert degrees to a 16-sector index. This must be applied exactly once:
# feeding sector indices (0..15) back through degToCompass would treat them
# as degrees and collapse every value to 0 or 1.
# NOTE(review): confirm the training pickle encoded wind_direction the same way.
test['wind_direction'] = test['wind_direction'].apply(degToCompass)

# Compact integer dtypes for the encoded columns.
test['beaufort_scale'] = test['beaufort_scale'].astype(np.uint8)
test["wind_direction"] = test['wind_direction'].astype(np.uint8)
test["meter"] = test['meter'].astype(np.uint8)
test["site_id"] = test['site_id'].astype(np.uint8)

# Keep only the model inputs, in the model's column order.
test = test[feat_cols]

In [None]:
# Predict on the test set
from tqdm import tqdm

folds = 4
step_size = 50000
res = []
# Score the test frame in chunks of `step_size` rows. The chunk starts come
# from the same `step_size` used for slicing, so changing it cannot leave
# rows unscored or produce empty chunks.
for i in tqdm(range(0, test.shape[0], step_size)):
    chunk = test.iloc[i:i + step_size]
    # Average over however many models are loaded rather than a fixed count.
    mean_prediction = sum(model.predict(chunk) for model in models) / len(models)
    # expm1 inverts a log1p transform — presumably applied to the target when
    # the training pickle was built; TODO confirm.
    res.append(np.expm1(mean_prediction))

In [None]:
# Write the test predictions to the submission file
res = np.concatenate(res)
submission = pd.read_csv('/kaggle/input/ashrae-energy-prediction/sample_submission.csv')
submission['meter_reading'] = res
# Replace any negative prediction with zero.
is_negative = submission['meter_reading'] < 0
submission['meter_reading'] = submission['meter_reading'].mask(is_negative, 0)
submission.to_csv('submission.csv', index=False)
submission

# Carregamento do modelo salvo
Carrega modelo pré-treinado

In [None]:
import lightgbm as lgb
# One saved booster per fold, loaded in fold order (lgb_0.txt .. lgb_3.txt).
model_path_template = '/kaggle/input/ashrae-pickle/lgb_{}.txt'
models = [
    lgb.Booster(model_file=model_path_template.format(i))
    for i in range(4)
]

In [None]:
# Feature-importance visualisation
import seaborn as sns

# Build one small frame per fold model and concatenate them once, instead of
# growing a frame inside the loop starting from an empty DataFrame.
# NOTE(review): assumes each booster's feature order matches feat_cols — confirm.
df_fimp = pd.concat(
    [
        pd.DataFrame({
            "feature": feat_cols,
            "importance": booster.feature_importance(),
            "cv": cv,
        })
        for cv, booster in enumerate(models)
    ],
    axis=0,
)

# Each feature appears once per fold; barplot aggregates the repeated rows.
sns.barplot(x="importance", y="feature", data=df_fimp.sort_values(by="importance", ascending=False))

In [None]:
# Free the test frame if it exists. The test-loading cells are optional, so
# `test` may never have been defined; a bare `del test` would then abort this
# cell with NameError before the training data is reloaded.
try:
    del test
except NameError:
    pass
train = pd.read_pickle('/kaggle/input/ashrae-pickle/train.pickle')
target = pd.read_pickle('/kaggle/input/ashrae-pickle/target.pickle')

In [None]:
# Average the fold models' predictions for training rows 100..199.
# The rows are scored as a single DataFrame batch: one predict() call per
# model instead of one per row, and a 2-D frame rather than a 1-D Series
# (which is what `.iloc[i]` with an integer returns).
train_X = train[feat_cols].iloc[100:200]
predicted = sum(model.predict(train_X) for model in models) / len(models)

In [None]:
from matplotlib import pyplot as plt

# Actual target vs. averaged prediction for training rows 100..199,
# drawn against the frame's own index values.
sample_rows = slice(100, 200)
sample_index = train.iloc[sample_rows].index
plt.plot(sample_index, target.iloc[sample_rows])
plt.plot(sample_index, predicted)

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold

folds = 4
seed = 666

# Same splitter configuration (fold count, shuffle, seed) as the training cell.
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
# Index of the fold / model inspected by the following cell.
i = 0
# split() returns a generator, which cannot be indexed with [0];
# take its first (train_index, val_index) pair with next().
kf_split = kf.split(train, train['building_id'])
train_index, val_index = next(kf_split)
train_X = train[feat_cols].iloc[train_index]
val_y = target.iloc[val_index]

In [None]:
start = 230
size = 50
# Score the validation rows that val_y holds at the same positions, so both
# curves describe the same observations. Predicting on training-split rows
# here would compare unrelated rows.
window_rows = val_index[start:start+size]
prediction = models[i].predict(train[feat_cols].iloc[window_rows])
plt.plot(val_y.iloc[start:start+size].values)
plt.plot(prediction)
plt.legend(['Real', 'Predição'])