Based on the following Kaggle kernel: https://www.kaggle.com/caesarlupum/ashrae-ligthgbm-simple-fe

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')
import gc


# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import matplotlib.patches as patches
from scipy import stats
from scipy.stats import skew

from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
# Use the fully qualified option key: the bare 'max_columns' pattern is matched
# against every registered option name and raises OptionError on pandas
# versions where more than one option contains it.
pd.set_option('display.max_columns', 100)

py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

import os,random, math, psutil, pickle
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm import tqdm

In [2]:
# Directory holding the competition CSV files; the commented line is the
# alternative path for running on Kaggle.
root = '../../../data/ashrae-energy-prediction/'
#root = '../input/ashrae-energy-prediction/'

# Explicit compact dtypes so the large CSV files are not loaded as 64-bit columns.
df_train_dtypes = {'building_id': np.uint16, 'meter': np.uint8, 'meter_reading': np.float32}
df_test_dtypes = {'building_id': np.uint16, 'meter': np.uint8}
df_building_metadata_dtypes = {'site_id': np.uint8, 'building_id': np.uint16, 'square_feet': np.int32, 'year_built': np.float32, 'floor_count': np.float32}
df_weather_dtypes = {'site_id': np.uint8, 'air_temperature': np.float32, 'cloud_coverage': np.float32, 'dew_temperature': np.float32,
                     'precip_depth_1_hr': np.float32, 'sea_level_pressure': np.float32, 'wind_direction': np.float32, 'wind_speed': np.float32}

df_train = pd.read_csv(root+'train.csv', dtype=df_train_dtypes)
# Read the test file with its own dtype map (df_test_dtypes, which omits the
# target column) instead of reusing the training map.
df_test = pd.read_csv(root+'test.csv', dtype=df_test_dtypes)
df_building_metadata = pd.read_csv(root+'building_metadata.csv', dtype=df_building_metadata_dtypes)
df_weather_train = pd.read_csv(root+'weather_train.csv', dtype=df_weather_dtypes)
df_weather_test = pd.read_csv(root+'weather_test.csv', dtype=df_weather_dtypes)

# row_id is not needed for the merges below, so drop it before the frame grows.
df_test.drop(columns=['row_id'], inplace=True)

# Left joins keep every meter row: building metadata is attached by building_id,
# then the weather readings by (site_id, timestamp).
df_train = df_train.merge(df_building_metadata, on='building_id', how='left')
df_train = df_train.merge(df_weather_train, on=['site_id', 'timestamp'], how='left')
df_test = df_test.merge(df_building_metadata, on='building_id', how='left')
df_test = df_test.merge(df_weather_test, on=['site_id', 'timestamp'], how='left')

# The lookup tables are no longer needed once merged; release them.
del df_building_metadata, df_weather_train, df_weather_test
gc.collect()

print('Training Set Shape = {}'.format(df_train.shape))
print('Test Set Shape = {}'.format(df_test.shape))
print('Training Set Memory Usage = {:.2f} MB'.format(df_train.memory_usage().sum() / 1024**2))
print('Test Set Memory Usage = {:.2f} MB'.format(df_test.memory_usage().sum() / 1024**2))

Training Set Shape = (20216100, 16)
Test Set Shape = (41697600, 15)
Training Set Memory Usage = 1388.13 MB
Test Set Memory Usage = 2704.08 MB


In [3]:
# Load the sample submission file from the same data directory as the other CSVs.
sample_submission = pd.read_csv(root + 'sample_submission.csv')

In [4]:
# Parse the training timestamps from text into datetime values using an
# explicit format string.
timestamp_format = '%Y-%m-%d %H:%M:%S'
df_train["timestamp"] = pd.to_datetime(df_train["timestamp"], format=timestamp_format)

In [5]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only columns whose dtype is int16/int32/int64 or float16/float32/float64 are
    considered; every other column (unsigned integers, int8, objects, datetimes,
    ...) is left untouched. A column is never widened: it is cast only when the
    selected dtype is strictly smaller than its current one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    verbose : bool, default True
        Print the resulting memory usage and the percentage saved.
    use_float16 : bool, default True
        Allow float columns to be stored as float16. float16 keeps only about
        three significant decimal digits, so pass False when that precision
        loss is not acceptable; float32 is then the smallest float target.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_types = (np.int8, np.int16, np.int32, np.int64)
    if use_float16:
        float_types = (np.float16, np.float32, np.float64)
    else:
        float_types = (np.float32, np.float64)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # Empty or all-NaN columns have no range to fit; leave them as they are
        # rather than letting failed comparisons push them to a wider dtype.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        if str(col_type)[:3] == 'int':
            candidates, limits_of = int_types, np.iinfo
        else:
            candidates, limits_of = float_types, np.finfo

        # Candidates are ordered narrowest first, so the first one whose
        # (inclusive) limits contain the column range is the best fit.
        for candidate in candidates:
            limits = limits_of(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [6]:
# Shrink the numeric columns of both frames (train first, then test).
df_train, df_test = reduce_mem_usage(df_train), reduce_mem_usage(df_test)

Mem. usage decreased to 1041.10 Mb (25.0% reduction)
Mem. usage decreased to 1988.30 Mb (26.5% reduction)


In [7]:
# Replace missing values in these columns with the sentinel -999 and store the
# result as int16, applying the same treatment to train and test.
for sentinel_col in ('floor_count', 'year_built', 'cloud_coverage'):
    df_train[sentinel_col] = df_train[sentinel_col].fillna(-999).astype(np.int16)
    df_test[sentinel_col] = df_test[sentinel_col].fillna(-999).astype(np.int16)

In [8]:
# Columns handed to LightGBM as categorical features.
# ("hour" and "weekday" are not included.)
categoricals = [
    "site_id",
    "building_id",
    "primary_use",
    "meter",
    "wind_direction",
]

# Weather columns removed from the training frame.
drop_cols = [
    "sea_level_pressure",
    "wind_speed",
]

# Columns treated as plain numeric features.
numericals = [
    "square_feet",
    "year_built",
    "air_temperature",
    "cloud_coverage",
    "dew_temperature",
    'precip_depth_1_hr',
    'floor_count',
]

# Full model input: categorical columns first, then numeric ones.
feat_cols = [*categoricals, *numericals]

In [None]:
# The model is trained on log1p(meter_reading); pop() returns the column and
# removes it from the training frame in one step.
target = np.log1p(df_train.pop("meter_reading"))

# Remove the columns listed in drop_cols from the training frame.
df_train = df_train.drop(columns=drop_cols)

In [None]:
# Fit ONE encoder on the labels seen in either frame and use it for both, so a
# given primary_use string always maps to the same integer code. Re-fitting on
# the test frame could assign different codes whenever its set of categories
# differs from the training set.
le = LabelEncoder()
le.fit(np.concatenate([df_train['primary_use'].unique(), df_test['primary_use'].unique()]))
df_train['primary_use'] = le.transform(df_train['primary_use']).astype(np.int8)
df_test['primary_use'] = le.transform(df_test['primary_use']).astype(np.int8)

In [None]:
# LightGBM hyper-parameters shared by every fold.
# 'subsample_freq' is an alias of 'bagging_freq'; the duplicate alias entry
# (subsample_freq=1) conflicted with bagging_freq=5 and has been removed.
# NOTE(review): no bagging_fraction/subsample < 1 is set, so bagging_freq
# presumably has no effect as configured - confirm against the LightGBM docs.
params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': {'rmse'},
            'learning_rate': 0.3,
            'bagging_freq': 5,
            'num_leaves': 330,
            'feature_fraction': 0.9,
            'lambda_l1': 1,  
            'lambda_l2': 1
            }

folds = 5
seed = 666
shuffle = False
# random_state only has meaning when shuffling; newer scikit-learn releases
# raise a ValueError if it is set while shuffle=False, so pass it conditionally.
kf = KFold(n_splits=folds, shuffle=shuffle, random_state=seed if shuffle else None)

# One booster per fold, collected in fold order.
models = []
# KFold only needs the number of rows (it ignores y), so split on the frame
# itself instead of building an extra copy of the feature columns.
for train_index, val_index in kf.split(df_train):
    train_X = df_train[feat_cols].iloc[train_index]
    val_X = df_train[feat_cols].iloc[val_index]
    train_y = target.iloc[train_index]
    val_y = target.iloc[val_index]
    lgb_train = lgb.Dataset(train_X, train_y, categorical_feature=categoricals)
    lgb_eval = lgb.Dataset(val_X, val_y, categorical_feature=categoricals)
    # Both sets are passed as valid_sets; training stops early after 50 rounds
    # without improvement, and metrics are logged every 50 rounds.
    gbm = lgb.train(params,
                lgb_train,
                num_boost_round=500,
                valid_sets=(lgb_train, lgb_eval),
                early_stopping_rounds=50,
                verbose_eval = 50)
    print(train_index)
    models.append(gbm)


Using categorical_feature in Dataset.

