In [None]:
# load packages
import numpy as np 
import pandas as pd 
import lightgbm as lgb
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [None]:
# load data
# Every competition file is read from the same input directory; keeping that
# directory in one constant lets the notebook be pointed at another copy of
# the data by editing a single line.
DATA_DIR = "/kaggle/input/ashrae-energy-prediction"

Test = pd.read_csv(DATA_DIR + "/test.csv")
Weather_train = pd.read_csv(DATA_DIR + "/weather_train.csv")
Train = pd.read_csv(DATA_DIR + "/train.csv")
Weather_test = pd.read_csv(DATA_DIR + "/weather_test.csv")
Sample_submission = pd.read_csv(DATA_DIR + "/sample_submission.csv")
Building = pd.read_csv(DATA_DIR + "/building_metadata.csv")

In [None]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """
    Downcast the numeric columns of ``df`` to the narrowest dtype that holds
    their observed value range.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected. Only columns whose dtype is one of
        int16/int32/int64/float16/float32/float64 are touched; everything
        else (strings, int8, booleans, ...) is left as is.
    verbose : bool, default True
        When true, print the resulting memory footprint and the relative
        reduction.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    The float branch only checks the value *range*, not precision: a column
    that fits in float16 is stored as float16 even though that rounds the
    values. Do not run this on data whose low-order digits matter.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are tried from narrowest to widest; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's min/max is
                # still representable, so it must not force a wider dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Falls back to float64 when neither narrower float fits (this is
            # also what happens when min/max are NaN or infinite, because the
            # comparisons below are then false).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [None]:
# Shrink every loaded table (except the sample submission) before the joins.
# reduce_mem_usage works in place and returns its argument, so rebinding the
# names keeps them pointing at the same, now downcast, frames.
Test, Weather_train, Train, Weather_test, Building = [
    reduce_mem_usage(loaded_frame)
    for loaded_frame in (Test, Weather_train, Train, Weather_test, Building)
]

In [None]:
# Pair each building's metadata with its meter readings. A left join from
# Building keeps one output row per (building, reading) match.
Merge_build_train = pd.merge(Building, Train, how="left", on='building_id')

# Add the weather recorded for the building's site at the reading's timestamp.
Merge = pd.merge(
    Merge_build_train,
    Weather_train,
    how="left",
    on=["site_id", "timestamp"],
)

In [None]:
# Build the log-scaled target from Merge's own meter_reading column.
# Merge comes from Building.merge(Train, ...), which returns rows in a new
# order under a fresh index; assigning a Series derived from Train would align
# on index labels and attach each reading to an unrelated row.
Merge['meter_reading_log1p'] = np.log1p(Merge['meter_reading'])

# Parse the timestamp once, then derive the calendar features used by the model.
Merge.timestamp = pd.to_datetime(Merge.timestamp, format="%Y-%m-%d %H:%M:%S")
Merge["hour"] = Merge.timestamp.dt.hour
Merge["weekday"] = Merge.timestamp.dt.weekday

In [None]:
# One training frame per meter code (0 through 3); a separate model is fitted
# on each of them further down.
Merge_meter0, Merge_meter1, Merge_meter2, Merge_meter3 = (
    Merge[Merge.meter == meter_code] for meter_code in range(4)
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
def predict(Merge_meter):
    """
    Train a LightGBM regressor on the rows of one meter type.

    Despite the name, this function fits a model and returns it; it does not
    return predictions. The caller is expected to call ``.predict`` on the
    returned booster with a frame holding the same feature columns, in the
    same order, as ``columns`` below.

    Parameters
    ----------
    Merge_meter : pandas.DataFrame
        Training rows. Must contain every column listed in ``columns`` plus
        the ``meter_reading_log1p`` target.

    Returns
    -------
    lightgbm.Booster
        Model fitted on a random 75% of the rows, with the remaining 25% used
        as the validation set for early stopping.
    """
    columns = ['site_id','building_id','square_feet', 'year_built',
       'floor_count','air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'wind_speed', 'hour', 'weekday']
    X = Merge_meter[columns].astype(float)
    y = Merge_meter['meter_reading_log1p'].astype(float)
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

    # Features are deliberately NOT standardised. The fitted scaler was never
    # returned, so callers could only feed raw features to the model, which
    # would not match split thresholds learned on scaled values. Decision
    # trees split on thresholds and do not need scaled inputs, so training on
    # raw values keeps training and inference consistent.
    d_train = lgb.Dataset(x_train, label=y_train)
    d_test = lgb.Dataset(x_test, label=y_test)

    params = {
        'learning_rate': 0.003,
        'boosting_type': 'gbdt',
        'objective': 'regression',
        # RMSE on the log1p target; a classification metric does not apply here.
        'metric': 'rmse',
        'sub_feature': 0.5,
        'num_leaves': 40,
        'min_data': 50,
        'max_depth': 20,
    }

    # NOTE(review): newer LightGBM releases configure logging and early
    # stopping through callbacks instead of these keyword arguments - confirm
    # against the installed version.
    clf = lgb.train(params, train_set=d_train, num_boost_round=600, verbose_eval=200,valid_sets=d_test, early_stopping_rounds=200)

    return clf

In [None]:
# Fit one model per meter type, in meter order. The generator is consumed by
# the tuple unpacking, so the four trainings run one after another.
clf0, clf1, clf2, clf3 = (
    predict(meter_frame)
    for meter_frame in (Merge_meter0, Merge_meter1, Merge_meter2, Merge_meter3)
)

In [None]:
# The models are fitted; drop the training-side frames so their memory can be
# reclaimed before the test set is assembled.
del (
    Building, Merge_build_train, Merge,
    Merge_meter0, Merge_meter1, Merge_meter2, Merge_meter3,
)

In [None]:
# Building was deleted above, so read the metadata again and downcast it.
Building2 = reduce_mem_usage(
    pd.read_csv("/kaggle/input/ashrae-energy-prediction/building_metadata.csv")
)

# Same two joins as on the training side: metadata with the test rows, then
# the test-period weather by site and timestamp.
Merge_build_test = pd.merge(Building2, Test, how="left", on='building_id')
Merge2 = pd.merge(
    Merge_build_test,
    Weather_test,
    how="left",
    on=["site_id", "timestamp"],
)

In [None]:
# Parse the test timestamps and derive the same calendar features that the
# training frame carries.
Merge2["timestamp"] = pd.to_datetime(Merge2["timestamp"], format="%Y-%m-%d %H:%M:%S")
Merge2["hour"] = Merge2["timestamp"].dt.hour
Merge2["weekday"] = Merge2["timestamp"].dt.weekday

In [None]:
# One test frame per meter code (0 through 3), matching the per-meter models.
Merge2_meter0, Merge2_meter1, Merge2_meter2, Merge2_meter3 = (
    Merge2[Merge2.meter == meter_code] for meter_code in range(4)
)

In [None]:
# Feature columns handed to the models, listed once and reused for every meter.
FEATURE_COLUMNS = ['site_id','building_id','square_feet', 'year_built',
       'floor_count','air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'wind_speed', 'hour', 'weekday']

x_test0, x_test1, x_test2, x_test3 = (
    meter_frame[FEATURE_COLUMNS]
    for meter_frame in (Merge2_meter0, Merge2_meter1, Merge2_meter2, Merge2_meter3)
)

# The full merged test frame is not referenced again below.
del Merge2

In [None]:
# Score each meter's test rows with the model trained for that meter.
y2_pred0, y2_pred1, y2_pred2, y2_pred3 = (
    meter_model.predict(meter_features)
    for meter_model, meter_features in zip(
        (clf0, clf1, clf2, clf3),
        (x_test0, x_test1, x_test2, x_test3),
    )
)

In [None]:
# Keep only the identifying columns and attach each meter's predictions.
# Selecting first and using .assign builds a new frame each time, instead of
# writing a column into frames that were produced by boolean-filtering Merge2
# (which triggers pandas' chained-assignment warning). The resulting column
# order is row_id, building_id, meter, meter_reading.
Merge2_meter0 = Merge2_meter0[['row_id','building_id','meter']].assign(meter_reading=y2_pred0)
Merge2_meter1 = Merge2_meter1[['row_id','building_id','meter']].assign(meter_reading=y2_pred1)
Merge2_meter2 = Merge2_meter2[['row_id','building_id','meter']].assign(meter_reading=y2_pred2)
Merge2_meter3 = Merge2_meter3[['row_id','building_id','meter']].assign(meter_reading=y2_pred3)

In [None]:
# Stack the four per-meter prediction frames back into a single table.
Merge2_final = pd.concat(
    (Merge2_meter0, Merge2_meter1, Merge2_meter2, Merge2_meter3)
)

# The per-meter frames are redundant now that they have been combined.
del (Merge2_meter0, Merge2_meter1, Merge2_meter2, Merge2_meter3)

In [None]:
# The models predict log1p(meter_reading); expm1 maps the predictions back to
# the original scale. A regressor can output slightly negative values, which
# expm1 turns into negative readings, so floor the result at zero
# (consumption cannot be negative).
Merge2_final['meter_reading_real'] = np.expm1(Merge2_final['meter_reading']).clip(lower=0)

Merge2_final

In [None]:
# Align the predictions to the sample submission's row order, keep only the
# back-transformed reading, and give it the column name the submission uses.
submission = (
    Sample_submission
    .merge(Merge2_final, on=["row_id"], how="left")
    [["row_id", "meter_reading_real"]]
    .rename(columns={"meter_reading_real": "meter_reading"})
)

submission

In [None]:
# Write the predictions without passing them through reduce_mem_usage first:
# that helper picks float16 whenever the value range fits, which would round
# the final predictions just before they are saved. A fixed four-decimal
# format keeps the file compact without that rounding.
submission.to_csv("submission.csv", index=False, float_format="%.4f")

In [None]:
# Show a link to the written submission file in the cell output so it can be
# downloaded from the notebook.
from IPython.display import FileLink
FileLink(r'submission.csv')