In [None]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection  import train_test_split
import numpy as np
from scipy.stats import norm # for scientific Computing
from scipy import stats, integrate
import matplotlib.pyplot as plt

In [None]:
# Load the raw competition tables.
# DATA_DIR is the single place to edit when the CSV files live somewhere else.
DATA_DIR = '../input/ashrae-energy-prediction'

ASHRAE_test = pd.read_csv(DATA_DIR + '/test.csv')
ASHRAE_train = pd.read_csv(DATA_DIR + '/train.csv')
weather_train = pd.read_csv(DATA_DIR + '/weather_train.csv')
building_meta = pd.read_csv(DATA_DIR + '/building_metadata.csv')
weather_test = pd.read_csv(DATA_DIR + '/weather_test.csv')

In [None]:
# Print a structural summary (columns, dtypes, memory usage) of the raw training readings.
ASHRAE_train.info()

In [None]:
# Print a structural summary (columns, dtypes, memory usage) of the training weather table.
weather_train.info()

In [None]:
## Function to reduce the DF size
def reduce_memory_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype that fits.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched; every other column (object, datetime, category, ...) is left
    alone. The dtype is chosen from the column's observed minimum and maximum.
    The bounds are inclusive, so a column whose values reach exactly the limit
    of a dtype (e.g. -128..127) is still stored in that dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        Print the final size in MB and the percentage saved.
    use_float16 : bool, default True
        Allow float columns to be stored as float16. Half precision keeps only
        about three significant decimal digits, so pass ``False`` to stop at
        float32 when the values are used in later arithmetic.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, to allow ``df = reduce_memory_usage(df)``.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Narrowest integer type first; an empty column has NaN min/max,
            # matches nothing and therefore keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte frame reports 0% instead of NaN.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Downcast every loaded table. reduce_memory_usage returns the frame it was
# given, so the result is bound back to the same name explicitly; this also
# keeps the cell from rendering a whole DataFrame as its output.
building_meta = reduce_memory_usage(building_meta)
weather_train = reduce_memory_usage(weather_train)
ASHRAE_train = reduce_memory_usage(ASHRAE_train)

weather_test = reduce_memory_usage(weather_test)
ASHRAE_test = reduce_memory_usage(ASHRAE_test)

In [None]:
# Summary statistics of the numeric columns in the training readings.
ASHRAE_train.describe()

In [None]:
# Report the shape of each training-side table.
shape_report = [
    ('Size of the building dataset is', building_meta),
    ('Size of the weather_train dataset is', weather_train),
    ('Size of the train dataset is', ASHRAE_train),
]
for message, table in shape_report:
    print(message, table.shape)

In [None]:
# Summary statistics of the numeric columns in the training readings.
# NOTE(review): this repeats an earlier ASHRAE_train.describe() cell; one of the two can go.
ASHRAE_train.describe()

In [None]:
# Summary statistics of the numeric columns in the building metadata.
building_meta.describe()

In [None]:
# Inspect the building-usage column: how many distinct values it has, and which ones.
primary_use_column = building_meta['primary_use']
primary_use_numbersOfUniqueValue = primary_use_column.nunique()
primary_use_element = primary_use_column.unique()

print('Number of unique values in column "primary_use" of the building_meta : ')
print(primary_use_numbersOfUniqueValue)

print('Unique element in column "primary_use" of the building_meta : ')
print(primary_use_element)

In [None]:
# Report the column labels of each training-side table.
columns_report = [
    ('Columns of the building dataset is', building_meta),
    ('Columns of the weather_train dataset is', weather_train),
    ('Columns of the train dataset is', ASHRAE_train),
]
for message, table in columns_report:
    print(message, table.columns)

In [None]:
# Visualise where building_meta has nulls: one row per building, one column per field.
# The axes are passed to seaborn explicitly so the figure created here is the one drawn on.
fig, ax = plt.subplots(figsize=(15, 7))
sns.heatmap(building_meta.isnull(), yticklabels=False, cmap='viridis', ax=ax)
ax.set_title('Missing values in building_meta');

In [None]:
print("Percentage of missing values in the building_meta dataset")
# Null count per column, expressed as a share of all rows.
building_missing_counts = building_meta.isna().sum()
building_missing_counts / len(building_meta) * 100


In [None]:
print("Percentage of missing values in the train dataset")
# Null count per column, expressed as a share of all rows.
train_missing_counts = ASHRAE_train.isna().sum()
train_missing_counts / len(ASHRAE_train) * 100

In [None]:
# Attach the meter readings to the building metadata.
# A left join with building_meta on the left keeps every building row.
BuildingTrainMerge = pd.merge(building_meta, ASHRAE_train, how='left', on='building_id')
BuildingTrainMerge.shape

In [None]:
# Add the weather observation recorded for the same site and timestamp.
# Rows without a matching weather record are kept (left join).
BTW_train = pd.merge(
    BuildingTrainMerge,
    weather_train,
    how='left',
    on=['site_id', 'timestamp'],
)
BTW_train.shape

In [None]:
# Column labels of the merged building + readings + weather frame.
BTW_train.columns

In [None]:
print("Percentage of missing values in the BTW_train dataset")
# Null count per column, expressed as a share of all rows.
merged_missing_counts = BTW_train.isna().sum()
merged_missing_counts / len(BTW_train) * 100

In [None]:
# Histogram and summary statistics of sea-level pressure in the merged frame.
BTW_train.hist(column='sea_level_pressure')
BTW_train.loc[:, ['sea_level_pressure']].describe()

In [None]:
# Histogram and summary statistics of cloud coverage in the merged frame.
BTW_train.hist(column='cloud_coverage')
BTW_train.loc[:, ['cloud_coverage']].describe()

In [None]:
# Histogram and summary statistics of hourly precipitation depth in the merged frame.
BTW_train.hist(column='precip_depth_1_hr')
BTW_train.loc[:, ['precip_depth_1_hr']].describe()

In [None]:
# Histogram and summary statistics of wind speed in the merged frame.
BTW_train.hist(column='wind_speed')
BTW_train.loc[:, ['wind_speed']].describe()


In [None]:
# Histogram and summary statistics of air temperature in the merged frame.
BTW_train.hist(column=['air_temperature'])
BTW_train.loc[:, ['air_temperature']].describe()

In [None]:
# Box plot of the raw meter readings, one box per meter code.
sns.boxplot(x = 'meter', y = 'meter_reading', data = BTW_train)

In [None]:
def outlier_function(df, col_name):
    """Count extreme outliers in one column using a 3 x IQR fence.

    The fence lies three interquartile ranges below the first quartile and
    above the third quartile, so only far-out values are flagged. Missing
    values are ignored when the quartiles are computed and are never counted
    as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    col_name : str
        Label of a numeric column in ``df``.

    Returns
    -------
    tuple
        ``(lower_limit, upper_limit, outlier_count)`` where the limits are
        floats and the count is an ``int``.
    """
    # One float64 array instead of a Python list: the quartiles and the count
    # are then computed vectorised rather than in a per-row Python loop.
    values = df[col_name].to_numpy(dtype=float)

    first_quartile, third_quartile = np.nanpercentile(values, [25, 75])
    IQR = third_quartile - first_quartile

    upper_limit = third_quartile + (3 * IQR)
    lower_limit = first_quartile - (3 * IQR)

    # NaN compares False against both limits, so missing values add nothing.
    with np.errstate(invalid='ignore'):
        outside_fence = (values < lower_limit) | (values > upper_limit)
    outlier_count = int(outside_fence.sum())
    return lower_limit, upper_limit, outlier_count

In [None]:
# Share of meter readings that fall outside the limits returned by outlier_function.
n_outliers = outlier_function(BTW_train, 'meter_reading')[2]
outlier_pct = (100 * n_outliers) / len(BTW_train['meter_reading'])
print("{} percent of {} are outliers.".format(outlier_pct, 'meter_reading'))

In [None]:
# Distribution of log(1 + meter_reading) for each meter type.
# Zero readings are NOT removed; log1p maps them to 0.

# Meter code -> display name. NOTE(review): codes follow the competition's
# data description (0 electricity, 1 chilled water, 2 steam, 3 hot water) - confirm.
METER_NAMES = {0: 'electricity', 1: 'chilled water', 2: 'steam', 3: 'hot water'}

plt.figure(figsize=(12,10))

# Meter codes actually present, ascending; missing codes are dropped so they
# cannot disturb the sort or produce an empty curve.
meters = sorted(BTW_train['meter'].dropna().unique().tolist())

# One density curve per meter. Each curve carries its own label, so the legend
# stays correct even when a meter type is absent from the data.
for meter_type in meters:
    subset = BTW_train[BTW_train['meter'] == meter_type]
    sns.kdeplot(np.log1p(subset["meter_reading"]),
                label=METER_NAMES.get(meter_type, str(meter_type)), linewidth=2)

# Title, legend and axis labels
plt.ylabel("Density")
plt.xlabel("log(1 + meter_reading)")
plt.legend()
plt.title("Density of Logarithm(Meter Reading + 1) Among Different Meters", size=14);

In [None]:
# Column labels of the merged training frame.
BTW_train.columns

In [None]:
# Pairwise correlation of the numeric columns.
# The frame also holds text columns (e.g. primary_use, timestamp). Recent pandas
# versions no longer drop those silently in DataFrame.corr(), so the numeric
# columns are selected explicitly.
corrmat = BTW_train.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corrmat, annot=True, annot_kws={'size': 12}, ax=ax);

In [None]:
# Remove columns that are not used further, then derive calendar features.
unused_columns = ['year_built', 'floor_count', 'wind_direction', 'dew_temperature']
BTW_train = BTW_train.drop(unused_columns, axis=1)

# Parse the timestamp once and read month / day-of-month through the .dt accessor.
BTW_train['timestamp'] = pd.to_datetime(BTW_train['timestamp'])
BTW_train['Month'] = BTW_train['timestamp'].dt.month
BTW_train['Day'] = BTW_train['timestamp'].dt.day

In [None]:
# Collapse the readings to one row per meter / building / primary use / month / day:
# meter readings are summed, weather features and floor area are averaged.
group_keys = ['meter', 'building_id', 'primary_use', 'Month', 'Day']
daily_aggregations = {
    'meter_reading': 'sum',
    'air_temperature': 'mean',
    'wind_speed': 'mean',
    'precip_depth_1_hr': 'mean',
    'cloud_coverage': 'mean',
    'square_feet': 'mean',
}
BTW_train = BTW_train.groupby(group_keys).agg(daily_aggregations)


In [None]:
# Column labels after the aggregation; the group-by keys are in the row index at this point.
BTW_train.columns

In [None]:
# Move the group-by keys out of the row index and back into ordinary columns.
# NOTE(review): running this cell a second time would add a spurious 'index' column.
BTW_train = BTW_train.reset_index()

In [None]:
# Summary statistics of the aggregated training frame.
BTW_train.describe()

In [None]:
# Store the aggregated weather features as float32.
for weather_col in ('wind_speed', 'air_temperature', 'precip_depth_1_hr', 'cloud_coverage'):
    BTW_train[weather_col] = BTW_train[weather_col].astype('float32')

In [None]:
# Fill the gaps left in the weather features.
# Each result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection works on a temporary object and does not update the
# frame under pandas' Copy-on-Write behaviour, and fillna(method=...) is
# deprecated in favour of ffill() / bfill().
# NOTE(review): ffill / bfill follow the frame's current row order, so a value
# can be carried across a building boundary - confirm this is intended.
BTW_train['precip_depth_1_hr'] = BTW_train['precip_depth_1_hr'].ffill()
BTW_train['cloud_coverage'] = BTW_train['cloud_coverage'].bfill()

BTW_train['wind_speed'] = BTW_train['wind_speed'].fillna(BTW_train['wind_speed'].mean())
BTW_train['air_temperature'] = BTW_train['air_temperature'].fillna(BTW_train['air_temperature'].mean())

# Remaining nulls per column (ffill cannot fill leading gaps, bfill cannot fill trailing ones).
BTW_train.isnull().sum()

In [None]:
# Shape of the aggregated, imputed training frame.
BTW_train.shape

In [None]:
# Column dtypes of the aggregated, imputed training frame.
BTW_train.dtypes

In [None]:
# Column labels available for one-hot encoding and modelling.
BTW_train.columns

In [None]:
# One-hot encode the building usage category.
# The indicator dtype is pinned to uint8: recent pandas versions default to
# bool, and a design matrix that mixes bool and float columns becomes an
# object array when handed to statsmodels.
BTW_linearR = pd.get_dummies(BTW_train, columns=['primary_use'], dtype=np.uint8)

In [None]:
# Column labels after one-hot encoding, including the generated primary_use_* indicators.
BTW_linearR.columns

In [None]:
# Assemble the design matrix: building / weather features, every primary_use_*
# indicator that get_dummies produced, then the calendar features.
# The indicator columns are discovered from the frame instead of being typed
# out, so a usage category missing from the data no longer raises a KeyError.
base_features = ['building_id', 'meter', 'air_temperature', 'wind_speed',
                 'precip_depth_1_hr', 'cloud_coverage', 'square_feet']
primary_use_dummies = [col for col in BTW_linearR.columns
                       if col.startswith('primary_use_')]
feature_columns = base_features + primary_use_dummies + ['Month', 'Day']
X = BTW_linearR[feature_columns]

# Create target variable
y = BTW_linearR['meter_reading']

# Train, test, split: hold out 20% of the rows, with a fixed seed for reproducibility.
# train_test_split is already imported in the notebook's first cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=0)

In [None]:
# Ordinary least-squares fit on the training split.
# LinearRegression is already imported in the notebook's first cell.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [None]:
# Predict meter readings for the held-out split with the fitted linear model.
y_pred = regressor.predict(X_test)

In [None]:
# LinearRegression.score returns the coefficient of determination (R^2), not a
# classification accuracy; the original '%d' placeholder was also never
# interpolated and was printed literally.
print('R^2 on the test split: {:.4f}'.format(regressor.score(X_test, y_test)))

In [None]:
# R^2 and explained variance are different metrics (they agree only when the
# residuals average to zero), so each is reported under its own name.
print('R^2 =', metrics.r2_score(y_test, y_pred))
print('Explained variance =', metrics.explained_variance_score(y_test, y_pred))

In [None]:
# One row per feature with its fitted regression coefficient.
cdf = pd.DataFrame({'Coefficients': regressor.coef_}, index=X.columns)
cdf

In [None]:
# Horizontal bar chart of the ten largest coefficients.
# NOTE(review): nlargest ranks signed values, so strongly negative coefficients never show up here.
cdf.Coefficients.nlargest(10).plot(kind='barh')

In [None]:
import statsmodels.api as sm

# Fit the same linear model with statsmodels to get its regression summary table.
# X and y are the design matrix and target already built for the scikit-learn
# model; they are reused here rather than repeating the feature list.
# The design matrix is cast to float so that indicator columns stored as bool
# do not turn it into an object array, which OLS rejects.
# NOTE(review): the constant plus the full set of primary_use_* indicators is
# collinear (dummy-variable trap) - consider dropping one indicator.
X2 = sm.add_constant(X.astype(float))
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())