In [64]:
import pandas as pd
from scipy.stats.stats import pearsonr
from scipy import stats
import statsmodels.formula.api as smf
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt

gas_fname = "/vagrant/scripts/gas.csv"
elec_fname = "/vagrant/scripts/elec_climatezone.csv"

def read_data(fname, energy_type):
    df = pd.read_csv(fname)
    if energy_type == 'gas':
        df = df[['heating_coefficient',
                 'intercept_coefficient',
                 'project_climate_zone',
                 'natural_gas_savings_thm']]
    else:
        data = df[['heating_coefficient',
                   'cooling_coefficient',
                   'intercept_coefficient',
                   'project_climate_zone',
                   'electricity_savings_kwh']]
    return df

def train_test_split(df, train_frac=0.8):
    
    mask = np.random.rand(len(df)) < train_frac
    train = df[mask]
    test = df[~mask]
    return train, test

class SavingPrediction:
    def __init__(self, energy_type, 
                 include_climate_zone=False,
                 response_var_name=None, 
                 formula=None):
        self.energy_type = energy_type
        self.response_var_name = response_var_name
        if formula:
            self.formula = formula
        elif energy_type == 'gas':
            self.formula = "natural_gas_savings_thm ~ heating_coefficient + intercept_coefficient"
            self.response_var_name = 'natural_gas_savings_thm'
        elif energy_type == 'electricity':
            self.formula = "electricity_savings_kwh ~ cooling_coefficient +\
            heating_coefficient + intercept_coefficient"
            self.response_var_name = 'electricity_savings_kwh'
        else:
            raise ValueError("")
        if not formula and include_climate_zone:
            self.formula = formula + " + project_climate_zone"
            
        self.fitted_model = None
        self.model_obj = None
        
    def rmse_train(self, fitted_model_ols):
        return np.sqrt(fitted_model_ols.ssr/fitted_model_ols.nobs)

    def pred(self, df):
        pred = self.fitted_model.predict(df)
        # Series
        return pred
        
    def out_of_sample_stats(self, df, response_var_name=None):
        prediction = self.pred(df)
        if response_var_name is None:
            reponse_var_name = self.response_var_name
        actual = df[reponse_var_namue]
        rmse = sqrt(mean_squared_error(prediction, actual))
        
        savings = 0.0
        for idx, value in enumerate(prediction):
            if value > 0.0 and actual.iloc[idx] > 0.0:
                savings = savings + 1.0
                
        savings_precision = savings / len(df)
        print (savings, ' ', len(df))
        return { 'rmse': rmse,
                 'savings_precision' : savings_precision }
    
    
    def fit(self, data_frame):
        ols_model = smf.ols(formula=self.formula, data=data_frame)
        self.fitted_model = ols_model.fit()
        self.model_obj = ols_model


In [66]:
df = read_data(elec_fname, energy_type='electricity')
df = df.dropna()
df = df.drop_duplicates()
df = df[df.electricity_savings_kwh > 0.0]
train_df, test_df = train_test_split(df)
from math import sqrt

model = SavingPrediction(energy_type='electricity')
model.fit(train_df)
print(model.out_of_sample_stats(test_df))

test_df['electricity_savings_kwh'].mean()

# print(model.fitted_model.params)
# print(model.rmse_train(model.fitted_model))
print(model.fitted_model.summary())
#print pd.DataFrame({
#    'actual' : test_df['electricity_savings_kwh'],
#    'predicted' : model.pred(test_df)
#})

(222.0, ' ', 222)
{'savings_precision': 1.0, 'rmse': 901.5211404478257}
                               OLS Regression Results                              
Dep. Variable:     electricity_savings_kwh   R-squared:                       0.287
Model:                                 OLS   Adj. R-squared:                  0.284
Method:                      Least Squares   F-statistic:                     118.4
Date:                     Fri, 27 Oct 2017   Prob (F-statistic):           1.93e-64
Time:                             02:55:14   Log-Likelihood:                -7410.0
No. Observations:                      887   AIC:                         1.483e+04
Df Residuals:                          883   BIC:                         1.485e+04
Df Model:                                3                                         
Covariance Type:                 nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975