In [1]:
import random 
random.seed(123)

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
import sklearn.metrics as skm
from sklearn.model_selection import train_test_split
import operator as op
import seaborn as sns


In [2]:
data = pd.read_csv("dataset.txt",sep=";")
data.head()

  data = pd.read_csv("dataset.txt",sep=";")


Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,16/12/2006,17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [3]:
data.isnull().sum()

Date                         0
Time                         0
Global_active_power          0
Global_reactive_power        0
Voltage                      0
Global_intensity             0
Sub_metering_1               0
Sub_metering_2               0
Sub_metering_3           25979
dtype: int64

In [4]:
missing = ['Sub_metering_3']
for i in missing:
    data.loc[data.loc[:,i].isnull(),i]=data.loc[:,i].mean()


In [5]:
data.isnull().sum()

Date                     0
Time                     0
Global_active_power      0
Global_reactive_power    0
Voltage                  0
Global_intensity         0
Sub_metering_1           0
Sub_metering_2           0
Sub_metering_3           0
dtype: int64

In [6]:
data['Date'] = data['Date'].astype(str)
data['Time'] = data['Time'].astype(str)
data.replace(['?', 'nan', np.nan], -1, inplace=True) 
num_col= ['Global_active_power', 'Global_reactive_power', 'Voltage', 
           'Global_intensity', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']
for i in num_col:
    data[i] = pd.to_numeric(data[i])

In [7]:
for i in num_col:
    mean_ = data.loc[:,i].mean()
    data.loc[data.loc[:,i] == -1,i]=mean_

In [8]:
data.head()

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,16/12/2006,17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [9]:
data.corr()

  data.corr()


Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
Global_active_power,1.0,0.247031,-0.397305,0.998888,0.484401,0.434569,0.638553
Global_reactive_power,0.247031,1.0,-0.110175,0.266119,0.123105,0.139225,0.089608
Voltage,-0.397305,-0.110175,1.0,-0.408948,-0.194863,-0.16644,-0.266718
Global_intensity,0.998888,0.266119,-0.408948,1.0,0.489298,0.440347,0.626542
Sub_metering_1,0.484401,0.123105,-0.194863,0.489298,1.0,0.054721,0.102571
Sub_metering_2,0.434569,0.139225,-0.16644,0.440347,0.054721,1.0,0.080872
Sub_metering_3,0.638553,0.089608,-0.266718,0.626542,0.102571,0.080872,1.0


In [10]:
#Dropping data whose year is <2006
data = data[pd.DatetimeIndex(data['Date']).year > 2006]

  data = data[pd.DatetimeIndex(data['Date']).year > 2006]


In [11]:
data.head()

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
21996,1/1/2007,00:00:00,2.58,0.136,241.97,10.6,0.0,0.0,0.0
21997,1/1/2007,00:01:00,2.552,0.1,241.75,10.4,0.0,0.0,0.0
21998,1/1/2007,00:02:00,2.55,0.1,241.64,10.4,0.0,0.0,0.0
21999,1/1/2007,00:03:00,2.55,0.1,241.71,10.4,0.0,0.0,0.0
22000,1/1/2007,00:04:00,2.554,0.1,241.98,10.4,0.0,0.0,0.0


In [12]:
models = {}
df1 = data
df1.head()

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
21996,1/1/2007,00:00:00,2.58,0.136,241.97,10.6,0.0,0.0,0.0
21997,1/1/2007,00:01:00,2.552,0.1,241.75,10.4,0.0,0.0,0.0
21998,1/1/2007,00:02:00,2.55,0.1,241.64,10.4,0.0,0.0,0.0
21999,1/1/2007,00:03:00,2.55,0.1,241.71,10.4,0.0,0.0,0.0
22000,1/1/2007,00:04:00,2.554,0.1,241.98,10.4,0.0,0.0,0.0


# Data Preprocessing

In [13]:
def preprocess(filename):

    data = pd.read_csv(filename, sep=';', parse_dates={'Datetime': ['Date', 'Time']}, infer_datetime_format=True, header=1,
                       names=['Date',
                              'Time',
                              'Active_Power',
                              'Reactive_Power',
                              'Voltage',
                              'Current',
                              'Meter_1',
                              'Meter_2',
                              'Meter_3'])

    # Cleans up data by removing null values with mean and dropping extra columns
    # Needed to save space
#     data = data.drop(columns=['Reactive_Power', 'Voltage', 'Current'])

    data['Active_Power'] = pd.to_numeric(data['Active_Power'], errors='coerce')
    data['Meter_1'] = pd.to_numeric(data['Meter_1'], errors='coerce')
    data['Meter_2'] = pd.to_numeric(data['Meter_2'], errors='coerce')
    data['Meter_3'] = pd.to_numeric(data['Meter_3'], errors='coerce')
    data['Voltage'] = pd.to_numeric(data['Voltage'], errors='coerce')
    data['Current'] = pd.to_numeric(data['Current'], errors='coerce')

    data['Active_Power'].fillna(data['Active_Power'].mean(), inplace=True)
    data['Meter_1'].fillna(data['Meter_1'].mean(), inplace=True)
    data['Meter_2'].fillna(data['Meter_2'].mean(), inplace=True)
    data['Meter_3'].fillna(data['Meter_3'].mean(), inplace=True)
    data['Voltage'].fillna(data['Voltage'].mean(), inplace=True)
    data['Current'].fillna(data['Current'].mean(), inplace=True)
    
    

    # Aggregate Power over an hour duration
    data.set_index('Datetime', inplace=True)
    data = data.resample('h').mean()

    # Split Datetime Column
    data['Year'] = data.index.year
    data['Month'] = data.index.month
    data['Day'] = data.index.day
    data['Hour'] = data.index.hour

    data.reset_index(inplace=True)
    data.drop(columns='Datetime', inplace=True)

    return data

In [14]:
dataset = "./dataset.txt"
data = preprocess(dataset)

  data = pd.read_csv(filename, sep=';', parse_dates={'Datetime': ['Date', 'Time']}, infer_datetime_format=True, header=1,
  data = data.resample('h').mean()


In [15]:
x = data.drop('Active_Power',axis=1)
y = data['Active_Power']

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, train_size = 0.7, random_state = 10)

# Linear Regression

In [17]:
def prediction(X_train,Y_train,X_test,Y_test):
    lin_reg = LinearRegression()
    lin_reg.fit(X_train,Y_train)
    pred = lin_reg.predict(X_test)
    rmse = skm.mean_squared_error(Y_test,pred,squared=False)
    r2 = skm.r2_score(Y_test,pred)
    return [rmse,r2]

In [18]:
models["linear regression"] = prediction(X_train,Y_train,X_test,Y_test)

# Linear Regression With Ridge

In [19]:
def ridge_pred(X_train,Y_train,X_test,Y_test):
    ridge_reg = Ridge(alpha=0.001,normalize=True)
    ridge_reg.fit(X_train,Y_train)
    pred = ridge_reg.predict(X_test)
    rmse = skm.mean_squared_error(Y_test,pred,squared=False)
    r2 = skm.r2_score(Y_test,pred)
    return [rmse,r2]

In [20]:
models["Ridge Regression"] = ridge_pred(X_train,Y_train,X_test,Y_test)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 


# Linear Regression With Lasso

In [21]:
def lasso_pred(X_train,Y_train,X_test,Y_test):
    lasso_reg = Lasso(alpha=0.1)
    lasso_reg.fit(X_train,Y_train)
    pred = lasso_reg.predict(X_test)
    rmse = skm.mean_squared_error(Y_test,pred,squared=False)
    r2 = skm.r2_score(Y_test,pred)
    return [rmse,r2]

In [22]:
models["Lasso regression"] = lasso_pred(X_train,Y_train,X_test,Y_test)

In [23]:
models_df = pd.DataFrame.from_dict(models,orient='index',columns=['Rmse','R_sq'])
models_df

Unnamed: 0,Rmse,R_sq
linear regression,0.023466,0.999288
Ridge Regression,0.023523,0.999285
Lasso regression,0.038635,0.998071


# SVR

In [48]:
def svr(X_train,Y_train,X_test,ker):
    #C=5,gamma=0.0009 1.719
    reg=SVR(kernel=ker,C=0.1,gamma=0.00001)
    reg.fit(X_train,Y_train)
    return np.asarray(reg.predict(X_test)).reshape(-1,1)

In [49]:
sc_X = StandardScaler()
sc_Y = StandardScaler()
X = sc_X.fit_transform(X_train)
Y = sc_Y.fit_transform(np.asarray(Y_train).reshape(-1,1))
X_test =sc_X.fit_transform(X_test)

Y_pred=svr(X,Y,X_test,'rbf')
Y_pred=sc_Y.inverse_transform(Y_pred)
rmse = skm.mean_squared_error(Y_test,Y_pred,squared=False)
r2 = skm.r2_score(Y_test,Y_pred)
print("rmse for svr: "+str(rmse))
print("r2 error for svr: "+str(r2))

# Y_pred=sc_Y.inverse_transform(svr(X,Y,X_test,'linear'))
# rmse = skm.mean_squared_error(y_test,y_pred,squared=False)
# r2 = skm.r2_score(y_test,y_pred)
# print("rmse for svr: "+str(rmse))
# print("r2 error for svr: "+str(r2))

  y = column_or_1d(y, warn=True)


rmse for svr: 0.8396008156024696
r2 error for svr: 0.08886010045250481


In [50]:
Y_pred=svr(X,Y,X_test,'linear')
Y_pred=sc_Y.inverse_transform(Y_pred)
rmse = skm.mean_squared_error(Y_test,Y_pred,squared=False)
r2 = skm.r2_score(Y_test,Y_pred)
print("rmse for svr: "+str(rmse))
print("r2 error for svr: "+str(r2))

  y = column_or_1d(y, warn=True)


rmse for svr: 0.03183179210797186
r2 error for svr: 0.9986903310201646


# Grid Search {'C': 1.5, 'epsilon': 0.1, 'gamma': 1e-07, 'kernel': 'linear'}

In [27]:
# parameters = {'kernel': ('linear', 'rbf'), 'C':[2e-6,2e-5,2e-4,2e-3,2e-2,2e-1,0,2e1,2e2,2e3],'gamma': [2e-6,2e-5,2e-4,2e-3,2e-2,2e-1,0,2e1,2e2,2e3,2e4,2e5],'epsilon':np.linspace(0.001, 10,20)}
# svr = SVR()
# clf = GridSearchCV(svr, parameters)
# clf.fit(X,Y)
# clf.best_params_

In [28]:
clf.best_params_

NameError: name 'clf' is not defined