In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import mean_absolute_error,mean_squared_error,accuracy_score
import matplotlib.pyplot as plt
from math import sqrt
import seaborn as sns
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

In [2]:
def calculateR2(y_test,y_pred):
    v = ((y_test - y_test.mean())**2).sum()
    u = ((y_test - y_pred)**2).sum()
    return 1-u/v

def calculateMetric(y_pred,y_test):
    print("MAE=",mean_absolute_error(y_test,y_pred))
    print("RMSE=",sqrt(mean_squared_error(y_test,y_pred)))
    print("R2 Score",calculateR2(y_test,y_pred))

def linearRegression(y_train,y_test,X_train_scaled,X_test_scaled,X_train,X_test): 
    simpleLinearRegressor = LinearRegression()
    simpleLinearRegressor.fit(X_train_scaled,y_train)
    print("Training Data");
    y_pred = simpleLinearRegressor.predict(X_train_scaled)
    accuracy(y_pred,X_train)
    calculateMetric(y_pred,y_train)
    y_pred = simpleLinearRegressor.predict(X_test_scaled)
    print("Testing Data");
    accuracy(y_pred,X_test)
    calculateMetric(y_pred,y_test)
    return simpleLinearRegressor
  


def lassoRegression(y_train,y_test,X_train_scaled,X_test_scaled,X_train,X_test):
    lassoRegressor = Lasso()
    alphas = [1000,1e5,1e7,1e9,1e10]
    max_iters = [250,500,1000,2000,5000]
    parameters = {"alpha":alphas,"max_iter":max_iters}
    reg = GridSearchCV(estimator=lassoRegressor,param_grid=parameters,cv=3,scoring="neg_root_mean_squared_error",n_jobs=-1,return_train_score=True)
    reg.fit(X_train_scaled,y_train)
    print("Model with best parameters :\n",reg.best_params_)
    train_auc = reg.cv_results_['mean_train_score'].reshape(len(alphas), len(max_iters))
    cv_auc = reg.cv_results_['mean_test_score'].reshape(len(alphas), len(max_iters))

    f, axes = plt.subplots(1, 2,figsize=(30,10))

    for i in range(2):
        title = train_auc if i == 0 else cv_auc
        sns.heatmap(title, xticklabels=alphas, yticklabels=max_iters, annot=True, ax=axes[i])
        axes[i].set_title(f"Grid search {'Train' if i==0 else 'CV'}")
        axes[i].set_xlabel("alpha")
        axes[i].set_ylabel("maximum_iterations")

def bestLassoRegressor(y_train,y_test,X_train_scaled,X_test_scaled,X_train,X_test):
    bestLasso = Lasso(max_iter=1000,alpha=100000)
    bestLasso.fit(X_train_scaled,y_train)
    print("Training Data")
    y_pred = bestLasso.predict(X_train_scaled)
    accuracy(y_pred,X_train)
    calculateMetric(y_pred,y_train)
    y_pred = bestLasso.predict(X_test_scaled)
    print("Testing Data")
    accuracy(y_pred,X_test)
    calculateMetric(y_pred,y_test)
    return bestLasso


def ridgeRegression(y_train,y_test,X_train_scaled,X_test_scaled,X_train,X_test):
    ridgeRegressor = Ridge()
    alphas = [1000,1e5,1e7,1e9,1e10]
    max_iters = [250,500,1000,2000,5000]
    parameters = {"alpha":alphas,"max_iter":max_iters}
    reg = GridSearchCV(estimator=ridgeRegressor,param_grid=parameters,cv=3,scoring="neg_root_mean_squared_error",n_jobs=-1,return_train_score=True)
    reg.fit(X_train_scaled,y_train)
    print("Model with best parameters :\n",reg.best_params_)
    train_auc = reg.cv_results_['mean_train_score'].reshape(len(alphas), len(max_iters))
    cv_auc = reg.cv_results_['mean_test_score'].reshape(len(alphas), len(max_iters))

    f, axes = plt.subplots(1, 2,figsize=(30,10))

    for i in range(2):
        title = train_auc if i == 0 else cv_auc
        sns.heatmap(title, xticklabels=alphas, yticklabels=max_iters, annot=True, ax=axes[i])
        axes[i].set_title(f"Grid search {'Train' if i==0 else 'CV'}")
        axes[i].set_xlabel("alpha")
        axes[i].set_ylabel("maximum_iterations")

def bestRidgeRegressor(y_train,y_test,X_train_scaled,X_test_scaled,X_train,X_test):
    bestRidge = Ridge(max_iter=250,alpha=1000)
    bestRidge.fit(X_train_scaled,y_train)
    print("Training Data")
    y_pred = bestRidge.predict(X_train_scaled)
    accuracy(y_pred,X_train)
    calculateMetric(y_pred,y_train)
    y_pred = bestRidge.predict(X_test_scaled)
    print("Testing Data")
    accuracy(y_pred,X_test)
    calculateMetric(y_pred,y_test)
    return bestRidge

def accuracy(y_pred,data):
    correct=0
    i=0
    for index,row in data.iterrows():
        if(y_pred[i]>=row['block_min'] and y_pred[i]<=row['past_max']):
            correct+=1
        i+=1
    acc = correct/data.shape[0]
    print("Accuracy", acc)
    return acc 

In [3]:
data = pd.read_csv("../Data/unscaled.csv")
del data['Unnamed: 0']
scaler = StandardScaler()
# data = data[:200000]
y = data['block_min']
X = data
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)
X_train_scaled = X_train.copy()
X_train_scaled = X_train_scaled.drop(['block_min'], axis=1)
X_test_scaled = X_test.copy()
X_test_scaled = X_test_scaled.drop(['block_min'], axis=1)
X_train_scaled[X_train_scaled.columns[:]] = scaler.fit_transform(X_train_scaled[X_train_scaled.columns[:]])
X_test_scaled[X_test_scaled.columns[:]] = scaler.fit_transform(X_test_scaled[X_test_scaled.columns[:]])

In [34]:
# print(df.head())
# lrmodel = linearRegression(y_train,y_test,X_train_scaled,X_test_scaled,X_train,X_test)
# lassoRegression(y_train,y_test,X_train_scaled,X_test_scaled,X_train,X_test)
# bestLassoRegressor(y_train,y_test,X_train_scaled,X_test_scaled,X_train,X_test)
# ridgeRegression(y_train,y_test,X_train_scaled,X_test_scaled,X_train,X_test)
# bestRidgeRegressor(y_train,y_test,X_train_scaled,X_test_scaled,X_train,X_test)

Training Data
Accuracy 0.5035393962274884
MAE= 3474882180.236948
RMSE= 5381574602.26646
R2 Score 0.4933011919620248
Testing Data
Accuracy 0.5053895792417482
MAE= 3480044804.6214204
RMSE= 5390157755.152437
R2 Score 0.5034793565755726


In [4]:
lrmodel = linearRegression(y_train,y_test,X_train_scaled,X_test_scaled,X_train,X_test)

Training Data
Accuracy 0.5033713153107493
MAE= 3463788347.413525
RMSE= 5372102326.350784
R2 Score 0.5006019127931023
Testing Data
Accuracy 0.5010121488024666
MAE= 3467271727.6328917
RMSE= 5412321018.006572
R2 Score 0.48645427739093905


In [5]:
lassomodel = bestLassoRegressor(y_train,y_test,X_train_scaled,X_test_scaled,X_train,X_test)

Training Data
Accuracy 0.5033739011710068
MAE= 3463792673.4665756
RMSE= 5372102336.336195
R2 Score 0.5006019109365871
Testing Data
Accuracy 0.5010211993133679
MAE= 3467274585.981293
RMSE= 5412320918.45978
R2 Score 0.4864542962818468


In [6]:
ridgemodel = bestRidgeRegressor(y_train,y_test,X_train_scaled,X_test_scaled,X_train,X_test)

Training Data
Accuracy 0.5034437193979601
MAE= 3463412911.6557345
RMSE= 5372107596.379643
R2 Score 0.5006009329742296
Testing Data
Accuracy 0.5011086875854142
MAE= 3466878125.0809517
RMSE= 5412392607.16137
R2 Score 0.48644069188954253


## Saving the models

In [7]:
import pickle
simpleName = '../weights/simpleLR.sav'
lassoName = '../weights/lassoLR.sav'
ridgeName = '../weights/ridgeLR.sav'

In [10]:
pickle.dump(lrmodel, open(simpleName, "wb"))
pickle.dump(lassomodel, open(lassoName, "wb"))
pickle.dump(ridgemodel, open(ridgeName, "wb"))

In [17]:
loaded_model = pickle.load(open(ridgeName, 'rb'))

In [18]:
y_pred = loaded_model.predict(X_test_scaled)
calculateMetric(y_pred, y_test)

MAE= 3466878125.0809517
RMSE= 5412392607.16137
R2 Score 0.48644069188954253
