In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import mean_absolute_error,mean_squared_error,accuracy_score
import matplotlib.pyplot as plt
from math import sqrt
import seaborn as sns
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

In [17]:
def calculateR2(y_test,y_pred):
    v = ((y_test - y_test.mean())**2).sum()
    u = ((y_test - y_pred)**2).sum()
    return 1-u/v

def calculateMetric(y_pred,y_test):
    print("MAE=",mean_absolute_error(y_test,y_pred))
    print("RMSE=",sqrt(mean_squared_error(y_test,y_pred)))
    print("R2 Score",calculateR2(y_test,y_pred))

def accuracy(y_pred,data):
    correct=0
    i=0
    for index,row in data.iterrows():
        if(y_pred[i]>=row['block_min'] and y_pred[i]<=row['past_max']):
            correct+=1
        i+=1
    acc = correct/data.shape[0]
    print("Accuracy", acc)
    return acc 

def simpleRandomForest(y_train,y_test,X_train_scaled,X_test_scaled,X_train,X_test): 
    regr = RandomForestRegressor()
    regr.fit(X_train_scaled,y_train)
    print("Training")
    y_pred = regr.predict(X_train_scaled)
    accuracy(y_pred,X_train)
    calculateMetric(y_pred,y_train)
    print("Testing")
    y_pred = regr.predict(X_test_scaled)
    calculateMetric(y_pred,y_test)
    accuracy(y_pred,X_test)
    return regr

def RFgridSearchCV(y_train,y_test,X_train_scaled,X_test_scaled,X_train,X_test):
    regr = RandomForestRegressor()
    estimators = [50,75,100,150]
    max_depths = [3,6,9,12]
    parameters = {"n_estimators":estimators,"max_depth":max_depths}
    reg = GridSearchCV(estimator=regr,param_grid=parameters,cv=3,scoring="neg_root_mean_squared_error",n_jobs=-1,return_train_score=True)
    reg.fit(X_train_scaled,y_train)
    print("Model with best parameters :\n",reg.best_params_)
    train_auc = reg.cv_results_['mean_train_score'].reshape(len(estimators), len(max_depths))
    cv_auc = reg.cv_results_['mean_test_score'].reshape(len(estimators), len(max_depths))

    f, axes = plt.subplots(1, 2,figsize=(30,10))

    for i in range(2):
        title = train_auc if i == 0 else cv_auc
        sns.heatmap(title, xticklabels=estimators, yticklabels=max_depths, annot=True, ax=axes[i])
        axes[i].set_title(f"Grid search {'Train' if i==0 else 'CV'}")
        axes[i].set_xlabel("n_estimators")
        axes[i].set_ylabel("maximum_depth")

def bestRFRegressor(y_train,y_test,X_train_scaled,X_test_scaled,X_train,X_test):
    #n_estimators=150,max_depth=12
    bestRF = RandomForestRegressor(n_estimators=200,max_depth=15)
    bestRF.fit(X_train_scaled,y_train)
    print("Training Data")
    y_pred = bestRF.predict(X_train_scaled)
    accuracy(y_pred,X_train)
    calculateMetric(y_pred,y_train)
    y_pred = bestRF.predict(X_test_scaled)
    print("Testing Data")
    accuracy(y_pred,X_test)
    calculateMetric(y_pred,y_test)
    return bestRF

In [7]:
data = pd.read_csv("../Data/unscaled.csv")
del data['Unnamed: 0']
scaler = StandardScaler()
data = data[:500000]
y = data['block_min']
X = data
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)
X_train_scaled = X_train.copy()
X_train_scaled = X_train_scaled.drop(['block_min'], axis=1)
X_test_scaled = X_test.copy()
X_test_scaled = X_test_scaled.drop(['block_min'], axis=1)
X_train_scaled[X_train_scaled.columns[:]] = scaler.fit_transform(X_train_scaled[X_train_scaled.columns[:]])
X_test_scaled[X_test_scaled.columns[:]] = scaler.fit_transform(X_test_scaled[X_test_scaled.columns[:]])

In [16]:
simpleRF = simpleRandomForest(y_train,y_test,X_train_scaled,X_test_scaled,X_train,X_test)

Training
Accuracy 0.9765657142857143
MAE= 178802.3933875172
RMSE= 15787422.765401194
R2 Score 0.9999975511952016
Testing
MAE= 937173095.4471967
RMSE= 2449901476.079192
R2 Score 0.9398692128119994
Accuracy 0.5018


## Saving the model

In [9]:
import pickle
fileName = "../weights/randomForest.sav"
pickle.dump(simpleRF, open(fileName, "wb"))

In [10]:
loaded_model = pickle.load(open(fileName, 'rb'))

In [14]:
y_pred = loaded_model.predict(X_test_scaled)
calculateMetric(y_pred, y_test)
accuracy(y_pred, X_test)

MAE= 937173095.4471967
RMSE= 2449901476.079192
R2 Score 0.9398692128119994
Accuracy 0.5018


0.5018