In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt

from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import tree
from sklearn import linear_model

In [2]:
# Read the raw dataset from disk, then show its dimensions followed by
# per-column summary statistics as a quick sanity check.
dataset_path = '../Dataset/Dataset.csv'
df = pd.read_csv(dataset_path)
for summary in (df.shape, df.describe()):
    print(summary)

(39601, 3)
                WN1           WP1           VTH
count  39601.000000  39601.000000  39601.000000
mean      50.500000     50.500000      0.672020
std       28.723176     28.723176      0.024614
min        1.000000      1.000000      0.586869
25%       25.500000     25.500000      0.659298
50%       50.500000     50.500000      0.676322
75%       75.500000     75.500000      0.686173
max      100.000000    100.000000      0.796247


In [3]:
# Derive the two transformed inputs used as model features:
#   '1/WN1'   -> reciprocal of the WN1 column
#   'log WP1' -> natural logarithm of the WP1 column
# The columns are written back onto `df` in place, in this order.
engineered = {
    '1/WN1': df['WN1'].rdiv(1),
    'log WP1': np.log(df['WP1']),
}
for feature_name, feature_values in engineered.items():
    df[feature_name] = feature_values
# Disabled alternative feature kept for reference:
# df['WP1_2'] = df['WP1'] * df['WP1']
print(df['log WP1'])

0        0.000000
1        0.405465
2        0.693147
3        0.916291
4        1.098612
           ...   
39596    4.584967
39597    4.590057
39598    4.595120
39599    4.600158
39600    4.605170
Name: log WP1, Length: 39601, dtype: float64


In [4]:
# Choose the regression target and the raw columns to leave out of the
# feature matrix; every remaining column of `df` becomes a predictor.
target_column = ['VTH']
exclude_column = ['WN1', 'WP1']

# Build the predictor list deterministically. The previous
# list(set(...) - set(...)) form produced an order that depends on Python's
# per-process string hash randomisation, so the column order of X (and any
# positional indexing into it) could change between kernel restarts.
# Reverse-lexicographic sorting is used because it reproduces the order
# recorded in this notebook's outputs (['log WP1', '1/WN1']), which a later
# cell indexes by position.
non_predictors = set(target_column) | set(exclude_column)
predictors = sorted(
    (col for col in df.columns if col not in non_predictors),
    reverse=True,
)

# Feature matrix (one column per predictor, in `predictors` order) and a
# 2-D target array with a single column.
X = df[predictors].values
y = df[target_column].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=40)
print(X_train.shape)
print(X_test.shape)

(31680, 2)
(7921, 2)


In [5]:
# Train a 500-tree random forest on the training split and report RMSE and
# R2 on both the training and the test split.
print('>> Random Forest Model')
model_rf = RandomForestRegressor(n_estimators=500, oob_score=True, random_state=100)
# The target is flattened to 1-D because y_train is a single-column 2-D array.
model_rf.fit(X_train, y_train.ravel())

pred_train_rf = model_rf.predict(X_train)
pred_test_rf = model_rf.predict(X_test)

# (RMSE label, R2 label, true values, predictions) for each split, in the
# order the metrics are printed.
rf_reports = (
    ('RMSE Train = ', 'R2 Train = ', y_train, pred_train_rf),
    ('RMSE Test = ', 'R2 Test = ', y_test, pred_test_rf),
)
for rmse_label, r2_label, actual, predicted in rf_reports:
    print(rmse_label, np.sqrt(mean_squared_error(actual, predicted)))
    print(r2_label, r2_score(actual, predicted))

>> Random Forest Model
RMSE Train =  0.00011771857421145462
R2 Train =  0.9999771477824382
RMSE Test =  0.0003444964287651787
R2 Test =  0.9998033437861744


In [6]:
# Show the predictor names, then the random forest's importance scores;
# the scores are listed in the same order as the columns of X.
for shown in (predictors, model_rf.feature_importances_):
    print(shown)

['log WP1', '1/WN1']
[0.68233181 0.31766819]


In [7]:
# Fit an ordinary least-squares model on the same training split and report
# RMSE and R2 on both the training and the test split.
print('>> Linear Model')
model_linr = linear_model.LinearRegression()
# The target is flattened to 1-D because y_train is a single-column 2-D array.
model_linr.fit(X_train, y_train.ravel())

pred_train_lr = model_linr.predict(X_train)
pred_test_lr = model_linr.predict(X_test)

# (RMSE label, R2 label, true values, predictions) for each split, in the
# order the metrics are printed.
lr_reports = (
    ('RMSE Train = ', 'R2 Train = ', y_train, pred_train_lr),
    ('RMSE Test = ', 'R2 Test = ', y_test, pred_test_lr),
)
for rmse_label, r2_label, actual, predicted in lr_reports:
    print(rmse_label, np.sqrt(mean_squared_error(actual, predicted)))
    print(r2_label, r2_score(actual, predicted))

>> Linear Model
RMSE Train =  0.004537004652786673
R2 Train =  0.9660549218679363
RMSE Test =  0.004346243902786602
R2 Test =  0.9686984088798837


In [8]:
# Print the fitted linear model as a single-line equation of the form
#   VTH = c0 * <predictor0> + c1 * <predictor1> + ... + intercept
# Each coefficient is paired with the predictor name at the same position.
print('Equation: VTH = ', end='')
for weight, feature_name in zip(model_linr.coef_, predictors):
    print(weight, '*', feature_name, ' + ', end='')
print(model_linr.intercept_)

# Disabled alternative that prints the raw pieces separately:
# print('Variables = ',predictors)
# print('Coeff =', model_linr.coef_)
# print('Intercept =', model_linr.intercept_)

Equation: VTH = 0.022466444899813302 * log WP1  + 0.12769458850603652 * 1/WN1  + 0.5838724488838827


In [9]:
# Evaluate the fitted linear equation by hand on the test split and confirm
# it reproduces the model's own R2 score.
#
# The coefficients and intercept are read from the fitted model instead of
# being pasted in as literals, and the columns are combined with a matrix
# product instead of hard-coded column indices. coef_ is aligned with the
# columns of the matrix the model was fitted on, so this stays correct
# whatever order `predictors` ended up in and never goes stale after a refit.
y_pred = np.asarray(X_test) @ model_linr.coef_ + model_linr.intercept_

print('R2 Test with eqn = ', r2_score(y_test, y_pred))

R2 Test with eqn =  0.9686984088798837


In [16]:
# Target VTH value(s) for which the fitted equation will be inverted.
VTHS = np.array([0.5])

# Candidate WP1 values: the integers 1..99.
WP1 = list(range(1, 100))

# Natural log of each candidate, one entry per WP1 value.
# WP1 is a plain Python list, so the former `WP1*10` did not scale the values:
# it repeated the list ten times, giving 990 entries (the same 99 logs
# duplicated). Converting to an ndarray and taking the log directly matches
# the 'log WP1' feature definition and keeps log_WP1 aligned with WP1.
log_WP1 = np.log(np.asarray(WP1))
print('WP1 = ', WP1)

WP1 =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


In [17]:
# Solve the printed linear equation
#   VTH = coef_log_wp1 * log(WP1) + coef_inv_wn1 * (1/WN1) + intercept
# for WN1 at every candidate log(WP1) value, once per target VTH.
# The constants below are the coefficients of the printed equation.
coef_inv_wn1 = 0.12769458850603652
coef_log_wp1 = 0.022466444899813302
intercept = 0.5838724488838827

for VTH in VTHS:
    print('VTH = ', VTH)
    # When this denominator is negative the resulting WN1 is negative too,
    # as in the recorded output for VTH = 0.5.
    denominator = VTH - (coef_log_wp1 * log_WP1) - intercept
    WN1 = coef_inv_wn1 / denominator
    print('WN1 = ', WN1)

VTH =  0.5
WN1 =  [-1.52248551 -1.28407246 -1.17631928 -1.11021825 -1.06384854 -1.02874215
 -1.00081869 -0.97782742 -0.95840708 -0.94167724 -0.92703859 -0.9140664
 -0.90244965 -0.89195441 -0.88240065 -0.87364713 -0.86558119 -0.85811165
 -0.85116377 -0.84467555 -0.83859509 -0.83287852 -0.82748845 -0.82239278
 -0.81756378 -0.81297733 -0.8086123  -0.80445017 -0.80047452 -0.7966708
 -0.79302606 -0.78952867 -0.78616823 -0.78293534 -0.77982152 -0.77681906
 -0.77392096 -0.77112085 -0.76841289 -0.76579173 -0.76325247 -0.76079058
 -0.7584019  -0.75608258 -0.75382906 -0.75163802 -0.7495064  -0.74743134
 -0.74541018 -0.74344042 -0.74151974 -0.73964598 -0.73781709 -0.73603115
 -0.73428638 -0.73258107 -0.73091365 -0.7292826  -0.72768651 -0.72612405
 -0.72459394 -0.72309499 -0.72162607 -0.7201861  -0.71877405 -0.71738897
 -0.71602992 -0.71469604 -0.71338648 -0.71210045 -0.71083719 -0.70959599
 -0.70837614 -0.70717699 -0.70599791 -0.70483829 -0.70369756 -0.70257517
 -0.70147058 -0.70038329 -0.6993128