In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt

from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import tree
from sklearn import linear_model

In [2]:
# Read the dataset from its CSV file, then print its dimensions followed by
# the per-column summary statistics.
dataset_path = '../Dataset/Dataset.csv'
df = pd.read_csv(dataset_path)
for overview in (df.shape, df.describe()):
    print(overview)

(39601, 3)
                WN1           WP1           VTH
count  39601.000000  39601.000000  39601.000000
mean      50.500000     50.500000      0.672020
std       28.723176     28.723176      0.024614
min        1.000000      1.000000      0.586869
25%       25.500000     25.500000      0.659298
50%       50.500000     50.500000      0.676322
75%       75.500000     75.500000      0.686173
max      100.000000    100.000000      0.796247


In [3]:
# Derive the two engineered columns from the raw ones:
#   '1/WN1'   - element-wise reciprocal of WN1
#   'log WP1' - element-wise natural logarithm of WP1
wn1_reciprocal = 1 / df['WN1']
wp1_natural_log = np.log(df['WP1'])

df['1/WN1'] = wn1_reciprocal
df['log WP1'] = wp1_natural_log

# Print the stored column (not the temporary) so the Series is shown under
# its new column name.
print(df['log WP1'])

0        0.000000
1        0.405465
2        0.693147
3        0.916291
4        1.098612
           ...   
39596    4.584967
39597    4.590057
39598    4.595120
39599    4.600158
39600    4.605170
Name: log WP1, Length: 39601, dtype: float64


In [4]:
# Column to predict, and the raw width columns that are kept out of the
# feature matrix.
target_column = ['VTH']
exclude_column = ['WN1', 'WP1']

# Keep the DataFrame's own column order. Building this list from a set
# difference made its order depend on string hashing, which differs between
# interpreter sessions unless PYTHONHASHSEED is fixed, so the column layout of
# X (and every per-feature printout that follows it) could change from one
# kernel session to the next despite the fixed random_state values.
non_predictors = set(target_column) | set(exclude_column)
predictors = [column for column in df.columns if column not in non_predictors]

X = df[predictors].values
y = df[target_column].values  # 2-D with a single column (selected with a list)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=40)
print(X_train.shape)
print(X_test.shape)

(31680, 2)
(7921, 2)


In [5]:
# Train a 500-tree random forest on the training split, then report RMSE and
# R2 for the training split followed by the test split.
print('>> Random Forest Model')
rf_settings = dict(n_estimators=500, oob_score=True, random_state=100)
model_rf = RandomForestRegressor(**rf_settings)
model_rf.fit(X_train, y_train.ravel())  # flatten the single-column target to 1-D

pred_train_rf = model_rf.predict(X_train)
pred_test_rf = model_rf.predict(X_test)

for split_label, y_actual, y_predicted in (
    ('Train', y_train, pred_train_rf),
    ('Test', y_test, pred_test_rf),
):
    print('RMSE ' + split_label + ' = ', np.sqrt(mean_squared_error(y_actual, y_predicted)))
    print('R2 ' + split_label + ' = ', r2_score(y_actual, y_predicted))

>> Random Forest Model
RMSE Train =  0.000117718307021345
R2 Train =  0.9999771478861751
RMSE Test =  0.0003444826521652956
R2 Test =  0.9998033595146382


In [6]:
# Print the feature names, then the fitted forest's importance for each one;
# both follow the column order used to build X.
for shown in (predictors, model_rf.feature_importances_):
    print(shown)

['1/WN1', 'log WP1']
[0.3176682 0.6823318]


In [7]:
# Fit an ordinary least-squares model on the same training split and report
# RMSE and R2 for the training split followed by the test split.
print('>> Linear Model')
model_linr = linear_model.LinearRegression()
model_linr.fit(X_train, y_train.ravel())  # flatten the single-column target to 1-D

pred_train_lr = model_linr.predict(X_train)
rmse_train_lr = np.sqrt(mean_squared_error(y_train, pred_train_lr))
r2_train_lr = r2_score(y_train, pred_train_lr)
print('RMSE Train = ', rmse_train_lr)
print('R2 Train = ', r2_train_lr)

pred_test_lr = model_linr.predict(X_test)
rmse_test_lr = np.sqrt(mean_squared_error(y_test, pred_test_lr))
r2_test_lr = r2_score(y_test, pred_test_lr)
print('RMSE Test = ', rmse_test_lr)
print('R2 Test = ', r2_test_lr)

>> Linear Model
RMSE Train =  0.0045370046527866735
R2 Train =  0.9660549218679363
RMSE Test =  0.004346243902786599
R2 Test =  0.9686984088798837


In [8]:
# Write the fitted linear model out as a single-line equation:
# one "coefficient * feature  + " term per predictor, then the intercept.
print('Equation: VTH = ', end='')
for coefficient, feature_name in zip(model_linr.coef_, predictors):
    print(coefficient, '*', feature_name, ' + ', end='')
print(model_linr.intercept_)

Equation: VTH = 0.12769458850603654 * 1/WN1  + 0.02246644489981312 * log WP1  + 0.5838724488838833
