# Polynomial Regression Model

## Import libraries

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

import numpy as np
import pandas as pd

## Read the dataset

In [2]:
# Load the cleaned Sobol-sampling table and discard its first column
# (presumably a row index saved with the CSV -- TODO confirm against the file).
data = (
    pd.read_csv('CLEANED_sobolsampling-2048.csv')
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)
print(f'{data}')

          SigC      SigH      EpsC      EpsH  density
0     0.329511  0.074231  1.036268  0.062599  891.797
1     0.193948  0.136800  0.751676  0.037376  664.747
2     0.230588  0.235899  0.596203  0.100129  682.448
3     0.242166  0.148891  0.406759  0.087550  688.667
4     0.128757  0.190416  0.989670  0.034993  660.728
...        ...       ...       ...       ...      ...
2000  0.128595  0.136526  0.645525  0.075425  651.258
2001  0.242035  0.235637  0.735539  0.137124  708.349
2002  0.230760  0.148096  1.005841  0.111508  714.162
2003  0.066912  0.095994  0.782125  0.108488  651.054
2004  0.329646  0.259110  0.444702  0.043977  707.088

[2005 rows x 5 columns]


## prepare data

In [3]:
# Predictors: every column except the target. Target: the 'density' column.
X = data.drop(columns='density')
Y = data['density']

print(f'{X}')
print(f'{Y}')

# Hold out 5 % of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.05,
    random_state=29,
)

          SigC      SigH      EpsC      EpsH
0     0.329511  0.074231  1.036268  0.062599
1     0.193948  0.136800  0.751676  0.037376
2     0.230588  0.235899  0.596203  0.100129
3     0.242166  0.148891  0.406759  0.087550
4     0.128757  0.190416  0.989670  0.034993
...        ...       ...       ...       ...
2000  0.128595  0.136526  0.645525  0.075425
2001  0.242035  0.235637  0.735539  0.137124
2002  0.230760  0.148096  1.005841  0.111508
2003  0.066912  0.095994  0.782125  0.108488
2004  0.329646  0.259110  0.444702  0.043977

[2005 rows x 4 columns]
0       891.797
1       664.747
2       682.448
3       688.667
4       660.728
         ...   
2000    651.258
2001    708.349
2002    714.162
2003    651.054
2004    707.088
Name: density, Length: 2005, dtype: float64


## create and train the model

In [6]:
# Build the pipeline (degree-8 polynomial feature expansion followed by a
# linear regression) and fit it on the training split in one expression;
# Pipeline.fit returns the fitted pipeline itself.
model = make_pipeline(
    PolynomialFeatures(degree=8),
    LinearRegression(),
).fit(X_train, Y_train)

# Predict the held-out rows.
Y_prediction = model.predict(X_test)

# Score the predictions against the held-out targets:
# root-mean-squared error and coefficient of determination.
rmse = np.sqrt(mean_squared_error(y_true=Y_test, y_pred=Y_prediction))
r2 = r2_score(y_true=Y_test, y_pred=Y_prediction)

print(f'RMSE: {rmse}')
print(f'R2: {r2}')

RMSE: 7.230447578639616
R2: 0.9958662031055541


## Result

- RMSE: 7.230447578639616
- R2: 0.9958662031055541

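  Generally, 0 <= R2 <= 1 (on held-out data R2 can even drop below 0 when the model is worse than always predicting the mean); the closer R2 is to 1, the better Y_prediction matches Y_test. Pretty good?
  How about higher/lower degrees?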

In [7]:
# Print the held-out targets followed by the model's predictions for the
# same rows, so the two can be compared by eye.
print(f'Y_test:\n{Y_test}')
print(f'Y_prediction:\n{Y_prediction}')

Y_test:
1659    659.137
1424    688.522
880     660.425
30      480.165
1373    637.540
         ...   
1273    652.283
1447    717.095
1451    838.270
977     340.499
459     856.005
Name: density, Length: 101, dtype: float64
Y_prediction:
[658.8642895  686.74013955 660.39053593 479.61465791 634.93666007
 771.64918106 552.34976631 409.28779996 661.58162803 664.13368648
 757.73301915 670.12291785 694.99508331 793.94891023 824.24376015
 652.60038798 652.86713005 664.10778164 685.12283908 648.80135188
 727.55544386 668.51014288 650.96996721 482.69122974 649.18518635
 405.79315639 653.78704361 649.00007481 670.21569238 595.99777836
 695.01977088 688.02774452 475.33255921 809.83353074 798.82166454
 829.69057816 716.92491793 665.29375252 645.15325613 688.26841468
 670.64411858 647.67028548 333.08031198 789.67824944 800.16163968
 696.8366267  649.62014311 678.67022494 654.45913793 876.38599345
 647.33230804 675.2319124  670.85809917 649.10740629 653.54614741
 528.0799031  656.58594339 649.56