# Importing required libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_selection import RFECV
import math
from sklearn.ensemble import RandomForestRegressor

# Importing data

In [None]:
# Load the dataset from the CSV file in the working directory.
# The frame is bound to `Data` because every later cell (EDA and the
# train/validation/test split) refers to it by that capitalised name.
Data = pd.read_csv('Data.csv')
data = Data  # keep the original lowercase name available as an alias
# Preview the first rows
Data.head()

# Exploratory data analysis (EDA)

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts
Data.info()

In [None]:
# Descriptive statistics, transposed so that each column of Data becomes a row
Data.describe().T

In [None]:
# Number of missing values in each column
Data.isnull().sum()

# Data splitting

* Training Data = 80% of dataset
* Validation Data = 10% of dataset
* Test Data = 10% of dataset

In [None]:
# Step 1: keep 80% of the rows for training; the remaining 20% go to rem_df.
# No random_state is passed, so the split differs from run to run.
train_df, rem_df = train_test_split(Data, train_size=0.8)

# Step 2: halve the remainder, giving validation and test sets of ~10% each
val_df, test_df = train_test_split(rem_df, test_size=0.5)

In [None]:
# Separate the regression target column from the predictor columns
# for each of the three subsets (train / validation / test).
target_col = 'DG (298K)'

X_train, y_train = train_df.drop(columns=[target_col]), train_df[target_col]
X_val, y_val = val_df.drop(columns=[target_col]), val_df[target_col]
X_test, y_test = test_df.drop(columns=[target_col]), test_df[target_col]

# Random Forest

In [None]:
# Base random-forest regressor with default settings; it is passed as the
# estimator to the feature-selection and hyperparameter-search cells below.
model = RandomForestRegressor()

## Feature selection

In [None]:
# Recursive feature elimination with 5-fold cross-validation: 5 features are
# removed per iteration and subsets are scored by negative mean squared error.
rfecv = RFECV(
    estimator=model,
    step=5,
    cv=5,
    scoring='neg_mean_squared_error',
)
rfecv.fit(X_train, y_train)

# Column names retained by the selector (support_ is a boolean mask over columns)
best_features = list(X_train.columns[rfecv.support_])

print(f'The optimal number of features:{rfecv.n_features_}')
print(f'Selected features: {best_features}')

In [None]:
# Restrict every subset to the columns chosen by the feature selection step
X_train_new, X_val_new, X_test_new = (
    frame[best_features] for frame in (X_train, X_val, X_test)
)

## Hyperparameter tuning

In [None]:
# Exhaustive grid over the number of trees (60, 70, ..., 500) and the
# maximum tree depth.
params = {
    'n_estimators': list(np.arange(60, 510, 10)),
    'max_depth': [5, 10, 15, 20, 25, 30, 35, 40],
}

HpTRF = GridSearchCV(model, params)

# Tune on the selected feature columns so the hyperparameters are chosen for
# the same inputs the final model is evaluated on (the *_new frames).
HpTRF.fit(X_train_new, y_train)

## Model training & prediction

In [None]:
# Rebuild the forest with the best hyperparameters found by the grid search
model = RandomForestRegressor(
    n_estimators=HpTRF.best_params_['n_estimators'],
    max_depth=HpTRF.best_params_['max_depth'],
)

# Fit on the selected feature columns: the predictions below are made on the
# *_new frames, so the model has to be trained on exactly the same columns.
model.fit(X_train_new, y_train)

# Predictions for each subset
y_pred_train_RF = model.predict(X_train_new)
y_pred_val_RF = model.predict(X_val_new)
y_pred_test_RF = model.predict(X_test_new)

## Results

In [None]:
# Mean squared error of the random forest on each subset
mse_train_RF = mean_squared_error(y_train, y_pred_train_RF)
mse_val_RF = mean_squared_error(y_val, y_pred_val_RF)
mse_test_RF = mean_squared_error(y_test, y_pred_test_RF)

# Report the root of each MSE; the test value is printed as well so that
# RMSE is reported for the same three subsets as R2 and MAE.
print("Root Mean Squared error on Train data is:",math.sqrt(mse_train_RF))
print("Root Mean Squared error on Val data is:",math.sqrt(mse_val_RF))
print("Root Mean Squared error on Test data is:",math.sqrt(mse_test_RF))

In [None]:
# Coefficient of determination of the random forest on train / val / test
r2_train_RF, r2_val_RF, r2_test_RF = (
    r2_score(actual, predicted)
    for actual, predicted in (
        (y_train, y_pred_train_RF),
        (y_val, y_pred_val_RF),
        (y_test, y_pred_test_RF),
    )
)

print('r2 score for train is', r2_train_RF)
print('r2 score for val is', r2_val_RF)
print('r2 score for test is', r2_test_RF)

In [None]:
# Mean absolute error of the random forest on each subset
mae_train_RF = mean_absolute_error(y_train, y_pred_train_RF)
print('Mean Absolute Error Train:', mae_train_RF)

mae_val_RF = mean_absolute_error(y_val, y_pred_val_RF)
print('Mean Absolute Error Val:', mae_val_RF)

mae_test_RF = mean_absolute_error(y_test, y_pred_test_RF)
print('Mean Absolute Error Test:', mae_test_RF)

In [None]:
# Seaborn residual plot with the true test targets as x and the random-forest
# test predictions as y
sns.residplot(x = y_test, y = y_pred_test_RF)

In [None]:
# Scatter of random-forest test predictions against the true test targets,
# with seaborn's fitted regression line
sns.regplot(x = y_test, y = y_pred_test_RF)

# MLP

In [None]:
from tensorflow.keras import activations, losses, optimizers
from keras.layers import InputLayer, Dense
from tensorflow.keras.models import Sequential
import tensorflow as tf

## Model construction

In [None]:
# Fully connected regression network trained on the selected features.
# This rebinds `model`, which previously held the random forest.
# NOTE(review): the hidden Dense layers are created without an `activation`
# argument, so no non-linearity is applied between them - confirm this is
# intended.
model = Sequential([
    # Each sample is a flat vector with one entry per selected feature; the
    # batch axis is implicit, so only the feature dimension is declared.
    InputLayer(input_shape = (X_train_new.shape[1],)),
    Dense(units = 20),
    Dense(units = 30),
    Dense(units = 15),
    Dense(units = 1, activation = 'linear')
])

# Reach the optimizer through `tf.keras`: the visible imports bind `tf` but
# never the bare name `keras`, so `keras.optimizers...` cannot be resolved.
model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 0.1),
              loss = tf.keras.losses.MeanSquaredError())

## Model training & prediction

In [None]:
# Train the network for 500 epochs on the selected training features
model.fit(X_train_new, y_train, epochs=500)

# Predict on train, validation and test inputs (in that order)
y_pred_train_MLP, y_pred_val_MLP, y_pred_MLP = (
    model.predict(features)
    for features in (X_train_new, X_val_new, X_test_new)
)

## Results

In [None]:
# --- Train subset -----------------------------------------------------------
mse_train_MLP = mean_squared_error(y_train, y_pred_train_MLP)
r2_train_MLP = r2_score(y_train, y_pred_train_MLP)

# --- Validation subset ------------------------------------------------------
mse_val_MLP = mean_squared_error(y_val, y_pred_val_MLP)
r2_val_MLP = r2_score(y_val, y_pred_val_MLP)

# --- Test subset ------------------------------------------------------------
mse_test = mean_squared_error(y_test, y_pred_MLP)
r2_test_MLP = r2_score(y_test, y_pred_MLP)

# Root mean squared errors derived from the MSE values above
rmse_train_MLP = math.sqrt(mse_train_MLP)
rmse_val_MLP = math.sqrt(mse_val_MLP)
rmse_test_MLP = math.sqrt(mse_test)

# Report: train/val RMSE, train/val R2, then test RMSE and test R2
print("Root Mean Squared error on Train data is:", rmse_train_MLP)
print("Root Mean Squared error on Val data is:", rmse_val_MLP)
print('r2 score for train is', r2_train_MLP)
print('r2 score for val is', r2_val_MLP)
print("Root Mean Squared error on Test data is:", rmse_test_MLP)
print('r2 score for test is', r2_test_MLP)

In [None]:
# Seaborn residual plot with the true test targets as x and the MLP test
# predictions as y
sns.residplot(x = y_test, y = y_pred_MLP)

In [None]:
# Scatter of MLP test predictions against the true test targets, with
# seaborn's fitted regression line
sns.regplot(x = y_test, y = y_pred_MLP)