In [1]:
import numpy as np

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

from sklearn.svm import SVR

from sklearn.tree import DecisionTreeRegressor

from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_squared_error, r2_score

### 1. reading given json file`

In [2]:
algoparams = pd.read_json("algoparams_from_ui.json")
algoparams

Unnamed: 0,session_name,session_description,design_state_data
algorithms,test,test,{'RandomForestClassifier': {'model_name': 'Ran...
feature_generation,test,test,"{'linear_interactions': [['petal_length', 'sep..."
feature_handling,test,test,{'sepal_length': {'feature_name': 'sepal_lengt...
feature_reduction,test,test,"{'feature_reduction_method': 'Tree-based', 'nu..."
hyperparameters,test,test,"{'stratergy': 'Grid Search', 'shuffle_grid': T..."
metrics,test,test,"{'optomize_model_hyperparameters_for': 'AUC', ..."
probability_calibration,test,test,{'probability_calibration_method': 'Sigmoid - ...
session_info,test,test,"{'project_id': '1', 'experiment_id': 'kkkk-11'..."
target,test,test,"{'prediction_type': 'Regression', 'target': 'p..."
train,test,test,"{'policy': 'Split the dataset', 'time_variable..."


In [3]:
algoparams['design_state_data']['target']

{'prediction_type': 'Regression',
 'target': 'petal_width',
 'type': 'regression',
 'partitioning': True}

In [4]:
algoparams.loc['hyperparameters']['design_state_data']

{'stratergy': 'Grid Search',
 'shuffle_grid': True,
 'random_state': 1,
 'max_iterations': 2,
 'max_search_time': 3,
 'parallelism': 5,
 'cross_validation_stratergy': 'Time-based K-fold(with overlap)',
 'num_of_folds': 6,
 'split_ratio': 0,
 'stratified': True}

### 2. reading iris.csv and finding out missing imputation needs to be applied

In [5]:
df = pd.read_csv("iris.csv")
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [6]:
# finding missing values
df.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

##### No null values found. So no need to do any imputation

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


### Handling categorical features

In [8]:
df = pd.get_dummies(df)

In [9]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_Iris-setosa,species_Iris-versicolor,species_Iris-virginica
0,5.1,3.5,1.4,0.2,1,0,0
1,4.9,3.0,1.4,0.2,1,0,0
2,4.7,3.2,1.3,0.2,1,0,0
3,4.6,3.1,1.5,0.2,1,0,0
4,5.0,3.6,1.4,0.2,1,0,0
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0,0,1
146,6.3,2.5,5.0,1.9,0,0,1
147,6.5,3.0,5.2,2.0,0,0,1
148,6.2,3.4,5.4,2.3,0,0,1


### 3. making model objects using sklearn that is required for prediction type

### Dividing data into target variable and features

In [10]:
X = df.drop('petal_width',axis=1)

In [11]:
y= df['petal_width']

### Splitting data into train and test

In [12]:
############################ SPLITTING TWICE ####################### 

# 70% of data is training data, set aside other 30%
X_train, X_other, y_train, y_other = train_test_split(X, y, test_size=0.3, random_state=42)

# Remaining 30% is split into evaluation and test sets
# Each is 15% of the original data size
X_eval, X_test, y_eval, y_test = train_test_split(X_other, y_other, test_size=0.5, random_state=42)

In [13]:
# SCALE DATA

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_eval = scaler.transform(X_eval)
X_test = scaler.transform(X_test)

### Trying different Algorithms to find predictions

In [14]:
models = {'RandomForestRegressor':RandomForestRegressor(),
          'LinearRegression':LinearRegression(),
          'RidgeRegression':Ridge(),
          'LassoRegression':Lasso(),
          'ElasticNetRegression':ElasticNet(),
          'SVR':SVR(),
          'DecisionTreeRegressor':DecisionTreeRegressor(),
}

In [15]:
n = len(df)
k = len(X.columns)

In [29]:
for i in range(len(list(models))):
    print("*"*10,list(models.keys())[i],"*"*10)
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    
    y_pred_train = model.predict(X_train)
    y_pred_eval = model.predict(X_eval)
    
    # Training Evaluation
    mse_train = mean_squared_error(y_train, y_pred_train)
    r2score_train = r2_score(y_train, y_pred_train)
    adj_r2score_train = 1 - ((1-r2score_train)*(n-1)/(n-k-1))
    
    # Other Data Evaluation  
    mse_other = mean_squared_error(y_eval, y_pred_eval)
    r2score_other = r2_score(y_eval, y_pred_eval)
    adj_r2score_other = 1 - ((1-r2score_other)*(n-1)/(n-k-1))
    
    print(f"Training Evaluation")
    print(f"Mean Squarred Error: {mse_train}")
    print(f"R squarred : {r2score_train}")
    print(f"Adjusted R Squarred : {adj_r2score_train}")
    
    print()
    
    print(f"Evaluation for Other Data")
    print(f"Mean Squarred Error: {mse_other}")
    print(f"R squarred : {r2score_other}")
    print(f"Adjusted R Squarred : {adj_r2score_other}")
    print()   


********** RandomForestRegressor **********
Training Evaluation
Mean Squarred Error: 0.005465882626984113
R squarred : 0.9900340093005275
Adjusted R Squarred : 0.9896158558446057

Evaluation for Other Data
Mean Squarred Error: 0.030100519558080772
R squarred : 0.9547613604952456
Adjusted R Squarred : 0.9528632357607804

********** LinearRegression **********
Training Evaluation
Mean Squarred Error: 0.028252163140960157
R squarred : 0.9484875884980095
Adjusted R Squarred : 0.946326228574849

Evaluation for Other Data
Mean Squarred Error: 0.020725352010714817
R squarred : 0.9688514769184388
Adjusted R Squarred : 0.9675445458800516

********** RidgeRegression **********
Training Evaluation
Mean Squarred Error: 0.028474082368576747
R squarred : 0.9480829612658874
Adjusted R Squarred : 0.9459046239763442

Evaluation for Other Data
Mean Squarred Error: 0.018931769319775124
R squarred : 0.9715470862291294
Adjusted R Squarred : 0.9703532576793027

********** LassoRegression **********
Training

### 4. Hyper Parameter Tuning for Random Forest Algorithm

In [30]:
prediction_data = algoparams['design_state_data']['algorithms']

In [31]:
prediction_data['RandomForestRegressor']

{'model_name': 'Random Forest Regressor',
 'is_selected': True,
 'min_trees': 10,
 'max_trees': 20,
 'feature_sampling_statergy': 'Default',
 'min_depth': 20,
 'max_depth': 25,
 'min_samples_per_leaf_min_value': 5,
 'min_samples_per_leaf_max_value': 10,
 'parallelism': 0}

In [32]:
rfr = RandomForestRegressor()

In [33]:
param_grid = {"n_estimators":list(range(10,21)),
              "max_depth" : list(range(20,26)),
              "min_samples_leaf": list(range(5,11))
    }

In [34]:
grid_rfr = GridSearchCV(rfr, param_grid,cv=6)

In [35]:
grid_rfr.fit(X_train, y_train)

In [36]:
grid_rfr.best_params_

{'max_depth': 22, 'min_samples_leaf': 6, 'n_estimators': 11}

In [37]:
grid_rfr.best_estimator_

### 5. Final Model Metrics

##### Final prediction using best model for Test Data (Not Used for evaluation before)

In [38]:
y_pred = grid_rfr.predict(X_test)

In [39]:
mean_squared_error(y_test,y_pred)

0.027939665574402823

In [40]:
r2score_test = r2_score(y_test,y_pred)
adj_r2score_test = 1 - ((1-r2score_test)*(n-1)/(n-k-1))

In [41]:
print(f"R squarred : {r2score_test}")
print(f"Adjusted R Squarred : {adj_r2score_test}")

R squarred : 0.9464761241078471
Adjusted R Squarred : 0.9442303670774072
