In [1]:
import json
import numpy as np
import pandas as pd
import ast

In [57]:
from striprtf.striprtf import rtf_to_text
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

Loading the Iris data into a DataFrame

In [3]:
# Read the Iris dataset from a CSV in the working directory and display it.
irisCsv = 'iris.csv'
df = pd.read_csv(irisCsv)
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


Getting the information out of the 'algoparams_from_ui.json.rtf' file as a dictionary

In [46]:
# The UI export is JSON wrapped in RTF markup: read the raw file, strip the
# RTF formatting down to plain text, then parse that text as JSON.
fileName = "algoparams_from_ui.json.rtf"
with open(fileName, mode='r', encoding='utf-8') as f:
    rtfFile = f.read()

# Parsing only needs the text already read, so it happens after the file is closed.
textFile = rtf_to_text(rtfFile)
jsonData = json.loads(textFile)

In [47]:
# List the top-level sections available under 'design_state_data'.
designState = jsonData['design_state_data']
for key in designState:
    print(key)

session_info
target
train
metrics
feature_handling
feature_generation
feature_reduction
hyperparameters
weighting_stratergy
probability_calibration
algorithms


Getting the target and type for the model 

In [48]:
# Inspect the 'target' section of the design state before extracting its fields.
jsonData['design_state_data']['target']

{'prediction_type': 'Regression',
 'target': 'petal_width',
 'type': 'regression',
 'partitioning': True}

In [49]:
# Pull the prediction settings out of the 'target' section of the config.
# The third value is bound to `targetType` rather than `type`, so the built-in
# type() stays callable for the rest of the notebook session.
targetConfig = jsonData['design_state_data']['target']
predictionType = targetConfig['prediction_type']
target = targetConfig['target']
targetType = targetConfig['type']
predictionType, target, targetType

('Regression', 'petal_width', 'regression')

Feature Handling

In [8]:
# Summarise the columns: dtype and non-null count for each one.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [10]:
# Apply the per-column rules from the 'feature_handling' section of the config.
# Numerical columns are imputed; every other column is ordinal-encoded.
featureHandling = jsonData['design_state_data']['feature_handling']
for key in featureHandling:
    featureDetails = featureHandling[key]['feature_details']
    if featureHandling[key]['feature_variable_type'] == 'numerical':
        # Every occurrence of 'impute_value' in the column is swapped for either
        # the column mean ('Average of values') or the custom value 1.
        # NOTE(review): 'impute_value' may instead be intended as the value to
        # fill *missing* entries with - confirm against the config schema.
        if featureDetails['impute_with'] == 'Average of values':
            fillValue = df[key].mean()
        else:
            fillValue = 1
        # Series.replace returns a new Series; without assigning it back to the
        # column the imputation is silently discarded and df stays unchanged.
        df[key] = df[key].replace(featureDetails['impute_value'], fillValue)
    else:
        # Categorical column: map each category to an integer code.
        ordEnc = OrdinalEncoder()
        df[key] = ordEnc.fit_transform(df[[key]]).astype(int)

New DataFrame after doing feature handling

In [11]:
# Display the frame as it stands after the feature-handling cell has run.
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


Prediction

Getting the X and Y value

In [37]:
# Every column except the target is a predictor; the target column is the response.
xCol = df.columns.tolist()
xCol.remove(target)
X, y = df[xCol], df[target]

If the prediction type is Regression, add every regression model, with its corresponding arguments, to the models dictionary

In [64]:
# Build one estimator per algorithm in the config, keyed by a display name.
# Only regression is handled; any other prediction type leaves `models` empty.

# Fixed seed passed to the estimators that use randomness, so that re-running
# the notebook reproduces the same fitted models and the same error values.
RANDOM_STATE = 42

models = {}
if predictionType == 'Regression':
    # Look the algorithm section up once instead of once per model.
    algorithms = jsonData['design_state_data']['algorithms']

    randomForest = algorithms['RandomForestRegressor']
    models['randomForest'] = RandomForestRegressor(n_estimators=randomForest['max_trees'],
                                                   max_depth=randomForest['max_depth'],
                                                   min_samples_leaf=randomForest['min_samples_per_leaf_min_value'],
                                                   random_state=RANDOM_STATE)

    gbt = algorithms['GBTRegressor']
    # 'num_of_BoostingStages' is indexed with [1], i.e. the second entry is used.
    # NOTE(review): presumably a [min, max] pair - confirm against the config schema.
    models['GBT Regressor'] = GradientBoostingRegressor(learning_rate=gbt['max_stepsize'],
                                                        n_estimators=gbt['num_of_BoostingStages'][1],
                                                        max_depth=gbt['max_depth'],
                                                        n_iter_no_change=gbt['min_iter'],
                                                        random_state=RANDOM_STATE)

    # No config values are applied to LinearRegression; scikit-learn defaults are used.
    models['linearRegression'] = LinearRegression()

    RidgeRegression = algorithms['RidgeRegression']
    models['RidgeRegression'] = Ridge(alpha=RidgeRegression['max_regparam'],
                                      max_iter=RidgeRegression['max_iter'])

    LassoRegression = algorithms['LassoRegression']
    models['LassoRegression'] = Lasso(alpha=LassoRegression['max_regparam'],
                                      max_iter=LassoRegression['max_iter'])

    ElasticNetRegression = algorithms['ElasticNetRegression']
    models['ElasticNetRegression'] = ElasticNet(alpha=ElasticNetRegression['max_regparam'],
                                                l1_ratio=ElasticNetRegression['max_elasticnet'],
                                                max_iter=ElasticNetRegression['max_iter'])

    DecisionTree = algorithms['DecisionTreeRegressor']
    # 'min_samples_per_leaf' is indexed with [0], i.e. the first entry is used.
    # NOTE(review): presumably a [min, max] pair - confirm against the config schema.
    models['DecisionTreeRegressor'] = DecisionTreeRegressor(max_depth=DecisionTree['max_depth'],
                                                            min_samples_leaf=DecisionTree['min_samples_per_leaf'][0],
                                                            random_state=RANDOM_STATE)

    # No config values are applied to SGD; only the seed is set.
    models['SGD'] = SGDRegressor(random_state=RANDOM_STATE)

    neural_network = algorithms['neural_network']
    # The first two configured sizes become two hidden layers.
    models['neural_network'] = MLPRegressor(hidden_layer_sizes=(neural_network['hidden_layer_sizes'][0],
                                                                neural_network['hidden_layer_sizes'][1]),
                                            random_state=RANDOM_STATE)

Fitting the data and getting the score for each model

In [65]:
# Fit each model on the full data set and report its mean squared error.
# The predictions are made on the same rows used for fitting, so the value
# printed is the training (in-sample) error.
for modelName, estimator in models.items():
    estimator.fit(X, y)
    y_pred = estimator.predict(X)
    trainingError = mean_squared_error(y, y_pred)
    print('Model: %s, Error: %2.5f' % (modelName, trainingError))

Model: randomForest, Error: 0.01985
Model: GBT Regressor, Error: 0.00658
Model: linearRegression, Error: 0.02784
Model: RidgeRegression, Error: 0.02789
Model: LassoRegression, Error: 0.24925
Model: ElasticNetRegression, Error: 0.18826
Model: DecisionTreeRegressor, Error: 0.02320
Model: SGD, Error: 0.03582
Model: neural_network, Error: 0.02978


Ranking the models by mean squared error on the training data (lowest error first), for the given arguments:
1. GBT Regressor
2. Random Forest
3. Decision Tree Regressor
4. Linear Regression
5. Ridge Regression
6. Neural Network
7. SGD Regressor
8. Elastic Net Regression
9. Lasso Regression