In [1]:
import warnings
warnings.filterwarnings("ignore")

In [36]:
import pandas as pd
import numpy as np
import json
from pyth.plugins.plaintext.writer import PlaintextWriter 
from pyth.plugins.rtf15.reader import Rtf15Reader 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import *
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
import xgboost
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import *

In [3]:
def parse_rtf_json(rtf_file_path):
    """Return the plain-text content of the RTF document at ``rtf_file_path``.

    The file is opened in binary mode, parsed with ``Rtf15Reader`` and then
    rendered through ``PlaintextWriter``; the text held by the writer's
    buffer is what gets returned.
    """
    with open(rtf_file_path, 'rb') as handle:
        document = Rtf15Reader.read(handle)

    plain_buffer = PlaintextWriter.write(document)
    return plain_buffer.getvalue()
 
# Path of the RTF-wrapped JSON configuration (a relative path, so it is
# resolved against the notebook's working directory).
rtf_file_path = "algoparams_from_ui1.json.rtf" 
# Strip the RTF container and keep only the plain-text payload.
parsed_text = parse_rtf_json(rtf_file_path) 

# Decode the extracted text; json.loads raises if the payload is not valid JSON.
json_data = json.loads(parsed_text)

In [4]:
def parse_feature_handling(df,design_json):
    """Apply the per-feature preprocessing described in the design JSON.

    Every entry of ``design_json['feature_handling']`` is visited once:

    * entries with a falsy ``is_selected`` have their column dropped;
    * selected ``"numerical"`` features are coerced with ``pd.to_numeric``
      and selected ``"text"`` features are cast to ``str``;
    * ``missing_values == "Impute"`` fills gaps with the column mean
      (numerical) or the most frequent value (text);
    * ``text_handling == "Tokenize and hash"`` keeps the token after the
      first ``"-"`` and label-encodes it;
    * any ``rescaling`` value other than ``"No rescaling"`` standardises the
      column with ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to preprocess. It is modified IN PLACE (columns are replaced
        and dropped) and the same object is returned.
    design_json : dict
        Mapping that contains a ``'feature_handling'`` dict whose values hold
        ``feature_name``, ``is_selected``, ``feature_variable_type`` and
        ``feature_details``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object after preprocessing.
    """
    feature_json = design_json['feature_handling']
    for value in feature_json.values():
        # Resolve the column name first: the "not selected" branch used to
        # read a name left over from the previous iteration (or none at all).
        col_name = value['feature_name']

        if not value['is_selected']:
            df.drop(columns=[col_name], inplace=True, errors="ignore")
            continue

        feature_dt = value['feature_variable_type']
        details = value.get('feature_details', {})
        impute = details.get("missing_values") == "Impute"

        if feature_dt == "numerical":
            df[col_name] = pd.to_numeric(df[col_name])
            if impute:
                # Assign the result back instead of fillna(inplace=True) on a
                # column selection, which is not guaranteed to update df.
                df[col_name] = df[col_name].fillna(df[col_name].mean())
        elif feature_dt == "text":
            if impute:
                # Impute before the str cast: astype('str') turns NaN into
                # the literal 'nan', after which there is nothing to fill.
                modes = df[col_name].mode()
                if not modes.empty:
                    df[col_name] = df[col_name].fillna(modes.iloc[0])
            df[col_name] = df[col_name].astype('str')

        if details.get("make_derived_feats"):
            # Informational only; the other handling steps still run.
            print("Feature Extraction Needed i.e derive new feature")

        if details.get("text_handling") == "Tokenize and hash":
            tokens = df[col_name].str.split("-", expand=True)
            if tokens.shape[1] > 1:
                # Second token where present, whole value where no "-" exists.
                token = tokens.iloc[:, 1].fillna(tokens.iloc[:, 0])
            else:
                token = tokens.iloc[:, 0]
            df[col_name] = LabelEncoder().fit_transform(token)

        if "rescaling" in details and details["rescaling"] != "No rescaling":
            # Scale after imputation so filled values take part in the fit.
            scaled = StandardScaler().fit_transform(df[[col_name]])
            df[col_name] = scaled.ravel()

    return df

In [5]:
def eval_metric(y_test, y_pred, problem_type,row_count,feature_count):
    """Print evaluation metrics for a set of predictions.

    Parameters
    ----------
    y_test, y_pred : array-like
        True and predicted target values.
    problem_type : str
        ``"Classification"`` prints the confusion matrix and classification
        report; ``"Regression"`` prints R2, adjusted R2 and RMSE. Any other
        value prints nothing.
    row_count, feature_count : int
        Sample count and predictor count used in the adjusted-R2 formula
        ``1 - (1 - R2) * (n - 1) / (n - p - 1)``.

    Returns
    -------
    None
    """
    if problem_type == "Classification":
        print("Confusion Matrix")
        print(confusion_matrix(y_test, y_pred))
        print("Classification Report")
        print(classification_report(y_test, y_pred))

    elif problem_type == "Regression":
        # Do not name the local `r2_score`: that makes the name local to the
        # whole function and the call below fails with UnboundLocalError.
        r2 = r2_score(y_test, y_pred)

        # Adjusted R2 is undefined when there are no residual degrees of freedom.
        residual_dof = row_count - feature_count - 1
        if residual_dof > 0:
            adj_r2 = 1 - ((1 - r2) * (row_count - 1)) / residual_dof
        else:
            adj_r2 = float("nan")

        # Take the root explicitly; the `squared=False` keyword is deprecated
        # and no longer accepted by recent scikit-learn releases.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        print("R2 Score :",r2)
        print("Adjusted R2 Score :",adj_r2)
        print("RMSE :",rmse)

In [33]:
def _inclusive_range(first, second):
    """Return every integer between the two bounds (inclusive), ascending."""
    low, high = sorted((first, second))
    return list(range(low, high + 1))


def _forest_param_grid(param_json):
    """Build the random-forest search grid from the JSON min/max settings."""
    return {
        'n_estimators': _inclusive_range(param_json['min_trees'], param_json['max_trees']),
        'max_depth': _inclusive_range(param_json['min_depth'], param_json['max_depth']),
        'min_samples_leaf': _inclusive_range(param_json['min_samples_per_leaf_min_value'],
                                             param_json['min_samples_per_leaf_max_value']),
    }


def _tree_param_grid(param_json, with_criterion):
    """Build the decision-tree search grid; ``criterion`` is added only on request."""
    leaf_bounds = param_json['min_samples_per_leaf']
    grid = {
        'max_depth': _inclusive_range(param_json['min_depth'], param_json['max_depth']),
        # The two leaf bounds are used regardless of the order they are stored in.
        'min_samples_leaf': _inclusive_range(min(leaf_bounds), max(leaf_bounds)),
    }

    splitters = [name for flag, name in (('use_best', 'best'), ('use_random', 'random'))
                 if param_json.get(flag)]
    if splitters:
        # An empty candidate list is rejected by GridSearchCV, so the key is
        # left out and the estimator default applies.
        grid['splitter'] = splitters

    if with_criterion:
        criteria = [name for flag, name in (('use_gini', 'gini'), ('use_entropy', 'entropy'))
                    if param_json.get(flag)]
        if criteria:
            grid['criterion'] = criteria
    return grid


def _tune_and_report(estimator, param_grid, scoring, banner,
                     x_train, x_test, y_train, y_test,
                     kf, problem_type, row_count, feature_count):
    """Grid-search ``estimator``, predict on the test split and print the metrics."""
    search = GridSearchCV(estimator=estimator, param_grid=param_grid,
                          scoring=scoring, cv=kf)
    search.fit(x_train, y_train)
    # GridSearchCV refits the best configuration on the full training data by
    # default, so the tuned model is used directly instead of fitting again.
    y_pred = search.best_estimator_.predict(x_test)
    print(banner)
    eval_metric(y_test, y_pred, problem_type, row_count, feature_count)


def parse_algorithm(x_train, x_test, y_train, y_test,design_json,kf,problem_type,row_count,feature_count):
    """Tune and evaluate every algorithm selected in ``design_json['algorithms']``.

    Supported keys are ``RandomForestClassifier``, ``RandomForestRegressor``,
    ``DecisionTreeClassifier`` and ``DecisionTreeRegressor``; other selected
    names are listed in the printed summary but skipped. Classifiers are
    tuned on accuracy and regressors on R2. The gini/entropy flags present in
    the DecisionTreeRegressor settings are ignored because they are not
    regression criteria.

    Parameters
    ----------
    x_train, x_test, y_train, y_test
        Train/test split of features and target.
    design_json : dict
        Design state containing the ``'algorithms'`` mapping; each entry has
        an ``is_selected`` flag plus that algorithm's hyper-parameter bounds.
    kf : int or cross-validation splitter
        Passed straight to ``GridSearchCV(cv=...)``.
    problem_type, row_count, feature_count
        Forwarded unchanged to ``eval_metric``.

    Returns
    -------
    None
        Results are printed.
    """
    algorithm_json = design_json['algorithms']
    model_lst = [name for name, settings in algorithm_json.items()
                 if settings['is_selected']]

    print("Algorithm Selected From JSON :",model_lst)
    print()

    shared_args = (x_train, x_test, y_train, y_test,
                   kf, problem_type, row_count, feature_count)

    for model_name in model_lst:
        param_json = algorithm_json[model_name]

        # An if/elif chain keeps estimator names lazily resolved: only the
        # class of a selected algorithm is ever looked up.
        if model_name == "RandomForestClassifier":
            _tune_and_report(
                RandomForestClassifier(), _forest_param_grid(param_json), 'accuracy',
                "*******************Random Forest Classifier Result*******************",
                *shared_args)

        elif model_name == "RandomForestRegressor":
            _tune_and_report(
                RandomForestRegressor(), _forest_param_grid(param_json), 'r2',
                "*********************Random Forest Regressor Result************************",
                *shared_args)

        elif model_name == "DecisionTreeClassifier":
            _tune_and_report(
                DecisionTreeClassifier(), _tree_param_grid(param_json, with_criterion=True),
                'accuracy',
                "******************Decision Tree Classifier Result************************",
                *shared_args)

        elif model_name == "DecisionTreeRegressor":
            _tune_and_report(
                DecisionTreeRegressor(), _tree_param_grid(param_json, with_criterion=False),
                'r2',
                "***********************Decision Tree Regressor Result*********************",
                *shared_args)
        else:
            continue
            

In [34]:
def json_parser(json_data):
    """Run the whole pipeline described by ``json_data``.

    Steps: read the CSV named in ``session_info.dataset``, preprocess it with
    ``parse_feature_handling``, separate the target column, split into
    train/test sets and hand everything to ``parse_algorithm``.

    Parameters
    ----------
    json_data : dict
        Parsed configuration whose ``'design_state_data'`` entry holds the
        ``session_info``, ``feature_handling``, ``target``, ``train`` and
        ``algorithms`` sections.

    Returns
    -------
    None
        Results are printed by the functions this one calls.
    """
    design_json = json_data['design_state_data']

    df = pd.read_csv(design_json['session_info']['dataset'])

    df_preprocess = parse_feature_handling(df,design_json)

    target_json = design_json['target']
    target_col = target_json['target']
    problem_type = target_json['prediction_type']
    print(f"****************This is {problem_type} problem***************")
    # Select features from the preprocessed frame itself rather than with a
    # mask built from another frame's columns.
    x = df_preprocess.drop(columns=[target_col])
    y = df_preprocess[target_col]

    train_json = design_json['train']
    random_seed = train_json['random_seed']
    split_random = train_json['split'] == "Randomly"

    if not train_json['k_fold']:
        # Plain integer: GridSearchCV builds its own 5-fold splitter.
        kf = 5
    else:
        num_folds = 5
        # KFold rejects a random_state when shuffle is False, so the seed is
        # only forwarded for random splits.
        kf = KFold(n_splits=num_folds,
                   shuffle=split_random,
                   random_state=random_seed if split_random else None)

    x_train, x_test, y_train, y_test = train_test_split(x,y,
                                                        random_state=random_seed,
                                                        train_size=train_json['train_ratio'],
                                                        shuffle=split_random)
    row_count = x.shape[0]
    feature_count = x.shape[1]
    parse_algorithm(x_train, x_test, y_train, y_test,design_json,kf,problem_type,row_count,feature_count)

In [35]:
# Run the full pipeline (load CSV, preprocess, split, tune, report) on the
# configuration decoded into `json_data` by the earlier loading cell.
json_parser(json_data)

****************This is Classification problem***************
Algorithm Selected From JSON : ['RandomForestClassifier', 'DecisionTreeClassifier']

*******************Random Forest Classifier Result*******************
Confusion Matrix
[[10  0  0]
 [ 0 13  0]
 [ 0  0  7]]
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00         7

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

******************Decision Tree Classifier Result************************
Confusion Matrix
[[10  0  0]
 [ 0 13  0]
 [ 0  0  7]]
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        13
           2 

In [None]:
# NOTE(review): leftover scratch cell that was never executed. It only builds a
# default Ridge estimator and uses it nowhere else; candidate for deletion.
Ridge()