# Auto ML classifier<br>
*Automated machine learning is the process of automating the tasks of applying machine learning to real-world problems.<br>
AutoML covers the complete pipeline from the raw dataset to the deployable machine learning model.*<br>
### Import the libraries

In [None]:
from fluidai_net.data.data_class import Data
from fluidai_net.data.data_processor import DataProcessor
from fluidai_net.project_manager.modelreg_helper import ModelRegHelper
from fluidai_net.project_manager.yaml_manager import YamlManager
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import os
import joblib

### Define global variables

In [None]:
project_name = r'TEST-SANJIT'
project_version = '0'
 # Folder layout for this project version
input_path = r'/home/coder/PycharmProjects/FRONTIER-MAIN/TEST-SANJIT/TEST-SANJIT_V0/Data Inputs'
output_path = r'/home/coder/PycharmProjects/FRONTIER-MAIN/TEST-SANJIT/TEST-SANJIT_V0/Data Outputs'
model_path = r'/home/coder/PycharmProjects/FRONTIER-MAIN/TEST-SANJIT/TEST-SANJIT_V0/Model Folder'
vec_meta_path = r'/home/coder/PycharmProjects/FRONTIER-MAIN/TEST-SANJIT/TEST-SANJIT_V0/Vec Meta Data'
 # Modelling setup: the model types to try and the dataset file name
model_types = ['xg_boost_classifier', 'logistic_regression']
file_name = 'test.csv'
 # Column configuration is written as single-quoted names; peel the quotes off
training_columns = [
    quoted_name.strip("'")
    for quoted_name in "'sepal_length','sepal_width','petal_length','petal_width'".split(',')
]
 # An empty primary key means there is no primary-key column to set aside
primary_key_col = "".strip("'")
target_column = "'species'".strip("'")
 # Global info variables
model_perf = 'Model Performance'
cf_info = 'Confusion Matrix'
gini_info = 'GINI Score'

### Preprocessing the data

In [None]:
 # Build the data container and the processor that wraps it
data_obj = Data('test_obj', debug=False)
processor_obj = DataProcessor(data_obj)
 # Load the file: try the input folder first, fall back to the output folder
try:
    data_obj.read_data_frame_from_path(
        file_path=os.path.join(input_path, file_name),
    )
except Exception as read_error:
    # report why the first location failed before retrying
    print(read_error)
    data_obj.read_data_frame_from_path(
        file_path=os.path.join(output_path, file_name),
    )

In [None]:
 # Make sure the target column is part of the selected columns
if target_column not in training_columns:
    training_columns.append(target_column)
 # Keep the primary-key values aside, then drop that column from the frame
if primary_key_col != '':
    primary_key_data = data_obj.data_frame[primary_key_col]
    # only the primary key is dropped here; the target column stays in the frame
    processor_obj.cleaner.drop_columns(column_values=[primary_key_col])
if primary_key_col in training_columns:
    training_columns.remove(primary_key_col)
 # Snapshot the selected columns (features + target) before any encoding
original = data_obj.data_frame[training_columns]
processor_obj.cleaner.select_columns(column_values=training_columns)
target_column_data = data_obj.data_frame[target_column]
 # From here on training_columns holds feature names only
if target_column in training_columns:
    training_columns.remove(target_column)
 # Identification of columns
 # Split the features by dtype; both results are DataFrames, so iterating
 # over them below yields column names
categorical_columns = data_obj.data_frame[training_columns].select_dtypes('object')
continuous_columns = data_obj.data_frame[training_columns].select_dtypes('number')
columns_used_to_train = []
 # Processing of the data
 # Impute nan values
processor_obj.stats_calculator.impute_value(columns=continuous_columns)
 # keep only top 10 values for categorical columns with more than 10 distinct values
for categorical_col in categorical_columns:
    if data_obj.data_frame[categorical_col].nunique() > 10:
        processor_obj.cleaner.n_largest_categorizer(column_name=categorical_col, n_value=10)
 # Encoding in the data
 # standardize the continuous columns and one hot encode categorical cols
for col in continuous_columns:
    try:
        processor_obj.encoder.standardize(column_name=col, default_vec_value='mean')
    except ZeroDivisionError:
        # a column that cannot be standardized is dropped and not trained on;
        # pass a list, as the other cleaner calls in this cell do
        processor_obj.cleaner.drop_columns(column_values=[col])
    else:
        columns_used_to_train.append(col)
for col in categorical_columns:
    dummies = pd.get_dummies(data_obj.data_frame[col])
    # str() keeps the '<column>|<label>' naming working for non-string labels
    dummy_cols = [col + '|' + str(i) for i in dummies.columns]
    data_obj.data_frame[dummy_cols] = dummies
    data_obj.data_frame.drop(columns=[col], inplace=True)
    columns_used_to_train.extend(dummy_cols)

In [None]:
 # Snapshot the processed frame: a full copy, the target series, and the
 # feature matrix restricted to the columns selected for training
backup_data = data_obj.data_frame.copy()
complete_target_data = data_obj.data_frame[target_column]
complete_data = (
    data_obj.data_frame
    .drop(columns=[target_column])
    [columns_used_to_train]
)

Split the data into train and test sets

In [None]:
train_test_dict = processor_obj.learner.split_train_test_data(target_column, 0.2, upsample=False)  # result is read via its 'x_train', 'x_test', 'y_train' and 'y_test' keys
 # NOTE(review): 0.2 is presumably the held-out test fraction and upsample=False
 # presumably turns off class rebalancing - confirm against split_train_test_data

Load the train and test splits from the train/test dictionary

In [None]:
x_train, x_test, y_train, y_test = (
    train_test_dict[split_key]
    for split_key in ('x_train', 'x_test', 'y_train', 'y_test')
)
 # Re-attach the target to a copy of each feature set so the originals stay untouched
model_train_data, model_test_data = x_train.copy(), x_test.copy()
model_train_data[target_column] = y_train
model_test_data[target_column] = y_test

### Train the models and measure performance

In [None]:
for model_type in model_types:
    try:
        print(
            '======================================================================================')
        print('==================== ' + model_type + '====================')

        # Train the auto ml model: the learner method is looked up by the model type's name
        trained_model = getattr(processor_obj.learner, model_type)(x_train=x_train,
                                                                   y_train=y_train)
        # Name layout: auto_ml_<model type>_<file name up to the first '.'>_<last '_' part of the trained model's name>
        model_name = "{0}_{1}_{2}_{3}".format('auto_ml', model_type, file_name.split('.')[0], trained_model["model_name"].split('_')[-1])
        model_saver_object = ModelRegHelper(trained_model=trained_model["model"],
                                            model_type="classification", train_data=model_train_data,
                                            columns_used_to_train=columns_used_to_train, test_data=model_test_data,
                                            target_column=target_column, model_name=model_name, auto=False,
                                            input_path=input_path, output_path=output_path, model_path=model_path)
        model_saver_object.save_classification_or_regression_model()
    except Exception as err:
        # Best effort: one failing model type must not stop the remaining ones.
        # The import cell calls warnings.filterwarnings("ignore"), which would
        # silently discard these messages, so re-enable display just for them.
        with warnings.catch_warnings():
            warnings.simplefilter('always')
            warnings.warn('Unable to build Model ' + model_type)
            warnings.warn(str(err))