<a href="https://colab.research.google.com/github/riyabiswas196/ml_pipline/blob/main/pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, ElasticNet, SGDClassifier
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score, classification_report


In [None]:
# Read the Iris dataset from the working directory and preview its first rows
# (DataFrame.head() shows five rows by default).
df = pd.read_csv("iris.csv")
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [None]:
# Load the pipeline configuration exported from the UI.
# The context manager guarantees the file handle is closed once parsing is
# done, even if json.load raises.
with open("algoparams_from_ui.json", "r") as f:
    json_data = json.load(f)

# Every setting read by this notebook lives under "design_state_data".
data = json_data["design_state_data"]

# Column to predict, and the task type. Later cells compare prediction_type
# against "Regression" and "Classification".
target = data["target"]["target"]
prediction_type = data["target"]["prediction_type"]


In [None]:
# Per-feature handling rules from the config: column name -> settings dict.
features = data["feature_handling"]
f_names = []

for name, details in features.items():
    # Every configured column is recorded, including those skipped below.
    f_names.append(name)

    # The species column is never imputed.
    if details["feature_name"] == "species":
        continue

    feature_details = details["feature_details"]
    if feature_details["missing_values"] != "Impute":
        continue

    strategy = feature_details["impute_with"]
    if strategy == "Average of values":
        fill_value = df[name].mean()
    elif strategy == "Median of values":
        fill_value = df[name].median()
    elif strategy == "Mode of values":
        # mode() returns a Series (there may be ties); take the first entry.
        fill_value = df[name].mode()[0]
    else:
        # Unrecognised strategy: leave the column untouched.
        continue

    # Assign the filled column back instead of `df[name].fillna(..., inplace=True)`:
    # the in-place call operates on an intermediate Series, which pandas 2.x
    # deprecates and which does not update `df` under Copy-on-Write.
    df[name] = df[name].fillna(fill_value)

print(f_names)


['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']


In [None]:
# Display summary statistics for the DataFrame's columns.
df.describe()

In [None]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

In [None]:
feature_reduction = data["feature_reduction"]
reduction_method = feature_reduction["feature_reduction_method"]


def _as_int_list(value):
    """Return ``value`` as a list of ints, accepting either a scalar or a list/tuple."""
    if isinstance(value, (list, tuple)):
        return [int(v) for v in value]
    return [int(value)]


# isinstance() is required here: comparing type(x) with the *string* "list"
# is always unequal, so a list-valued setting would have been passed to int().
max_depth_list = _as_int_list(feature_reduction["depth_of_trees"])
max_feat = _as_int_list(feature_reduction["num_of_features_to_keep"])

# Candidate predictors: configured columns that are numeric and are not the
# target. Keeping the target in would make it correlate perfectly with itself
# and leak into the tree / PCA inputs; string columns cannot be correlated or
# projected at all.
candidate_cols = (
    df[[col for col in f_names if col != target]]
    .select_dtypes(include="number")
    .columns.tolist()
)

# NOTE: the type of X depends on the branch (DataFrame, list of column names,
# or ndarray), as it did before this cell was fixed.
if reduction_method == "No Reduction":
    X = df[f_names]

elif reduction_method == "Corr with Target":
    # Names of the (up to) 10 predictors most correlated with the target.
    X = (
        df[candidate_cols]
        .corrwith(df[target])
        .abs()
        .sort_values(ascending=False)
        .head(10)
        .index.tolist()
    )

elif reduction_method == "Tree-based":
    # Pre-select by correlation, then rank by decision-tree feature importance.
    top_corr = (
        df[candidate_cols]
        .corrwith(df[target])
        .abs()
        .sort_values(ascending=False)
        .head(10)
        .index.tolist()
    )
    tree_regressor = DecisionTreeRegressor()
    tree_regressor_params = {
        "max_depth": max_depth_list,
        # max_features cannot exceed the number of columns actually supplied.
        "max_features": [min(m, len(top_corr)) for m in max_feat],
        "random_state": [1],
    }
    tree_regressor_grid = GridSearchCV(tree_regressor, tree_regressor_params, cv=5)
    tree_regressor_grid.fit(df[top_corr], df[target])

    # feature_importances_ is positional; map positions back to column names
    # so X holds feature names rather than bare indices.
    importances = tree_regressor_grid.best_estimator_.feature_importances_
    ranked_positions = importances.argsort()[::-1][:10]
    X = [top_corr[pos] for pos in ranked_positions]

elif reduction_method == "PCA":
    # n_components cannot exceed the number of input columns.
    pca = PCA(n_components=min(max_feat[0], len(candidate_cols)))
    X = pca.fit_transform(df[candidate_cols])


In [None]:
# Count missing values per column again, now that the feature-reduction cell has run.
df.isna().sum()

In [None]:
models = []

# Shorthand for the per-algorithm settings in the UI config.
algos = data["algorithms"]


def _spaced(cfg, low_key, high_key, num=5):
    """Return ``num`` evenly spaced floats from cfg[low_key] to cfg[high_key] as a plain list."""
    return np.linspace(cfg[low_key], cfg[high_key], num=num).tolist()


def _iterations(cfg):
    """Return every integer from cfg["min_iter"] to cfg["max_iter"] (inclusive) as a list."""
    return list(range(int(cfg["min_iter"]), int(cfg["max_iter"]) + 1))


def _inverse_strengths(strengths):
    """Convert regularisation strengths to LogisticRegression's inverse ``C`` values.

    Non-positive strengths cannot be inverted and are dropped; if none remain,
    scikit-learn's default C=1.0 is used so the grid is never empty.
    """
    return [1.0 / s for s in strengths if s > 0] or [1.0]


# Hyper-parameter candidates per model name. Keys are *estimator-level*
# parameter names and every value is a list of candidates, which is what
# GridSearchCV requires: a bare string/number, or an array wrapped in a
# one-element list, is rejected or treated as a single invalid candidate.
# When the estimator sits inside a Pipeline the keys still need the
# "<step>__" prefix.
model_param_grid = {
    "Random Forest Classifier": {
        "n_estimators": [algos["RandomForestClassifier"]["min_trees"]],
        "max_depth": [algos["RandomForestClassifier"]["max_depth"]],
        "min_samples_leaf": [algos["RandomForestClassifier"]["min_samples_per_leaf_min_value"]],
        "n_jobs": [algos["RandomForestClassifier"]["parallelism"]],
    },
    "Random Forest Regressor": {
        "n_estimators": [algos["RandomForestRegressor"]["min_trees"]],
        "max_depth": [algos["RandomForestRegressor"]["max_depth"]],
        "min_samples_leaf": [algos["RandomForestRegressor"]["min_samples_per_leaf_min_value"]],
        "n_jobs": [algos["RandomForestRegressor"]["parallelism"]],
    },
    "Gradient Boosting Classifier": {
        "n_estimators": [algos["GBTClassifier"]["num_of_BoostingStages"][0]],
        "learning_rate": [algos["GBTClassifier"]["learningRate"]],
        "subsample": [algos["GBTClassifier"]["min_subsample"]],
        # NOTE(review): "min_depth" is fed to min_samples_split — confirm this mapping is intended.
        "min_samples_split": [algos["GBTClassifier"]["min_depth"]],
        "max_depth": [algos["GBTClassifier"]["max_depth"]],
    },
    "Gradient Boosting Regressor": {
        "n_estimators": [algos["GBTRegressor"]["num_of_BoostingStages"][0]],
        "subsample": [algos["GBTRegressor"]["min_subsample"]],
        # NOTE(review): same "min_depth" -> min_samples_split mapping as the classifier.
        "min_samples_split": [algos["GBTRegressor"]["min_depth"]],
        "max_depth": [algos["GBTRegressor"]["max_depth"]],
    },
    # Ordinary least squares has no iteration count or regularisation terms,
    # so only n_jobs is tunable.
    "Linear Regression": {
        "n_jobs": [algos["LinearRegression"]["parallelism"]],
    },
    "Logistic Regression": {
        # Prefer the model's own parallelism setting; fall back to the
        # LinearRegression value when it is absent.
        "n_jobs": [
            algos["LogisticRegression"].get(
                "parallelism", algos["LinearRegression"]["parallelism"]
            )
        ],
        # LogisticRegression is regularised through C (inverse strength);
        # it has no reg_alpha parameter.
        "C": _inverse_strengths(_spaced(algos["LogisticRegression"], "min_regparam", "max_regparam")),
        # l1_ratio only takes effect with the elastic-net penalty, which
        # scikit-learn supports with the "saga" solver.
        "penalty": ["elasticnet"],
        "solver": ["saga"],
        "l1_ratio": _spaced(algos["LogisticRegression"], "min_elasticnet", "max_elasticnet"),
        "max_iter": _iterations(algos["LogisticRegression"]),
    },
    "Ridge Regression": {
        "alpha": _spaced(algos["RidgeRegression"], "min_regparam", "max_regparam"),
        "max_iter": _iterations(algos["RidgeRegression"]),
    },
    "Lasso Regression": {
        "alpha": _spaced(algos["LassoRegression"], "min_regparam", "max_regparam"),
        "max_iter": _iterations(algos["LassoRegression"]),
    },
    "Elastic Net Regression": {
        "alpha": _spaced(algos["ElasticNetRegression"], "min_regparam", "max_regparam"),
        "l1_ratio": _spaced(algos["ElasticNetRegression"], "min_elasticnet", "max_elasticnet"),
        "max_iter": _iterations(algos["ElasticNetRegression"]),
    },
    "XGB Regressor": {
        "objective": ["reg:squarederror"],
        "use_label_encoder": [False],
        "random_state": [algos["xg_boost"]["random_state"]],
        # NOTE(review): n_estimators is read from "parallelism" — this looks
        # like it should be the configured tree count; confirm against the JSON schema.
        "n_estimators": [algos["xg_boost"]["parallelism"]],
        # NOTE(review): early stopping normally needs a validation set passed
        # to fit(); confirm this works inside GridSearchCV with the installed xgboost.
        "early_stopping_rounds": [algos["xg_boost"]["early_stopping_rounds"]],
        "max_depth": [algos["xg_boost"]["max_depth_of_tree"][0]],
        "learning_rate": [algos["xg_boost"]["learningRate"][0]],
        "reg_alpha": [algos["xg_boost"]["l1_regularization"][0]],
        "reg_lambda": [algos["xg_boost"]["l2_regularization"][0]],
        "gamma": [algos["xg_boost"]["gamma"][0]],
        "min_child_weight": [algos["xg_boost"]["min_child_weight"][0]],
        "subsample": [algos["xg_boost"]["sub_sample"][0]],
        "colsample_bytree": [algos["xg_boost"]["col_sample_by_tree"][0]],
    },
    "Decision Tree Regressor": {
        # No "criterion" entry: "gini"/"entropy" are classification criteria
        # and are rejected by DecisionTreeRegressor, so its default is used.
        "splitter": ["best" if algos["DecisionTreeRegressor"]["use_best"] else "random"],
        "min_samples_leaf": [algos["DecisionTreeRegressor"]["min_samples_per_leaf"]],
        "max_depth": [algos["DecisionTreeRegressor"]["max_depth"]],
    },
    "Decision Tree Classifier": {
        "criterion": ["gini" if algos["DecisionTreeClassifier"]["use_gini"] else "entropy"],
        "splitter": ["best" if algos["DecisionTreeClassifier"]["use_best"] else "random"],
        "min_samples_leaf": [algos["DecisionTreeClassifier"]["min_samples_per_leaf"]],
        "max_depth": [algos["DecisionTreeClassifier"]["max_depth"]],
    },
    "Support Vector Machine": {
        "kernel": ["linear", "rbf", "poly", "sigmoid"],
        "C": [algos["SVM"]["c_value"]],
        # Accept both a JSON boolean (True) and the string "true".
        "gamma": ["scale", "auto"] if str(algos["SVM"]["rep_kernel"]).lower() == "true" else [0.1],
        "tol": [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
        "max_iter": [algos["SVM"]["max_iterations"]],
    },
    "Stochastic Gradient Descent": {
        # "log_loss" is the current scikit-learn name for the logistic loss
        # (the older alias "log" has been removed).
        "loss": ["log_loss" if algos["SGD"]["use_logistics"] else "hinge"],
        "penalty": ["elasticnet" if algos["SGD"]["use_elastic_net_regularization"] else "l2"],
        "alpha": [algos["SGD"]["alpha_value"]],
        "l1_ratio": [0.5],  # Only relevant if using elastic net regularization
        "max_iter": [algos["SGD"]["max_iterations"] if algos["SGD"]["max_iterations"] else 1000],
        "tol": [algos["SGD"]["tolerance"]],
        "random_state": [0],
        "n_jobs": [algos["SGD"]["parallelism"]],
    },
    "KNN": {
        "n_neighbors": [algos["KNN"]["k_value"][0]],
        "weights": ["distance" if algos["KNN"]["distance_weighting"] else "uniform"],
        # NOTE(review): the value must be one scikit-learn accepts
        # ("auto", "ball_tree", "kd_tree", "brute") — confirm what the UI emits.
        "algorithm": [algos["KNN"]["neighbour_finding_algorithm"]],
        "p": [algos["KNN"]["p_value"]],
    },
    "Extra Random Trees": {
        "n_estimators": [algos["extra_random_trees"]["num_of_trees"]],
        "max_depth": [algos["extra_random_trees"]["max_depth"]],
        "min_samples_leaf": [algos["extra_random_trees"]["min_samples_per_leaf"]],
    },
    "Neural Network": {
        "hidden_layer_sizes": [algos["neural_network"]["hidden_layer_sizes"]],
        "activation": ["relu", "tanh"],
        "solver": ["adam"],
        "early_stopping": [True],
        "shuffle": [True],
    },
}

In [None]:
# Build the list of (display name, estimator) candidates for the configured task.
# Each display name must match a key of model_param_grid exactly, because the
# training loop looks the grid up by name.

# Start from an empty list so re-running this cell does not append duplicates.
models = []

if data["target"]["prediction_type"] == "Regression":
    models.extend([
        ("Linear Regression", LinearRegression()),
        ("Random Forest Regressor", RandomForestRegressor()),
        ("Gradient Boosting Regressor", GradientBoostingRegressor()),
        ("Ridge Regression", Ridge()),
        # Spelling corrected from "Lasso Regresstion", which matched no grid key.
        ("Lasso Regression", Lasso()),
        ("Elastic Net Regression", ElasticNet()),
        ("XGB Regressor", XGBRegressor()),
        ("Decision Tree Regressor", DecisionTreeRegressor()),
    ])

elif data["target"]["prediction_type"] == "Classification":
    # Random Forest Classifier is listed once; a second identical entry only
    # repeated the same search.
    models.extend([
        ("Random Forest Classifier", RandomForestClassifier()),
        ("Gradient Boosting Classifier", GradientBoostingClassifier()),
        ("Logistic Regression", LogisticRegression()),
        ("Decision Tree Classifier", DecisionTreeClassifier()),
        ("Support Vector Machine", SVC()),
        ("Stochastic Gradient Descent", SGDClassifier()),
        ("KNN", KNeighborsClassifier()),
        ("Extra Random Trees", ExtraTreesClassifier()),
        ("Neural Network", MLPClassifier()),
    ])

In [None]:
# Predictor matrix: every configured column except the target and the
# species label. Response vector: the target column.
x = df[[col for col in f_names if col not in (target, 'species')]]
y = df[target]

In [None]:
# Fraction of rows held out for evaluation.
TEST_SIZE = 0.33

train_cfg = data["train"]

# Only a random hold-out split is implemented. Fail loudly for anything else:
# otherwise X_train is never assigned and the failure surfaces later as a
# NameError (or, on a stale kernel, as silently reused old variables).
if train_cfg["policy"] != "Split the dataset":
    raise NotImplementedError(
        "Unsupported train policy: {!r}".format(train_cfg["policy"])
    )
if train_cfg["split"] != "Randomly":
    raise NotImplementedError(
        "Unsupported split strategy: {!r}".format(train_cfg["split"])
    )

X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=TEST_SIZE, random_state=train_cfg["random_seed"]
)
X_train

Unnamed: 0,sepal_length,sepal_width,petal_length
69,5.6,2.5,3.9
135,7.7,3.0,6.1
56,6.3,3.3,4.7
80,5.5,2.4,3.8
123,6.3,2.7,4.9
...,...,...,...
9,4.9,3.1,1.5
103,6.3,2.9,5.6
67,5.8,2.7,4.1
117,7.7,3.8,6.7


In [None]:
# Display the (name, estimator) pairs that the training loop iterates over.
models

[('Linear Regression', LinearRegression()),
 ('Random Forest Regressor', RandomForestRegressor()),
 ('Gradient Boosting Regressor', GradientBoostingRegressor()),
 ('Ridge Regression', Ridge()),
 ('Lasso Regresstion', Lasso()),
 ('Elastic Net Regression', ElasticNet()),
 ('XGB Regressor',
  XGBRegressor(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=None, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               n_estimators=100, n_jobs=None, num_parallel_tree=None,
   

In [None]:
def _pipeline_param_grid(param_grid, step="model"):
    """Adapt an estimator-level parameter grid for use on a Pipeline.

    GridSearchCV addresses a pipeline step's parameters as ``<step>__<param>``
    and requires every value to be a sequence of candidates.

    Args:
        param_grid: dict of parameter name -> candidates, a list of such
            dicts, or None/empty.
        step: name of the pipeline step that holds the estimator.

    Returns:
        A grid of the same shape whose keys carry the ``<step>__`` prefix
        (keys that already have it are left alone) and whose scalar values
        are wrapped in one-element lists. None/empty input yields ``{}``,
        which makes GridSearchCV evaluate the estimator's defaults once.
    """
    if not param_grid:
        return {}
    if isinstance(param_grid, (list, tuple)):
        return [_pipeline_param_grid(grid, step) for grid in param_grid]

    prefix = step + "__"
    adapted = {}
    for key, candidates in param_grid.items():
        # A bare string or number is a single candidate, not a sequence of them.
        if isinstance(candidates, (str, bytes)) or not hasattr(candidates, "__iter__"):
            candidates = [candidates]
        adapted[key if key.startswith(prefix) else prefix + key] = candidates
    return adapted


is_regression = data["target"]["prediction_type"] == "Regression"
# Treat 0 as "not set": joblib rejects n_jobs=0.
search_jobs = data["hyperparameters"]["parallelism"] or None

for name, model in models:
    pipeline = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("model", model),
    ])
    param_grid = _pipeline_param_grid(model_param_grid.get(name))
    grid = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=data["hyperparameters"]["num_of_folds"],
        n_jobs=search_jobs,
    )

    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)

    # Metrics compare true targets with predictions (y_true, y_pred) — never
    # the feature matrix — and must match the task type.
    if is_regression:
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print("{} MSE: {}, R2: {}".format(name, mse, r2))
    else:
        # ROC AUC needs class probabilities, which not every classifier
        # exposes (e.g. SVC without probability=True).
        auc = None
        if hasattr(grid, "predict_proba"):
            proba = grid.predict_proba(X_test)
            if proba.shape[1] == 2:
                auc = roc_auc_score(y_test, proba[:, 1])
            else:
                auc = roc_auc_score(y_test, proba, multi_class="ovr")
        print("{} AUC: {}".format(name, auc))
        print(classification_report(y_test, y_pred))


ValueError: ignored