In [1]:
import pandas as pd
import joblib
import modeltraining as mt
import numpy as np
import mlflow
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv("./data/Twitter_Data.csv")

In [3]:
from sklearn.ensemble import GradientBoostingClassifier

## Run First Experiment using GBM

In [4]:
EXPERIMENT_NAME = "mlflow-GBM"
EXPERIMENT_ID = mlflow.create_experiment(EXPERIMENT_NAME)

In [14]:
for idx, tree in enumerate([50, 100, 150,200]):
    for lr in [0.1, 0.05, 0.001]:
        
        gbm = GradientBoostingClassifier(n_estimators=tree, learning_rate=lr)
        vectorize, accuracy = mt.train_model(corpus=data["text"], target=data["sentiment"], model=gbm, model_name="GBM")
        joblib.dump(vectorize, "./saved_models/vectorizeGBM.sav")
    
        # Start MLflow
        RUN_NAME = f"run_{idx}"
        
        with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name=RUN_NAME) as run:
            # Retrieve run id
            RUN_ID = run.info.run_id
    
            # Track parameters
            mlflow.log_param("n_estimators", tree)
            mlflow.log_param("learning Rate", lr)

            # Track metrics
            mlflow.log_metric("accuracy", accuracy)
    
            # Track model
            mlflow.sklearn.log_model(gbm, "classifier")
        




## Run 2nd Experiment using RandomForest

In [6]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [7]:
EXPERIMENT_NAME = "mlflow-RFC"
EXPERIMENT_ID = mlflow.create_experiment(EXPERIMENT_NAME)

In [15]:
for idx, tree in enumerate([50, 100, 150,200]):
    for ccp in [0.1, 0.2, 0.5, 0.1]:
        
        rfc = RandomForestClassifier(n_estimators=tree, ccp_alpha = ccp)
        vectorize, accuracy = mt.train_model(corpus=data["text"], target=data["sentiment"], model=gbm, model_name="RFC")
        joblib.dump(vectorize, "/.saved_models/vectorizeRFC.sav")
    
        # Start MLflow
        RUN_NAME = f"run_{idx}"
        
        with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name=RUN_NAME) as run:
            # Retrieve run id
            RUN_ID = run.info.run_id
    
            # Track parameters
            mlflow.log_param("n_estimators", tree)
            mlflow.log_param("Alpha_Pruning", ccp)

            # Track metrics
            mlflow.log_metric("accuracy", accuracy)
    
            # Track model
            mlflow.sklearn.log_model(gbm, "classifier")


FileNotFoundError: [Errno 2] No such file or directory: '/.saved_models/vectorizeRFC.sav'

## Run 3rd Experiment using ExtraTreeClassifier

In [9]:
EXPERIMENT_NAME = "mlflow-etc"
EXPERIMENT_ID = mlflow.create_experiment(EXPERIMENT_NAME)

In [None]:
for idx, tree in enumerate([50, 100, 150,200]):
    for ccp in [0.1, 0.2, 0.5, 0.1]:

        etc = ExtraTreesClassifier(n_estimators= tree,ccp_alpha= ccp )
        vectorize, accuracy = mt.train_model(corpus=data["text"], target=data["sentiment"], model=gbm, model_name="ETC")
        joblib.dump(vectorize, "./saved_models/vectorizeETC.sav")
    
        # Start MLflow
        RUN_NAME = f"run_{idx}"
        
        with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name=RUN_NAME) as run:
            # Retrieve run id
            RUN_ID = run.info.run_id
    
            # Track parameters
            mlflow.log_param("n_estimators", tree)
            mlflow.log_param("Alpha_Pruning", ccp)

            # Track metrics
            mlflow.log_metric("accuracy", accuracy)
    
            # Track model
            mlflow.sklearn.log_model(gbm, "classifier")


## Checking ML Parameters Programitically

In [11]:
list_of_experiments = ["mlflow-GBM" , "mlflow-RFC", "mlflow-etc"]

In [12]:
import mlflow
import pandas as pd
from mlflow.tracking import MlflowClient

EXPERIMENT_NAME = "mlflow-GBM"

client = MlflowClient()

# Retrieve Experiment information
EXPERIMENT_ID = client.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

# Retrieve Runs information (parameter 'depth', metric 'accuracy')
ALL_RUNS = client.search_runs([EXPERIMENT_ID], "")
ALL_RUNS_ID = [run.info.run_id for run in ALL_RUNS]
ALL_PARAM = [run.data.params["n_estimators"] for run in ALL_RUNS]
ALL_METRIC = [run.data.metrics["accuracy"] for run in ALL_RUNS]

# View Runs information
run_data = pd.DataFrame({"Run ID": ALL_RUNS_ID, "Params": ALL_PARAM, "Metrics": ALL_METRIC})

# Retrieve Artifact from best run
best_run_id = run_data.sort_values("Metrics", ascending=False).iloc[0]["Run ID"]
best_model_path = client.download_artifacts(best_run_id, "classifier")
best_model = mlflow.sklearn.load_model(best_model_path)


In [13]:
best_model

## Grid Search CV on RandomForest

In [18]:
rfc

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
X_train,X_test,Y_train, Y_test = train_test_split(data["text"], data["sentiment"], test_size=0.25, random_state=30)
vectorizer= TfidfVectorizer()
tf_x_train = vectorizer.fit_transform(X_train)
tf_x_test = vectorizer.transform(X_test)

In [19]:
from sklearn.model_selection import GridSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 4)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 50, num = 5)]
params = {'n_estimators': n_estimators,
          'max_features': max_features,
          'max_depth': max_depth,}
grid_search = GridSearchCV(estimator = rfc, param_grid = params,  cv = 3, n_jobs = -1, verbose = 2)

In [26]:
## Train using GridSearch

In [28]:
grid_search = grid_search.fit(tf_x_train, Y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


60 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
19 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/BITS/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/BITS/lib/python3.10/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/opt/anaconda3/envs/BITS/lib/python3.10/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/opt/anaconda3/envs/BITS/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_par

In [29]:
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print('Best parameters are',best_parameters)

Best parameters are {'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 70}


In [5]:
#vectorize = mt.train_model(corpus=data["text"], target=data["sentiment"], model=gbm, model_name="GBM")


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

## BuildingInferencePipeline

In [49]:
loaded_model = joblib.load("./saved_models/GBM.sav")

In [50]:
loaded_model

In [76]:
testingData = "Test this data for Sentiment Analysis please"

In [82]:
ndf = pd.DataFrame()
ndf["text"] = [testingData]
testingdata = vectorize.transform(ndf["text"])
result= loaded_model.predict(testingdata)
result[0]

'neutral'