In [1]:
import pandas as pd
import joblib
import modeltraining as mt
import numpy as np
import mlflow
from sklearn.metrics import accuracy_score

In [2]:
# Training set; the training cells read its "text" and "sentiment" columns.
data = pd.read_csv(filepath_or_buffer="./data/Twitter_Data.csv")

In [3]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [4]:
EXPERIMENT_NAME = "mlflow-GBM"

# mlflow.create_experiment raises when the name is already taken, which would
# make this cell fail on every re-run of the notebook. Look the experiment up
# first and only create it when it does not exist yet.
_existing_experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
EXPERIMENT_ID = (
    _existing_experiment.experiment_id
    if _existing_experiment is not None
    else mlflow.create_experiment(EXPERIMENT_NAME)
)

In [5]:
# Sweep over n_estimators; each setting is trained once and recorded as its
# own MLflow run under EXPERIMENT_ID.
for idx, tree in enumerate((50, 100, 150, 200)):
    RUN_NAME = f"run_{idx}"

    # mt.train_model returns a pair, bound here as (vectorize, accuracy).
    # NOTE(review): presumably it fits `gbm` in place and reports a held-out
    # accuracy -- confirm against modeltraining.
    gbm = GradientBoostingClassifier(n_estimators=tree)
    vectorize, accuracy = mt.train_model(
        corpus=data["text"],
        target=data["sentiment"],
        model=gbm,
        model_name="GBM",
    )

    with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name=RUN_NAME) as run:
        # Keep the run id around for later lookup
        RUN_ID = run.info.run_id

        # Hyper-parameter, score and estimator for this run
        mlflow.log_param("n_estimators", tree)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(gbm, "classifier")
    




In [5]:
# Retrain with the last `gbm` left over from the sweep cell (that cell must
# have run first). mt.train_model returns a pair -- the sweep cell unpacks it
# as (vectorize, accuracy) -- so unpack it here as well; binding the whole
# result to `vectorize` would leave a tuple and break the later
# `vectorize.transform(...)` call.
vectorize, accuracy = mt.train_model(
    corpus=data["text"],
    target=data["sentiment"],
    model=gbm,
    model_name="GBM",
)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Reload the persisted classifier from disk.
# NOTE(review): presumably this file is written by mt.train_model(model_name="GBM") -- confirm.
# joblib.load unpickles its input, so only point it at files you trust.
loaded_model = joblib.load(filename="./saved_models/GBM.sav")

In [9]:
# Evaluation set; later cells read its "text" and "sentiment" columns.
text = pd.read_csv(filepath_or_buffer="./data/test_data.csv")

In [10]:
# Turn the evaluation documents into features with the vectorizer returned by
# mt.train_model (transform only -- nothing is refit here).
test_corpus = text["text"]
tf_x_test = vectorize.transform(test_corpus)

In [12]:
# Predict labels for the vectorized evaluation documents using the model
# reloaded from disk.
# NOTE(review): assumes the saved model was fitted with the same vectorizer
# that produced tf_x_test -- confirm.
y_pred = loaded_model.predict(tf_x_test)

In [13]:
from sklearn.metrics import accuracy_score

In [15]:
# scikit-learn's signature is accuracy_score(y_true, y_pred). Pass by keyword
# so ground truth and predictions cannot be swapped -- accuracy happens to be
# symmetric, but the same call with an asymmetric metric would be wrong.
accuracy_score(y_true=text["sentiment"], y_pred=y_pred)

0.5