# **Using AutoML for Modeling**

**Pulling Final Dataset from Github**

In [2]:
!git clone https://github.com/rajbhanb/MLOPSFinalProject.git
%cd MLOPSFinalProject


fatal: destination path 'MLOPSFinalProject' already exists and is not an empty directory.
/content/MLOPSFinalProject


In [3]:
import pandas as pd

df = pd.read_csv("data/v2_final/final_data.csv")
df


Unnamed: 0,Lag_1_SPY_Close,Lag_1_SPY_Daily_Return,Lag_1_SPY_7D_Return,Lag_1_SPY_30D_Return,Lag_1_VIX_Daily_Change,Lag_1_VIX_7D_MA,Lag_1_VIX_30D_MA,Lag_1_VIX_Close,Lag_1_VADER_Compound_Score,Lag_1_TextBlob_Polarity,Lag_1_TextBlob_Subjectivity,Sentiment_Label
0,97.614151,0.002952,0.021417,-0.037326,-0.046503,25.622857,26.366667,24.400000,-0.5106,0.000000,0.000000,2
1,96.802612,-0.008314,0.007775,-0.029660,0.029508,25.268572,26.356333,25.120001,0.0000,0.380000,0.770000,2
2,99.265999,-0.001012,0.022791,-0.021659,0.036073,23.827143,26.274000,22.690001,0.0000,0.200000,0.200000,2
3,98.296463,-0.009767,0.009962,-0.009408,0.037021,23.532857,26.280333,23.530001,0.5267,0.000000,0.000000,2
4,95.509941,-0.003820,-0.019392,0.007042,-0.028919,24.212858,26.224334,25.520000,-0.4019,-0.062500,0.625000,2
...,...,...,...,...,...,...,...,...,...,...,...,...
3490,497.062714,0.000690,0.027870,0.065658,-0.054333,14.525714,13.750667,13.750000,0.5106,0.433333,0.733333,1
3491,495.242188,-0.003663,0.014882,0.062223,-0.000727,14.434286,13.794000,13.740000,0.0000,0.000000,0.000000,1
3492,496.162231,0.001858,0.009800,0.063460,-0.022562,14.351429,13.818333,13.430000,0.0000,0.100000,0.100000,1
3493,495.506531,-0.001322,0.013513,0.065968,0.030529,14.294286,13.818333,13.840000,0.1027,0.000000,0.000000,1


In [4]:
df.dtypes

Unnamed: 0,0
Lag_1_SPY_Close,float64
Lag_1_SPY_Daily_Return,float64
Lag_1_SPY_7D_Return,float64
Lag_1_SPY_30D_Return,float64
Lag_1_VIX_Daily_Change,float64
Lag_1_VIX_7D_MA,float64
Lag_1_VIX_30D_MA,float64
Lag_1_VIX_Close,float64
Lag_1_VADER_Compound_Score,float64
Lag_1_TextBlob_Polarity,float64


In [7]:
!pip install scikit-learn imbalanced-learn flaml
%pip install flaml
%pip install --upgrade flaml[autozero] lightgbm openml pyspark pandas

Collecting openml
  Downloading openml-0.15.1-py3-none-any.whl.metadata (10 kB)
Collecting pyspark
  Downloading pyspark-4.0.1.tar.gz (434.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.2/434.2 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-1.0.2-py3-none-any.whl.metadata (15 kB)
Collecting minio (from openml)
  Downloading minio-7.2.20-py3-none-any.whl.metadata (6.5 kB)
Collecting py4j==0.10.9.9 (from pyspark)
  Downloading py4j-0.10.9.9-py2.py3-none-any.whl.metadata (1

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from flaml import AutoML
from joblib import dump
from sklearn.impute import SimpleImputer


**Load and Prepare Your Data**

In [5]:
# Separate features and target
X = df.drop(columns=['Sentiment_Label'])
y = df['Sentiment_Label']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


**Build the Pipeline**

In [20]:
# --- Pipeline 1: FLAML with class_weight='balanced' ---
# Define a custom classifier with class_weight='balanced'
def get_balanced_automl():
    automl = AutoML()
    automl_settings = {
        "time_budget": 60,  # in seconds
        "metric": 'f1',  # Use F1-score for imbalanced data
        "task": 'classification',
        "log_file_name": 'flaml_balanced.log',
        "estimator_list": ['rf', 'xgboost', 'lgbm'],  # Use tree-based models that support class_weight
        "n_jobs": 4,
        "verbose": 2,
    }
    return automl

In [21]:
pipeline_balanced = Pipeline([
    ('scaler', StandardScaler()),
    ('automl', get_balanced_automl())
])

In [22]:
# --- Pipeline 2: FLAML with SMOTE ---
pipeline_smote = ImbPipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('automl', AutoML())
])


**Compute class weights automatically due to Imbalanced Data**

In [14]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

classes = np.unique(y_train)
class_weights = compute_class_weight("balanced", classes=classes, y=y_train)
class_weight_dict = {cls: weight for cls, weight in zip(classes, class_weights)}

print("Class Weights:", class_weight_dict)

Class Weights: {np.int64(0): np.float64(3.9159663865546217), np.int64(1): np.float64(0.4983957219251337), np.int64(2): np.float64(1.3546511627906976)}


In [23]:
# Train Pipeline 1 (FLAML with class_weight='balanced' workaround)
print("Training Pipeline with class_weight='balanced' (FLAML)...")
pipeline_balanced.fit(X_train, y_train)

Training Pipeline with class_weight='balanced' (FLAML)...
[flaml.automl.logger: 12-08 02:24:36] {1752} INFO - task = classification
[flaml.automl.logger: 12-08 02:24:36] {1763} INFO - Evaluation method: cv
[flaml.automl.logger: 12-08 02:24:36] {1862} INFO - Minimizing error metric: log_loss


INFO:flaml.default.suggest:metafeature distance: 0.1704765201915977
INFO:flaml.default.suggest:metafeature distance: 0.1704765201915977
INFO:flaml.default.suggest:metafeature distance: 0.1706637663700689
INFO:flaml.default.suggest:metafeature distance: 0.1706637663700689
INFO:flaml.default.suggest:metafeature distance: 0.1706637663700689
INFO:flaml.default.suggest:metafeature distance: 0.1704765201915977


[flaml.automl.logger: 12-08 02:24:36] {1979} INFO - List of ML learners in AutoML Run: ['rf', 'lgbm', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'lrl1']
[flaml.automl.logger: 12-08 02:24:36] {2282} INFO - iteration 0, current learner rf
[flaml.automl.logger: 12-08 02:25:23] {2417} INFO - Estimated sufficient time budget=10000s. Estimated necessary time budget=10s.
[flaml.automl.logger: 12-08 02:25:23] {2466} INFO -  at 47.6s,	estimator rf's best error=0.2555,	best estimator rf's best error=0.2555
[flaml.automl.logger: 12-08 02:25:23] {2282} INFO - iteration 1, current learner lgbm




[flaml.automl.logger: 12-08 02:25:26] {2466} INFO -  at 50.0s,	estimator lgbm's best error=0.2075,	best estimator lgbm's best error=0.2075
[flaml.automl.logger: 12-08 02:25:26] {2282} INFO - iteration 2, current learner xgboost




[flaml.automl.logger: 12-08 02:26:55] {2466} INFO -  at 139.0s,	estimator xgboost's best error=0.2069,	best estimator xgboost's best error=0.2069
[flaml.automl.logger: 12-08 02:26:55] {2282} INFO - iteration 3, current learner extra_tree
[flaml.automl.logger: 12-08 02:27:43] {2466} INFO -  at 187.9s,	estimator extra_tree's best error=0.1947,	best estimator extra_tree's best error=0.1947
[flaml.automl.logger: 12-08 02:27:43] {2282} INFO - iteration 4, current learner xgb_limitdepth
[flaml.automl.logger: 12-08 02:27:46] {2466} INFO -  at 190.4s,	estimator xgb_limitdepth's best error=0.2149,	best estimator extra_tree's best error=0.1947
[flaml.automl.logger: 12-08 02:27:46] {2282} INFO - iteration 5, current learner sgd


INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[flaml.automl.logger: 12-08 02:27:47] {2466} INFO -  at 190.9s,	estimator sgd's best error=0.3925,	best estimator extra_tree's best error=0.1947
[flaml.automl.logger: 12-08 02:27:47] {2282} INFO - iteration 6, current learner lrl1


INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[flaml.automl.logger: 12-08 02:27:47] {2466} INFO -  at 191.6s,	estimator lrl1's best error=0.1978,	best estimator extra_tree's best error=0.1947
[flaml.automl.logger: 12-08 02:27:57] {2724} INFO - retrain extra_tree for 10.0s
[flaml.automl.logger: 12-08 02:27:57] {2727} INFO - retrained model: ExtraTreesClassifier(criterion='entropy', max_features=np.float64(1.0),
                     max_leaf_nodes=18344, n_estimators=2047, n_jobs=-1,
                     random_state=12032022)
[flaml.automl.logger: 12-08 02:27:57] {2009} INFO - fit succeeded
[flaml.automl.logger: 12-08 02:27:57] {2010} INFO - Time taken to find the best model: 187.8735547065735


In [24]:
# Train Pipeline 2 (FLAML with SMOTE)
print("\nTraining Pipeline with SMOTE (FLAML)...")
pipeline_smote.fit(X_train, y_train)


Training Pipeline with SMOTE (FLAML)...
[flaml.automl.logger: 12-08 02:27:57] {1752} INFO - task = classification
[flaml.automl.logger: 12-08 02:27:57] {1763} INFO - Evaluation method: cv
[flaml.automl.logger: 12-08 02:27:57] {1862} INFO - Minimizing error metric: log_loss


INFO:flaml.default.suggest:metafeature distance: 0.1862256834507508
INFO:flaml.default.suggest:metafeature distance: 0.1862256834507508
INFO:flaml.default.suggest:metafeature distance: 0.18720906239224108
INFO:flaml.default.suggest:metafeature distance: 0.18720906239224108
INFO:flaml.default.suggest:metafeature distance: 0.18720906239224108
INFO:flaml.default.suggest:metafeature distance: 0.1862256834507508


[flaml.automl.logger: 12-08 02:27:57] {1979} INFO - List of ML learners in AutoML Run: ['rf', 'lgbm', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'lrl1']
[flaml.automl.logger: 12-08 02:27:57] {2282} INFO - iteration 0, current learner rf
[flaml.automl.logger: 12-08 02:29:42] {2417} INFO - Estimated sufficient time budget=10000s. Estimated necessary time budget=10s.
[flaml.automl.logger: 12-08 02:29:42] {2466} INFO -  at 104.4s,	estimator rf's best error=0.1571,	best estimator rf's best error=0.1571
[flaml.automl.logger: 12-08 02:29:42] {2282} INFO - iteration 1, current learner lgbm




[flaml.automl.logger: 12-08 02:29:45] {2466} INFO -  at 108.0s,	estimator lgbm's best error=0.1477,	best estimator lgbm's best error=0.1477
[flaml.automl.logger: 12-08 02:29:45] {2282} INFO - iteration 2, current learner xgboost




[flaml.automl.logger: 12-08 02:31:24] {2466} INFO -  at 206.9s,	estimator xgboost's best error=0.1634,	best estimator lgbm's best error=0.1477
[flaml.automl.logger: 12-08 02:31:24] {2282} INFO - iteration 3, current learner extra_tree
[flaml.automl.logger: 12-08 02:32:36] {2466} INFO -  at 278.8s,	estimator extra_tree's best error=0.1104,	best estimator extra_tree's best error=0.1104
[flaml.automl.logger: 12-08 02:32:36] {2282} INFO - iteration 4, current learner xgb_limitdepth
[flaml.automl.logger: 12-08 02:32:40] {2466} INFO -  at 282.8s,	estimator xgb_limitdepth's best error=0.1622,	best estimator extra_tree's best error=0.1104
[flaml.automl.logger: 12-08 02:32:40] {2282} INFO - iteration 5, current learner sgd


INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[flaml.automl.logger: 12-08 02:32:41] {2466} INFO -  at 283.7s,	estimator sgd's best error=0.5306,	best estimator extra_tree's best error=0.1104
[flaml.automl.logger: 12-08 02:32:41] {2282} INFO - iteration 6, current learner lrl1


INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[flaml.automl.logger: 12-08 02:32:42] {2466} INFO -  at 285.0s,	estimator lrl1's best error=0.2105,	best estimator extra_tree's best error=0.1104
[flaml.automl.logger: 12-08 02:32:58] {2724} INFO - retrain extra_tree for 16.0s
[flaml.automl.logger: 12-08 02:32:58] {2727} INFO - retrained model: ExtraTreesClassifier(criterion='entropy', max_features=np.float64(1.0),
                     max_leaf_nodes=18344, n_estimators=2047, n_jobs=-1,
                     random_state=12032022)
[flaml.automl.logger: 12-08 02:32:58] {2009} INFO - fit succeeded
[flaml.automl.logger: 12-08 02:32:58] {2010} INFO - Time taken to find the best model: 278.768541097641


In [26]:
from sklearn.metrics import classification_report, confusion_matrix

# Predictions for Pipeline 1
y_pred_balanced = pipeline_balanced.predict(X_test)

# Predictions for Pipeline 2
y_pred_smote = pipeline_smote.predict(X_test)

# Print evaluation metrics for Pipeline 1
print("\n=== Pipeline with FLAML (class_weight='balanced' workaround) ===")
print(classification_report(y_test, y_pred_balanced))
print(confusion_matrix(y_test, y_pred_balanced))

# Print evaluation metrics for Pipeline 2
print("\n=== Pipeline with FLAML + SMOTE ===")
print(classification_report(y_test, y_pred_smote))
print(confusion_matrix(y_test, y_pred_smote))


=== Pipeline with FLAML (class_weight='balanced' workaround) ===
              precision    recall  f1-score   support

           0       0.81      0.80      0.81        60
           1       0.95      0.97      0.96       467
           2       0.84      0.80      0.82       172

    accuracy                           0.91       699
   macro avg       0.87      0.86      0.86       699
weighted avg       0.91      0.91      0.91       699

[[ 48   1  11]
 [  0 451  16]
 [ 11  23 138]]

=== Pipeline with FLAML + SMOTE ===
              precision    recall  f1-score   support

           0       0.76      0.83      0.79        60
           1       0.96      0.95      0.96       467
           2       0.82      0.81      0.81       172

    accuracy                           0.91       699
   macro avg       0.85      0.86      0.85       699
weighted avg       0.91      0.91      0.91       699

[[ 50   1   9]
 [  0 445  22]
 [ 16  17 139]]


**The Beter Model: Maximizing Fear (Class 0) Recall**

The primary goal of the model is to inform portfolio allocation, where Fear (Class 0) is the most critical and rare event, dictating a need to "go defensive."

Risk of False Negative (Type II Error): A False Negative for Fear (predicting Greed or Neutral when the sentiment is actually Fear) means the model failed to detect the danger. If we miss a true Fear signal, we remain aggressively invested, exposing the portfolio to significant, unexpected downside risk.

SMOTE Model Performance: The SMOTE model has a higher Recall for Class 0 (0.83 vs. 0.80).

This means it correctly identifies 83% of the true Fear instances, compared to 80% for the Class Weight model.

In the confusion matrix, the SMOTE model had 50 True Positives for Fear, while the Class Weight model had 48.

While the SMOTE model has a slightly lower F1-score for Fear (due to a small drop in precision), the extra safety provided by higher recall for this crucial, defensive signal is generally prioritized in risk management scenarios. Missing a defensive signal is often considered a more costly error than occasionally generating a false alarm (low precision).

**Save the Best Pipeline**

In [27]:
dump(pipeline_smote, 'flaml_balanced_sentiment_pipeline.joblib')


['flaml_balanced_sentiment_pipeline.joblib']

# Dockerize FastAPI app

sentiment_api/

├── app.py                  # FastAPI app

├── requirements.txt        # Python dependencies

├── flaml_balanced_sentiment_pipeline.joblib  # Your saved pipeline

└── Dockerfile              # Docker configuration

**Requirment Text File**

In [34]:
pip freeze | grep -E 'fastapi|uvicorn|joblib|pandas|flaml|scikit-learn' > requirements.txt


In [65]:
pip freeze | grep -E 'fastapi|uvicorn|joblib|pandas|flaml|imbalanced-learn|scikit-learn'


fastapi==0.118.3
geopandas==1.1.1
imbalanced-learn==0.14.0
joblib==1.5.2
pandas==2.3.3
pandas-datareader==0.10.0
pandas-gbq==0.30.0
pandas-stubs==2.2.2.240909
scikit-learn==1.6.1
sklearn-pandas==2.2.0
uvicorn==0.38.0


In [35]:
from google.colab import files
files.download('requirements.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [53]:
# 0️⃣ Clean up before starting (Optional, for re-running)
!rm -rf MLOPSFinalProject DockerFLAML

# 1️⃣ Set Git identity
!git config --global user.email "bimalsen07@gmail.com"
!git config --global user.name "rajbhanb"

# 2️⃣ Clone your GitHub repo
repo_url = "https://github.com/rajbhanb/MLOPSFinalProject.git"
!git clone {repo_url}
%cd MLOPSFinalProject

# 3️⃣ Create DockerFLAML folder inside the repo
import os
os.makedirs("DockerFLAML", exist_ok=True)

Cloning into 'MLOPSFinalProject'...
remote: Enumerating objects: 54, done.[K
remote: Counting objects: 100% (54/54), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 54 (delta 17), reused 31 (delta 5), pack-reused 0 (from 0)[K
Receiving objects: 100% (54/54), 1.28 MiB | 5.51 MiB/s, done.
Resolving deltas: 100% (17/17), done.
/content/DockerFLAML/MLOPSFinalProject/MLOPSFinalProject


In [54]:
# 3️⃣ Write app.py
app_py = """
from fastapi import FastAPI
from joblib import load
from pydantic import BaseModel
import pandas as pd

app = FastAPI()
pipeline = load('flaml_balanced_sentiment_pipeline.joblib')

class Features(BaseModel):
    Lag_1_SPY_Close: float
    Lag_1_SPY_Daily_Return: float
    Lag_1_SPY_7D_Return: float
    Lag_1_SPY_30D_Return: float
    Lag_1_VIX_Daily_Change: float
    Lag_1_VIX_7D_MA: float
    Lag_1_VIX_30D_MA: float
    Lag_1_VIX_Close: float
    Lag_1_VADER_Compound_Score: float
    Lag_1_TextBlob_Polarity: float
    Lag_1_TextBlob_Subjectivity: float

@app.post("/predict")
def predict(features: Features):
    input_data = pd.DataFrame([features.dict()])
    prediction = pipeline.predict(input_data)
    return {"prediction": int(prediction[0])}
"""

with open("/content/DockerFLAML/app.py", "w") as f:
    f.write(app_py)

# 4️⃣ Generate requirements.txt (minimal)
!pip freeze | grep -E 'fastapi|uvicorn|joblib|pandas|flaml|scikit-learn' > /content/DockerFLAML/requirements.txt

# 5️⃣ Write Dockerfile
dockerfile = """
FROM python:3.9-slim

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
"""

with open("/content/DockerFLAML/Dockerfile", "w") as f:
    f.write(dockerfile)

# 6️⃣ Copy your saved model into the folder
import shutil
shutil.copy("/content/MLOPSFinalProject/flaml_balanced_sentiment_pipeline.joblib", "/content/DockerFLAML/flaml_balanced_sentiment_pipeline.joblib")



'/content/DockerFLAML/flaml_balanced_sentiment_pipeline.joblib'

**Model is too big to upload to Github**

In [64]:
import shutil
import os

# Path to cloned repo
repo_path = "/content/MLOPSFinalProject"

# Move the entire DockerFLAML folder into the repo
shutil.move("/content/DockerFLAML", os.path.join(repo_path, "DockerFLAML"))

# Go to repo folder
%cd /content/MLOPSFinalProject

# Add DockerFLAML folder to Git
!git add DockerFLAML
!git status  # you should see 4 new files
!git commit -m "Add DockerFLAML files for FLAML model"

# Push to main branch
token = "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"

!git push https://{token}@github.com/rajbhanb/MLOPSFinalProject.git main

/content/MLOPSFinalProject
[33mhint: You've added another git repository inside your current repository.[m
[33mhint: Clones of the outer repository will not contain the contents of[m
[33mhint: the embedded repository and will not know how to obtain it.[m
[33mhint: If you meant to add a submodule, use:[m
[33mhint: [m
[33mhint: 	git submodule add <url> DockerFLAML/DockerFLAML/MLOPSFinalProject[m
[33mhint: [m
[33mhint: If you added this path by mistake, you can remove it from the[m
[33mhint: index with:[m
[33mhint: [m
[33mhint: 	git rm --cached DockerFLAML/DockerFLAML/MLOPSFinalProject[m
[33mhint: [m
[33mhint: See "git help submodule" for more information.[m
On branch main
Your branch is up to date with 'origin/main'.

Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	[32mnew file:   DockerFLAML/DockerFLAML/Dockerfile[m
	[32mnew file:   DockerFLAML/DockerFLAML/MLOPSFinalProject[m
	[32mnew file:   DockerFLAML/DockerFLAML/app.py[m
