In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from typing import Tuple

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import root_mean_squared_error, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import mlflow
import os

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, f
from hyperopt.pyll import scope


In [2]:

# Turn on MLflow autologging before any model is fitted.
mlflow.autolog()
# Root folder of the local MLflow file store. It used to be a fixed absolute
# path on one machine; set CROP_MLFLOW_DIR to relocate it. The default is the
# original location, so existing runs are still found.
mlflow_dir = os.environ.get(
    "CROP_MLFLOW_DIR",
    "/home/sam/Documents/projects/practice/mlops/crop-recomendation",
)
# mlflow.set_tracking_uri(f"sqlite:///{os.path.join(mlflow_dir, 'mlflow.db')}")
# mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_tracking_uri(os.path.join(mlflow_dir, "mlruns"))

# Kept as the last expression so the cell still displays the Experiment.
mlflow.set_experiment("crop-recommendation")



2024/07/18 11:16:53 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


<Experiment: artifact_location='/home/sam/Documents/projects/practice/mlops/crop-recomendation/mlruns/885963556934924601', creation_time=1721251298940, experiment_id='885963556934924601', last_update_time=1721251298940, lifecycle_stage='active', name='crop-recommendation', tags={}>

In [3]:
def read(filename: str) -> pd.DataFrame:
    """Load the dataset and encode its ``label`` column.

    The reader is picked from the file extension. A ``LabelEncoder`` is then
    fitted on ``df['label']``, the column is overwritten with the encoded
    values, and the fitted encoder is pickled to
    ``../models/label_encoder.bin`` so predictions can be decoded later.

    Args:
        filename: Path to a ``.csv`` or ``.parquet`` file containing a
            ``label`` column.

    Returns:
        The loaded frame with ``label`` replaced by its encoded values.

    Raises:
        ValueError: If ``filename`` ends in neither ``.csv`` nor ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Without this branch ``df`` stayed unbound and the function crashed
        # further down with an UnboundLocalError; fail early and clearly.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected '.csv' or '.parquet'"
        )

    label_encoder = LabelEncoder()

    # Fit and transform the data
    df['label'] = label_encoder.fit_transform(df['label'])

    # Persist the encoder; predict() reads this file to decode class ids.
    with open('../models/label_encoder.bin', 'wb') as f_out:
        pickle.dump(label_encoder, f_out)

    return df

In [4]:
def preprocess(
    df: pd.DataFrame,
    test_size: float = 0.2,
    random_state=None,
) -> Tuple[np.ndarray, np.ndarray, pd.Series, pd.Series]:
    """Split the frame into train/test sets and standardize the features.

    The ``label`` column is the target; ``label`` and ``P`` are dropped from
    the features. The scaler is fitted on the training features only and then
    applied to the test features.

    Args:
        df: Frame containing at least the ``label`` and ``P`` columns.
        test_size: Fraction of rows held out for testing (default 0.2, the
            previously hard-coded value).
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` keeps the original unseeded behaviour; pass an int to
            make the split reproducible across kernel restarts.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The feature arrays come from
        ``StandardScaler``; the targets are the split of ``df['label']``.
    """
    features = df.drop(['label', 'P'], axis=1)
    target = df['label']
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        shuffle=True,
        random_state=random_state,
    )

    # Standardize the data: statistics come from the training split only.
    # NOTE(review): the fitted scaler is neither returned nor saved, so raw
    # inputs cannot be scaled the same way at prediction time.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

In [5]:
def train(X_train: np.ndarray, y_train: np.ndarray) -> LogisticRegression:
    """Fit a default ``LogisticRegression`` and save it to disk.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets.

    Returns:
        The fitted model, which is also pickled to ``../models/log_reg.bin``.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    model = LogisticRegression().fit(X_train, y_train)

    # Persist the fitted model; evaluate() and predict() load this file.
    with open('../models/log_reg.bin', 'wb') as model_file:
        pickle.dump(model, model_file)

    return model

    

In [6]:
def evaluate(X_test: np.ndarray, y_test: np.ndarray, lr_model=None) -> float:
    """Score a classifier on the test split and print a short report.

    Args:
        X_test: Test feature matrix.
        y_test: True test targets.
        lr_model: Fitted model exposing ``predict``. When ``None`` the model
            is loaded from ``../models/log_reg.bin``.

    Returns:
        The accuracy on ``(X_test, y_test)``. The accuracy and the confusion
        matrix are also printed.
    """
    # Identity check: ``== None`` would dispatch to the object's own __eq__.
    if lr_model is None:
        with open('../models/log_reg.bin', 'rb') as file:
            lr_model = pickle.load(file)

    y_pred = lr_model.predict(X_test)

    acc_score = accuracy_score(y_test, y_pred)
    # Computed once and reused below (it used to be computed twice).
    confu_mat = confusion_matrix(y_test, y_pred)

    print(f'Accuracy: {acc_score}')
    print(f'Confusion Matrix:\n {confu_mat}')

    return acc_score


In [7]:
def predict(
        feature1: float, feature2: float,
        feature3: float, feature4: float,
        feature5: float, feature6: float
        ) -> None:
    """Predict the label for one sample and print it.

    Loads the pickled label encoder and logistic-regression model from
    ``../models``, predicts on the six values as a single row, and prints
    both the encoded prediction and the decoded label. Nothing is returned.

    NOTE(review): in this notebook the model is fitted on the output of
    ``StandardScaler`` (see ``preprocess``), but no scaler is loaded or
    applied here. The inputs presumably have to be standardized by the
    caller - TODO confirm.
    """
    with open('../models/label_encoder.bin', 'rb') as encoder_file:
        encoder = pickle.load(encoder_file)

    with open('../models/log_reg.bin', 'rb') as model_file:
        classifier = pickle.load(model_file)

    # Shape the six values into a single-row, six-column matrix.
    values = (feature1, feature2, feature3, feature4, feature5, feature6)
    sample = np.array(values).reshape(1, -1)

    encoded = classifier.predict(sample)
    print("Prediction (encoded):", encoded)

    # Map the class id back to the original label text.
    decoded = encoder.inverse_transform([round(encoded[0])])
    print("Prediction (original label):", decoded[0])


In [9]:
# Load the dataset; read() also encodes `label` and saves the encoder.
df = read(filename="../data/Crop_recommendation.csv")

In [10]:
# Split into train/test sets and standardize the features.
X_train, X_test, y_train, y_test = preprocess(df=df)

In [19]:
def objective(params: dict):
    """Train and score one hyperparameter candidate inside an MLflow run.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. The run is tagged, ``params`` are logged, and the accuracy is
    logged as ``test_acc_score``.

    Args:
        params: Mapping with the keys ``C``, ``solver``, ``max_iter`` and
            ``random_state``.

    Returns:
        The negated accuracy, so that minimising the return value maximises
        accuracy.

    NOTE(review): candidates are scored on ``X_test``, the same split that
    ``evaluate`` reports on, so the tuned score is likely optimistic.
    Consider a separate validation split.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "LogisticRegression")
        mlflow.log_params(params)

        # Forward only the settings the estimator is configured with.
        estimator_settings = {
            key: params[key]
            for key in ('C', 'solver', 'max_iter', 'random_state')
        }
        classifier = LogisticRegression(**estimator_settings)
        classifier.fit(X_train, y_train)

        test_accuracy = accuracy_score(y_test, classifier.predict(X_test))
        mlflow.log_metric("test_acc_score", test_accuracy)

    return -test_accuracy

    # return {'accuracy_score': -acc_score, 'status': STATUS_OK}

# Define the search space
# Keys match the names objective() reads from its `params` argument.
search_space = {
    # Per the hyperopt docs, loguniform draws exp(uniform(low, high)),
    # so C is sampled between about e^-5 and e^5.
    'C': hp.loguniform('C', -5, 5),
    # 'penalty': hp.choice('penalty', [None, 'l2']),
    'solver': hp.choice('solver', ['liblinear', 'lbfgs']),
    # Candidates run from 100 to 1000 in steps of 100; scope.int casts the
    # sampled value to an integer.
    'max_iter': scope.int(hp.quniform('max_iter', 100, 1000, 100)),
    # Constant, not searched: every trial uses the same estimator seed.
    'random_state': 42
}

# Optimize the hyperparameters
# Runs 50 TPE-guided evaluations of objective(); the progress bar reports the
# lowest loss so far, i.e. the negated best accuracy.
# NOTE(review): per the hyperopt docs, the returned dict reports hp.choice
# parameters (here 'solver') as an index into the option list, not the option
# itself - confirm before reusing best_result to build a model.
# NOTE(review): the Trials object is created inline and not bound to a name,
# so the per-trial history cannot be inspected after the search.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)



  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]




  2%|▏         | 1/50 [00:02<02:02,  2.49s/trial, best loss: -0.7318181818181818]




  4%|▍         | 2/50 [00:04<01:49,  2.28s/trial, best loss: -0.7863636363636364]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



  6%|▌         | 3/50 [00:06<01:49,  2.32s/trial, best loss: -0.7863636363636364]




  8%|▊         | 4/50 [00:09<01:43,  2.24s/trial, best loss: -0.9340909090909091]




 10%|█         | 5/50 [00:11<01:39,  2.20s/trial, best loss: -0.9568181818181818]




 12%|█▏        | 6/50 [00:13<01:39,  2.27s/trial, best loss: -0.9568181818181818]




 14%|█▍        | 7/50 [00:15<01:35,  2.22s/trial, best loss: -0.9568181818181818]




 16%|█▌        | 8/50 [00:17<01:32,  2.20s/trial, best loss: -0.9568181818181818]




 18%|█▊        | 9/50 [00:20<01:32,  2.26s/trial, best loss: -0.9568181818181818]




 20%|██        | 10/50 [00:22<01:29,  2.25s/trial, best loss: -0.9568181818181818]




 22%|██▏       | 11/50 [00:24<01:26,  2.21s/trial, best loss: -0.9568181818181818]




 24%|██▍       | 12/50 [00:27<01:26,  2.29s/trial, best loss: -0.9568181818181818]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



 26%|██▌       | 13/50 [00:29<01:23,  2.25s/trial, best loss: -0.9568181818181818]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



 28%|██▊       | 14/50 [00:31<01:22,  2.29s/trial, best loss: -0.9568181818181818]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



 30%|███       | 15/50 [00:33<01:18,  2.24s/trial, best loss: -0.9568181818181818]




 32%|███▏      | 16/50 [00:35<01:15,  2.23s/trial, best loss: -0.9568181818181818]




 34%|███▍      | 17/50 [00:38<01:15,  2.29s/trial, best loss: -0.9568181818181818]




 36%|███▌      | 18/50 [00:40<01:12,  2.25s/trial, best loss: -0.9568181818181818]




 38%|███▊      | 19/50 [00:42<01:08,  2.22s/trial, best loss: -0.9568181818181818]




 40%|████      | 20/50 [00:45<01:08,  2.29s/trial, best loss: -0.9568181818181818]




 42%|████▏     | 21/50 [00:47<01:05,  2.25s/trial, best loss: -0.9681818181818181]




 44%|████▍     | 22/50 [00:49<01:02,  2.23s/trial, best loss: -0.9704545454545455]




 46%|████▌     | 23/50 [00:52<01:02,  2.31s/trial, best loss: -0.9704545454545455]




 48%|████▊     | 24/50 [00:54<00:59,  2.29s/trial, best loss: -0.9704545454545455]




 50%|█████     | 25/50 [00:56<00:59,  2.37s/trial, best loss: -0.9704545454545455]




 52%|█████▏    | 26/50 [00:59<00:55,  2.31s/trial, best loss: -0.9704545454545455]




 54%|█████▍    | 27/50 [01:01<00:51,  2.26s/trial, best loss: -0.9704545454545455]




 56%|█████▌    | 28/50 [01:03<00:50,  2.31s/trial, best loss: -0.9704545454545455]




 58%|█████▊    | 29/50 [01:05<00:47,  2.28s/trial, best loss: -0.9704545454545455]




 60%|██████    | 30/50 [01:07<00:44,  2.25s/trial, best loss: -0.9704545454545455]




 62%|██████▏   | 31/50 [01:10<00:44,  2.33s/trial, best loss: -0.9704545454545455]




 64%|██████▍   | 32/50 [01:12<00:41,  2.29s/trial, best loss: -0.9704545454545455]




 66%|██████▌   | 33/50 [01:14<00:38,  2.25s/trial, best loss: -0.9704545454545455]




 68%|██████▊   | 34/50 [01:17<00:36,  2.31s/trial, best loss: -0.9704545454545455]




 70%|███████   | 35/50 [01:19<00:34,  2.28s/trial, best loss: -0.9704545454545455]




 72%|███████▏  | 36/50 [01:21<00:31,  2.26s/trial, best loss: -0.9704545454545455]




 74%|███████▍  | 37/50 [01:24<00:30,  2.33s/trial, best loss: -0.9704545454545455]




 76%|███████▌  | 38/50 [01:26<00:27,  2.26s/trial, best loss: -0.9704545454545455]




 78%|███████▊  | 39/50 [01:28<00:24,  2.25s/trial, best loss: -0.9704545454545455]




 80%|████████  | 40/50 [01:30<00:23,  2.31s/trial, best loss: -0.9704545454545455]




 82%|████████▏ | 41/50 [01:33<00:20,  2.26s/trial, best loss: -0.9704545454545455]




 84%|████████▍ | 42/50 [01:35<00:17,  2.22s/trial, best loss: -0.9704545454545455]




 86%|████████▌ | 43/50 [01:37<00:15,  2.28s/trial, best loss: -0.9704545454545455]




 88%|████████▊ | 44/50 [01:39<00:13,  2.25s/trial, best loss: -0.9704545454545455]




 90%|█████████ | 45/50 [01:42<00:11,  2.28s/trial, best loss: -0.9704545454545455]




 92%|█████████▏| 46/50 [01:44<00:08,  2.25s/trial, best loss: -0.9704545454545455]




 94%|█████████▍| 47/50 [01:46<00:06,  2.22s/trial, best loss: -0.9704545454545455]




 96%|█████████▌| 48/50 [01:49<00:05,  2.58s/trial, best loss: -0.9704545454545455]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



 98%|█████████▊| 49/50 [01:52<00:02,  2.71s/trial, best loss: -0.9704545454545455]




100%|██████████| 50/50 [01:55<00:00,  2.31s/trial, best loss: -0.9704545454545455]


In [1]:
# Show the best parameters returned by fmin. The tuning cell must have run in
# the current kernel first; otherwise this raises NameError.
best_result

NameError: name 'best_result' is not defined

In [45]:
# train() accepts only the training split; passing X_test/y_test as well
# raises TypeError. The test split is used by evaluate() instead.
lr = train(X_train, y_train)



In [84]:
# Score the in-memory model on the held-out split (prints the report).
acc_score = evaluate(X_test, y_test, lr_model=lr)

Accuracy: 0.9386363636363636
Confusion Matrix:
 [[26  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 21  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 19  0  0  0  0  0  0  0  3  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 18  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 15  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 20  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 25  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 23  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 24  0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 21  0  0  0  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 19  0  0  4  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 14  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0 22  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  2  0  1 15  0  0  0  0

In [89]:
# Predict one hand-entered sample using the models saved on disk.
predict(
    feature1=10,
    feature2=81,
    feature3=10.879744,
    feature4=2.002744,
    feature5=3.502985,
    feature6=2.935536,
)

Prediction (encoded): [7]
Prediction (original label): grapes


In [None]:
# DataFrame.sort_values() requires `by`; calling it bare raises TypeError.
# NOTE(review): sorting by the encoded `label` column is a guess at the
# intent - adjust the key if another column was meant.
# .head() keeps the preview short instead of displaying the whole frame.
df.sort_values(by="label").head()