In [None]:
import mlflow
from mlflow.tracking import MlflowClient

# Show which tracking backend this kernel is configured to talk to.
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

# Client built with default arguments.
client = MlflowClient()

# Use the search_experiments function to get a list of available experiments
experiments = client.search_experiments()
# Number of experiments returned by the search.
print(len(experiments))

# Display the list of available experiments (one line per experiment: name and id)
for experiment in experiments:
    print(f"Experiment Name: {experiment.name}, Experiment ID: {experiment.experiment_id}")

Loading Data

In [None]:
import pandas as pd
def load_data(path):
    """Read the CSV file at ``path`` with pandas and return it as a DataFrame."""
    frame = pd.read_csv(path)
    return frame

# Load the raw diamonds dataset and preview the first rows.
# NOTE(review): this reads from 'Data/' while later cells read and write under
# 'data/' - on a case-sensitive filesystem these are different directories; confirm.
df = load_data('Data/diamonds.csv')
df.head()

Cleaning Data

In [None]:
# Convert byte literals to regular strings for specific columns.
# The values are expected to look like the repr of a bytes object (b'...').
# str.strip("b'") treats its argument as a *set of characters*, so it would also
# eat a legitimate leading or trailing "b" that belongs to the value itself
# (e.g. "b'bad'" would become "ad"). Remove exactly one b'...' wrapper instead;
# values that are not wrapped are left unchanged.
byte_literal_columns = ['cut', 'color', 'clarity']
for column in byte_literal_columns:
    df[column] = df[column].str.replace(r"^b'(.*)'$", r"\1", regex=True)

In [None]:
# Count missing values per column (printed as a column -> count Series)
print(df.isnull().sum())

In [None]:
# Remove any duplicate rows.
# Rebinds `df`, so every later cell sees the de-duplicated frame.
df = df.drop_duplicates()

In [None]:
# Remove any rows with missing values (a row is dropped if any column is NaN).
# Rebinds `df` again; the raw frame is no longer reachable after this cell.
df = df.dropna()

In [None]:
# Column labels of the cleaned frame.
df.columns

In [None]:
# (rows, columns) remaining after duplicates and incomplete rows were dropped.
df.shape

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Summary statistics produced by pandas' describe() with default arguments.
df.describe()

In [None]:
# Per-column dtype / non-null overview printed by pandas.
df.info()

In [None]:
# Print out unique values in each categorical column.
# Useful for checking that the byte-literal cleanup above left plain labels.
print("Unique values in 'cut' column:", df['cut'].unique())
print("Unique values in 'color' column:", df['color'].unique())
print("Unique values in 'clarity' column:", df['clarity'].unique())

In [None]:
# Preview the first rows of the cleaned frame.
df.head()

In [None]:
# Save the cleaned dataset (index=False: the row index is not written as a column).
# NOTE(review): written under 'data/' although the raw file was read from 'Data/' -
# confirm both directories are intended / exist on a case-sensitive filesystem.
df.to_csv('data/clean_diamonds_final.csv', index=False)

In [None]:
!pip3 install --upgrade matplotlib

Visualization

The bar charts below show the number of diamonds in each category, which helps explain why these variables are treated as categorical:

cut: The quality of the cut is a categorical variable because it describes the cut quality of the diamond in ordered categories such as 'Ideal', 'Premium', 'Good', etc.

color: The color of the diamond is a categorical variable because it is rated on a scale from D (best) to J (worst), representing discrete groups.

clarity: The clarity of the diamond is a categorical variable because it describes the level of flaws in the diamond using categories like 'SI1', 'VS1', 'VVS2', etc.


These visualizations will show that each of these variables contains a limited number of distinct categories, which is a characteristic of categorical variables.

In [None]:
import plotly.express as px

# Assuming 'df' is your dataframe and it has been loaded correctly from the provided CSV file

def plot_category_counts(frame, column, title, axis_label):
    """Show an interactive bar chart of how often each value of ``column`` occurs.

    Parameters:
        frame: DataFrame containing ``column``.
        column: Name of the categorical column to count.
        title: Title placed on the figure.
        axis_label: Label for the x axis.

    Returns:
        The plotly figure, after it has been displayed with ``fig.show()``.
    """
    counts = frame[column].value_counts().reset_index()
    counts.columns = [column, 'count']  # Give the two columns explicit names
    fig = px.bar(counts, x=column, y='count')
    fig.update_layout(title_text=title, xaxis_title=axis_label, yaxis_title='Frequency')
    fig.show()
    return fig


# One interactive bar plot per categorical feature: (column, figure title, x-axis label).
for column, title, axis_label in [
    ('cut', 'Distribution of Cut Quality', 'Cut'),
    ('color', 'Distribution of Diamond Color', 'Color'),
    ('clarity', 'Distribution of Diamond Clarity', 'Clarity'),
]:
    plot_category_counts(df, column, title, axis_label)

In [None]:
# Names of the categorical feature columns.
# NOTE(review): nothing visible in this notebook reads this constant; the helper
# functions below hard-code the same three names as their default.
CATEGORICAL_COLS = ["cut", "color", "clarity"]

In [None]:
from typing import List
from sklearn.feature_extraction import DictVectorizer

def encode_cols(df: pd.DataFrame, categorical_cols: List[str] = None) -> pd.DataFrame:
    """Return a copy of ``df`` whose categorical columns are lower-cased strings.

    Parameters:
        df: Input frame; it is left unmodified.
        categorical_cols: Columns to normalise. Defaults to
            ``["cut", "color", "clarity"]`` when ``None``.

    Returns:
        A new DataFrame in which every listed column has been cast to ``str``
        and lower-cased; all other columns are carried over unchanged.
    """
    if categorical_cols is None:
        categorical_cols = ["cut", "color", "clarity"]

    # Work on a copy: assigning into the caller's frame would silently change
    # their data and raises SettingWithCopyWarning when `df` is a slice of
    # another frame (e.g. the output of train_test_split).
    encoded = df.copy()
    encoded[categorical_cols] = encoded[categorical_cols].apply(
        lambda col: col.astype(str).str.lower()
    )
    return encoded


def extract_x_y(
    df: pd.DataFrame,
    categorical_cols: List[str] = None,
    dv: DictVectorizer = None,
    with_target: bool = True,
) -> tuple:
    """Turn the categorical columns of ``df`` into a feature matrix (and target).

    Parameters:
        df: Frame holding the categorical columns and, when ``with_target`` is
            true, a ``"price"`` column.
        categorical_cols: Columns to vectorise. Defaults to
            ``["cut", "color", "clarity"]`` when ``None``.
        dv: An already fitted DictVectorizer to reuse. When ``None`` a new one
            is created and fitted on ``df`` (only allowed with ``with_target``).
        with_target: Whether to also return ``df["price"].values`` as ``y``.

    Returns:
        ``(x, y, dv)`` - the output of ``dv.transform``, the target values (or
        ``None`` when ``with_target`` is false) and the vectorizer that was used.

    Raises:
        ValueError: If ``with_target`` is false and no ``dv`` is supplied.
            Previously this path failed with an AttributeError on ``None``.
    """
    if categorical_cols is None:
        categorical_cols = ["cut", "color", "clarity"]
    dicts = df[list(categorical_cols)].to_dict(orient="records")

    if dv is None:
        if not with_target:
            # Without a target we are not in a training context, so there is
            # nothing sensible to fit the vectorizer on.
            raise ValueError(
                "extract_x_y requires a fitted DictVectorizer (dv) when with_target=False"
            )
        dv = DictVectorizer()
        dv.fit(dicts)

    y = df["price"].values if with_target else None
    x = dv.transform(dicts)
    return x, y, dv

# save the preprocessor into saved_pkl folder
import pickle
def save_picked(path: str, file):
    """Pickle the object ``file`` into the file at ``path`` (opened in "wb" mode)."""
    with open(path, mode="wb") as handle:
        handle.write(pickle.dumps(file))

In [None]:
!pip3 install xgboost

In [None]:
# Train model
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def train_model(x_train: pd.DataFrame, y_train: np.ndarray, model_type):
    """Fit and return the regressor selected by ``model_type``.

    Parameters:
        x_train: Training feature matrix, passed straight to the estimator's ``fit``.
        y_train: Training target values.
        model_type: One of ``"randomforest"``, ``"xgb"`` or ``"linear"``.

    Returns:
        The fitted estimator.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    # Map each supported name to a factory so that only the requested estimator
    # is constructed; building all three up front was wasted work on every call.
    model_factories = {
        "randomforest": lambda: RandomForestRegressor(
            random_state=42, n_estimators=25,
            max_depth=60, min_samples_leaf=1, min_samples_split=5,
        ),
        "xgb": lambda: xgb.XGBRegressor(
            objective='reg:squarederror', learning_rate=0.25, n_estimators=300,
            max_depth=6, subsample=1, colsample_bytree=1,
        ),
        "linear": lambda: LinearRegression(fit_intercept=True, copy_X=True),
    }
    factory = model_factories.get(model_type)
    if factory is None:
        raise ValueError(f"Invalid model type: {model_type}")

    model = factory()
    model.fit(x_train, y_train)
    return model

def predict_price(input_data, model):
    """Return whatever ``model.predict`` produces for ``input_data``."""
    predictions = model.predict(input_data)
    return predictions

def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray):
    """Score predictions against the true values.

    Returns:
        ``(rmse, mae, r2)`` where RMSE and MAE are rounded to 2 decimals and
        R² is rounded to 4 decimals.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    absolute_error = mean_absolute_error(y_true, y_pred)
    determination = r2_score(y_true, y_pred)
    return (
        round(np.sqrt(squared_error), 2),
        round(absolute_error, 2),
        round(determination, 4),
    )

In [None]:
from sklearn.model_selection import train_test_split
# Run every preprocessing step end to end on the cleaned dataset.
df = load_data('data/clean_diamonds_final.csv')
# (The file that was just loaded is no longer written straight back to the same
# path - that round trip changed nothing.)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# index=False keeps the shuffled row index out of the CSVs; otherwise it comes
# back as a spurious "Unnamed: 0" column when the files are re-loaded.
train_df.to_csv("data/train-set.csv", index=False)
test_df.to_csv("data/test-set.csv", index=False)
# The splits are slices of `df`; hand encode_cols independent copies so that
# lower-casing the categorical columns cannot raise SettingWithCopyWarning.
train_df = encode_cols(train_df.copy())
test_df = encode_cols(test_df.copy())
# Fit the vectorizer on the training split only and reuse it for the test split.
X_train, y_train, dv = extract_x_y(train_df)
X_test, y_test, _ = extract_x_y(test_df, dv=dv)

In [None]:
# Random forest: fit on the training split, score on the held-out test split.
model = train_model(X_train, y_train, "randomforest")
pred = predict_price(X_test, model)
# rmse and mae are computed as well but only r2 is reported here.
rmse, mae, r2 = evaluate_model(y_test, pred)
print(f"r2: {r2}")

In [None]:
# XGBoost: same procedure; rebinds `model`, `pred` and the metric variables.
model = train_model(X_train, y_train, "xgb")
pred = predict_price(X_test, model)
# rmse and mae are computed as well but only r2 is reported here.
rmse, mae, r2 = evaluate_model(y_test, pred)
print(f"r2: {r2}")

In [None]:
# Linear regression: same procedure. After this cell the global `model` is the
# linear model, which is what any later cell referring to `model` will see.
model = train_model(X_train, y_train, "linear")
pred = predict_price(X_test, model)
# rmse and mae are computed as well but only r2 is reported here.
rmse, mae, r2 = evaluate_model(y_test, pred)
print(f"r2: {r2}")

In [None]:
import mlflow
import mlflow.sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from typing import List

# Set the experiment name
mlflow.set_experiment("diamonds_price_predictor")

# Check if there's an active run, and end it if necessary
if mlflow.active_run():
    mlflow.end_run()


def _regression_metrics(y_true, y_pred):
    """Return MAE, MSE and R² for one set of predictions, keyed by short metric name."""
    return {
        "mae": mean_absolute_error(y_true, y_pred),
        "mse": mean_squared_error(y_true, y_pred),
        "r2": r2_score(y_true, y_pred),
    }


# Start a run
with mlflow.start_run() as run:
    run_id = run.info.run_id

    # Set tags for the run
    # NOTE(review): the tag is called "experiment_id" but stores the run id - confirm intent.
    mlflow.set_tag("experiment_id", run_id)

    # Load the train / test splits written by the preprocessing cell
    train_df = load_data("data/train-set.csv")
    test_df = load_data("data/test-set.csv")

    # Join the three categorical values of each row into one space-separated string
    train_text = train_df[['cut', 'color', 'clarity']].apply(lambda x: ' '.join(x), axis=1).tolist()
    test_text = test_df[['cut', 'color', 'clarity']].apply(lambda x: ' '.join(x), axis=1).tolist()

    # Vectorize the text data.
    # TfidfVectorizer's default token pattern only keeps tokens of two or more
    # characters, which silently discarded every single-letter colour grade.
    # Accept one-character tokens so the colour feature reaches the models.
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
    X_train = vectorizer.fit_transform(train_text)
    X_test = vectorizer.transform(test_text)

    y_train = train_df['price']
    y_test = test_df['price']

    # Short suffix -> estimator; the suffix is reused in metric names, artifact
    # paths and registered model names. Dict order fixes the training order.
    models = {
        "xgb": XGBRegressor(random_state=0),
        "rf": RandomForestRegressor(random_state=0),
        "lr": LinearRegression(),
    }

    # Train each model and log its metrics. The train_* metric names are the same
    # as before; test_* metrics are new so the held-out split is actually used.
    for suffix, estimator in models.items():
        estimator.fit(X_train, y_train)
        for split, features, target in (("train", X_train, y_train), ("test", X_test, y_test)):
            scores = _regression_metrics(target, estimator.predict(features))
            for metric, value in scores.items():
                mlflow.log_metric(f"{split}_{metric}_{suffix}", value)

    # Log the models
    for suffix, estimator in models.items():
        mlflow.sklearn.log_model(estimator, f"{suffix}_model")

    # Register the models in MLflow Model Registry
    for suffix in models:
        mlflow.register_model(f"runs:/{run_id}/{suffix}_model", f"diamond_price_predictor_{suffix}_v3")


In [None]:
from mlflow.tracking import MlflowClient

# Initialize MLflow tracking client
client = MlflowClient()

# Set the correct model type and experiment path
# NOTE(review): `model_type` is not read anywhere in the visible notebook.
model_type = "linear"
# Despite its name, this value is used below as a *registered model* name.
# NOTE(review): the training cell registers "diamond_price_predictor_xgb_v3",
# "..._rf_v3" and "..._lr_v3"; 'diamond_price_predictor_v1' is not created by any
# visible cell - confirm it already exists in the registry.
mlflow_experiment_path = 'diamond_price_predictor_v1'

# Specify the version of the model to be transitioned
production_version = 1

# Transition the specified model version to the "Production" stage
client.transition_model_version_stage(name=mlflow_experiment_path, version=production_version, stage="Production")

In [None]:
# Launch the MLflow UI on port 5002.
# NOTE(review): presumably this runs in the foreground, so the cell (and a
# "Run All") will not move past it until interrupted; --host 0.0.0.0 listens on
# every network interface - confirm that exposure is intended.
!mlflow ui --host 0.0.0.0 --port 5002

In [None]:
import pickle

def save_pickle(file, path):
    """
    Serialize an object to disk with pickle.

    Parameters:
        file: Any - The object to be saved.
        path: str - Destination file; it is opened in binary write mode.
    """
    with open(path, mode="wb") as handle:
        pickle.Pickler(handle).dump(file)

# Example usage:
# Persist the current global `model` and the vectorizer `dv` as pickle files.
# NOTE(review): `model` is whatever the last executed training cell left behind
# (the linear-regression cell, if the notebook is run top to bottom).
# NOTE(review): these are absolute paths on one developer's machine; the cells
# below read the same files, so change all of them together if relocating.
save_pickle(model, "/Users/mohammedzaidsyed/Desktop/Diamond/MLOPS_Diamond/Model_savedpkl/Model_v/model.pkl")
save_pickle(dv, "/Users/mohammedzaidsyed/Desktop/Diamond/MLOPS_Diamond/Model_savedpkl/dv_v/dv.pkl")

In [None]:
# from config import PATH_TO_MODEL, PATH_TO_PREPROCESSOR
# Load production model
# `mlflow_experiment_path` comes from the stage-transition cell and is used here
# as the registered model name in a "models:/<name>/<stage>" URI.
model_uri = f"models:/{mlflow_experiment_path}/production"
model = mlflow.sklearn.load_model(model_uri)
# save_picked takes (path, object) - the reverse argument order of save_pickle.
# This overwrites the model.pkl written by the save_pickle cell above.
save_picked("/Users/mohammedzaidsyed/Desktop/Diamond/MLOPS_Diamond/Model_savedpkl/Model_v/model.pkl", model)

def load_pickle(path):
    """Open ``path`` in binary mode and return the object unpickled from it.

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(path, mode="rb") as handle:
        return pickle.load(handle)

# Read the vectorizer and the model back from the pickle files written above,
# rebinding the globals `dv` and `model`.
# NOTE(review): absolute, machine-specific paths - must match the save cells.
dv = load_pickle("/Users/mohammedzaidsyed/Desktop/Diamond/MLOPS_Diamond/Model_savedpkl/dv_v/dv.pkl")
model = load_pickle("/Users/mohammedzaidsyed/Desktop/Diamond/MLOPS_Diamond/Model_savedpkl/Model_v/model.pkl")

In [None]:
# Start an MLflow tracking server on 127.0.0.1:8080.
# NOTE(review): presumably a foreground process, so this cell will not finish
# until interrupted; it is also started only after every cell that talks to
# MLflow has already run - confirm the intended order.
!mlflow server --host 127.0.0.1 --port 8080