In [55]:
#!pip install --upgrade pip


In [56]:
import mlflow
from mlflow.tracking import MlflowClient

print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

client = MlflowClient()

# Use the search_experiments function to get a list of available experiments
experiments = client.search_experiments()
print(len(experiments))

# Display the list of available experiments
for experiment in experiments:
    print(f"Experiment Name: {experiment.name}, Experiment ID: {experiment.experiment_id}")

tracking URI: 'file:///Users/mohammedzaidsyed/Desktop/Diamond/MLOPS_Diamond/mlruns'
2
Experiment Name: diamonds_price_predictor, Experiment ID: 779128597837821808
Experiment Name: Default, Experiment ID: 0


Loading Data

In [57]:
import pandas as pd
def load_data(path):
    return pd.read_csv(path)

df = load_data('Data/clean_diamonds_final.csv')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,'x','y','z'
0,0.23,Ideal,E,SI2,61.5,55.0,326.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334.0,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335.0,4.34,4.35,2.75


Cleaning Data

In [58]:
# Convert byte literals to regular strings for specific columns
"""byte_literal_columns = ['cut', 'color', 'clarity']
for column in byte_literal_columns:
    df[column] = df[column].str.strip("b'")"""

'byte_literal_columns = [\'cut\', \'color\', \'clarity\']\nfor column in byte_literal_columns:\n    df[column] = df[column].str.strip("b\'")'

In [59]:
# Check for missing values
print(df.isnull().sum())

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
'x'        0
'y'        0
'z'        0
dtype: int64


In [60]:
# Remove any duplicate rows
df = df.drop_duplicates()

In [61]:
# Remove any rows with missing values
df = df.dropna()

In [62]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', ''x'',
       ''y'', ''z''],
      dtype='object')

In [63]:
df.shape

(53794, 10)

In [64]:
df.nunique()

carat        273
cut            5
color          7
clarity        8
depth        184
table        127
price      11602
'x'          554
'y'          552
'z'          375
dtype: int64

In [65]:
df.describe()

Unnamed: 0,carat,depth,table,price,'x','y','z'
count,53794.0,53794.0,53794.0,53794.0,53794.0,53794.0,53794.0
mean,0.79778,61.74808,57.458109,3933.065082,5.731214,5.734653,3.538714
std,0.47339,1.429909,2.233679,3988.11446,1.120695,1.141209,0.705037
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,951.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5326.75,6.54,6.54,4.03
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53794 entries, 0 to 53793
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53794 non-null  float64
 1   cut      53794 non-null  object 
 2   color    53794 non-null  object 
 3   clarity  53794 non-null  object 
 4   depth    53794 non-null  float64
 5   table    53794 non-null  float64
 6   price    53794 non-null  float64
 7   'x'      53794 non-null  float64
 8   'y'      53794 non-null  float64
 9   'z'      53794 non-null  float64
dtypes: float64(7), object(3)
memory usage: 4.1+ MB


In [67]:
# Print out unique values in each categorical column
print("Unique values in 'cut' column:", df['cut'].unique())
print("Unique values in 'color' column:", df['color'].unique())
print("Unique values in 'clarity' column:", df['clarity'].unique())

Unique values in 'cut' column: ['Ideal' 'Premium' 'Good' 'Very Good' 'Fair']
Unique values in 'color' column: ['E' 'I' 'J' 'H' 'F' 'G' 'D']
Unique values in 'clarity' column: ['SI2' 'SI1' 'VS1' 'VS2' 'VVS2' 'VVS1' 'I1' 'IF']


In [68]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,'x','y','z'
0,0.23,Ideal,E,SI2,61.5,55.0,326.0,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326.0,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327.0,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334.0,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335.0,4.34,4.35,2.75


In [69]:
# Save the cleaned dataset
# df.to_csv('data/clean_diamonds_final.csv', index=False)

In [70]:
#!pip3 install --upgrade matplotlib

Visualization

The below bar charts will show the number of diamonds in each category, which will help us understand why these variables are considered categorical:

cut: The quality of the cut is a categorical variable because it describes the cut quality of the diamond in ordered categories such as 'Ideal', 'Premium', 'Good', etc.

color: The color of the diamond is a categorical variable because it is rated on a scale from D (best) to J (worst), representing discrete groups.

clarity: The clarity of the diamond is a categorical variable because it describes the level of flaws in the diamond using categories like 'SI1', 'VS1', 'VVS2', etc.


These visualizations will show that each of these variables contains a limited number of distinct categories, which is a characteristic of categorical variables.

In [71]:
import plotly.express as px

# Assuming 'df' is your dataframe and it has been loaded correctly from the provided CSV file

# Interactive bar plot for 'cut'
cut_counts = df['cut'].value_counts().reset_index()
cut_counts.columns = ['cut', 'count']  # Rename the columns appropriately
fig = px.bar(cut_counts, x='cut', y='count')
fig.update_layout(title_text='Distribution of Cut Quality', xaxis_title='Cut', yaxis_title='Frequency')
fig.show()

# Interactive bar plot for 'color'
color_counts = df['color'].value_counts().reset_index()
color_counts.columns = ['color', 'count']  # Rename the columns appropriately
fig = px.bar(color_counts, x='color', y='count')
fig.update_layout(title_text='Distribution of Diamond Color', xaxis_title='Color', yaxis_title='Frequency')
fig.show()

# Interactive bar plot for 'clarity'
clarity_counts = df['clarity'].value_counts().reset_index()
clarity_counts.columns = ['clarity', 'count']  # Rename the columns appropriately
fig = px.bar(clarity_counts, x='clarity', y='count')
fig.update_layout(title_text='Distribution of Diamond Clarity', xaxis_title='Clarity', yaxis_title='Frequency')
fig.show()

In [72]:
CATEGORICAL_COLS = ["cut", "color", "clarity"]

In [73]:
from typing import List
from sklearn.feature_extraction import DictVectorizer

def encode_cols(df: pd.DataFrame, categorical_cols: List[str] = None) -> pd.DataFrame:
    if categorical_cols is None:
        categorical_cols = ["cut", "color", "clarity"]
        
    df[categorical_cols] = df[categorical_cols].apply(lambda x: x.astype(str).str.lower())
    return df


def extract_x_y(
    df: pd.DataFrame,
    categorical_cols: List[str] = None,
    dv: DictVectorizer = None,
    with_target: bool = True,
) -> dict:
    if categorical_cols is None:
         categorical_cols = ["cut", "color", "clarity"]
    dicts = df[[*categorical_cols]].to_dict(orient="records")

    y = None
    if with_target:
        if dv is None:
            dv = DictVectorizer()
            dv.fit(dicts)
        y = df["price"].values

    x = dv.transform(dicts)
    return x, y, dv

# save the preprocessor into saved_pkl folder
import pickle
def save_picked(path: str, file):
    with open(path, "wb") as f:
        pickle.dump(file, f)

In [74]:
from sklearn.model_selection import train_test_split
# try all steps
df = load_data('data/clean_diamonds_final.csv')
df.to_csv("data/clean_diamonds_final.csv", index=False)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df.to_csv("data/train-set.csv")
test_df.to_csv("data/test-set.csv")
train_df = encode_cols(train_df)
test_df = encode_cols(test_df)
X_train, y_train, dv = extract_x_y(train_df)
X_test, y_test, _ = extract_x_y(test_df, dv=dv)

In [75]:
# !pip3 install xgboost

In [76]:
import mlflow
import mlflow.sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBRegressor  
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from typing import List

# Set the experiment name
mlflow.set_experiment("diamonds_price_predictor")

# Check if there's an active run, and end it if necessary
if mlflow.active_run():
    mlflow.end_run()

# Start a run
with mlflow.start_run() as run:
    run_id = run.info.run_id

    # Set tags for the run
    mlflow.set_tag("experiment_id", run_id)

    # Load data - assuming you have functions to load your data
    train_df = load_data("data/train-set.csv")
    test_df = load_data("data/test-set.csv")

    # Preprocess the text data
    train_text = train_df[['cut', 'color', 'clarity']].apply(lambda x: ' '.join(x), axis=1).tolist()
    test_text = test_df[['cut', 'color', 'clarity']].apply(lambda x: ' '.join(x), axis=1).tolist()

    # Vectorize the text data
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(train_text)
    X_test = vectorizer.transform(test_text)

    # Check the number of features
    print(f'Number of features in X_train: {X_train.shape[1]}')
    print(f'Number of features in X_test: {X_test.shape[1]}')

    y_train = train_df['price']
    y_test = test_df['price']

    # Convert data into ndarrays
    train_x = X_train.toarray()
    test_x = X_test.toarray()

    # Convert y_train and y_test to ndarrays
    train_y = y_train.values
    test_y = y_test.values


    # Train a regression model - changed to XGBoost (XGBRegressor)
    model = XGBRegressor()  # Instantiate XGBRegressor
    model.fit(X_train, y_train)

    # Evaluate the model
    y_train_pred = model.predict(X_train)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_mse = mean_squared_error(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)

    mlflow.log_metric("train_mae", train_mae)
    mlflow.log_metric("train_mse", train_mse)
    mlflow.log_metric("train_r2", train_r2)

    print("Train MAE:", train_mae)
    print("Train MSE:", train_mse)
    print("Train R2:", train_r2)

    y_test_pred = model.predict(X_test)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    mlflow.log_metric("test_mae", test_mae)
    mlflow.log_metric("test_mse", test_mse)
    mlflow.log_metric("test_r2", test_r2)

    print("Test MAE:", test_mae)
    print("Test MSE:", test_mse)
    print("Test R2:", test_r2)

    # Log the model - XGBoost model
    mlflow.sklearn.log_model(model, "model")

    # Register the model in MLflow Model Registry
    model_name = "diamond_price_predictor_xgb01"  # Update model name accordingly
    model_description = "XGBoost Diamond Price Predictor"  # Update model description accordingly
    mlflow.register_model("runs:/{}/model".format(run_id), model_name)


Number of features in X_train: 13
Number of features in X_test: 13
Train MAE: 2938.0621926925623
Train MSE: 15435546.372673258
Train R2: 0.03946505739787176
Test MAE: 2871.045854619715
Test MSE: 14795993.84256705
Test R2: 0.029266127874316705


Registered model 'diamond_price_predictor_xgb01' already exists. Creating a new version of this model...
Created version '11' of model 'diamond_price_predictor_xgb01'.


In [77]:
from mlflow.tracking import MlflowClient

# Initialize MLflow tracking client
client = MlflowClient()

# Set the correct model type and experiment path
model_type = "xgboost"  # Update model type
mlflow_experiment_path = 'diamond_price_predictor_xgb'  # Adjusted experiment path for XGBoost model

# Specify the version of the model to be transitioned
production_version = 1

# Transition the specified model version to the "Production" stage
client.transition_model_version_stage(name=mlflow_experiment_path, version=production_version, stage="Production")


``mlflow.tracking.client.MlflowClient.transition_model_version_stage`` is deprecated since 2.9.0. Model registry stages will be removed in a future major release. To learn more about the deprecation of model registry stages, see our migration guide here: https://mlflow.org/docs/2.9.2/model-registry.html#migrating-from-stages



<ModelVersion: aliases=[], creation_timestamp=1707696922219, current_stage='Production', description=None, last_updated_timestamp=1712158674361, name='diamond_price_predictor_xgb', run_id='6e7014fd909d4375b4c364d6225f8979', run_link=None, source='file:///Users/mohammedzaidsyed/Desktop/Diamond/MLOPS_Diamond/mlruns/779128597837821808/6e7014fd909d4375b4c364d6225f8979/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [78]:
!mlflow ui --host 0.0.0.0 --port 5002

* 'schema_extra' has been renamed to 'json_schema_extra'
[2024-04-03 17:37:55 +0200] [59593] [INFO] Starting gunicorn 21.2.0
[2024-04-03 17:37:55 +0200] [59593] [INFO] Listening at: http://0.0.0.0:5002 (59593)
[2024-04-03 17:37:55 +0200] [59593] [INFO] Using worker: sync
[2024-04-03 17:37:55 +0200] [59594] [INFO] Booting worker with pid: 59594
[2024-04-03 17:37:55 +0200] [59595] [INFO] Booting worker with pid: 59595
[2024-04-03 17:37:55 +0200] [59596] [INFO] Booting worker with pid: 59596
[2024-04-03 17:37:55 +0200] [59597] [INFO] Booting worker with pid: 59597
^C
[2024-04-03 17:38:01 +0200] [59593] [INFO] Handling signal: int
[2024-04-03 17:38:01 +0200] [59594] [INFO] Worker exiting (pid: 59594)
[2024-04-03 17:38:01 +0200] [59596] [INFO] Worker exiting (pid: 59596)
[2024-04-03 17:38:01 +0200] [59595] [INFO] Worker exiting (pid: 59595)
[2024-04-03 17:38:01 +0200] [59597] [INFO] Worker exiting (pid: 59597)


In [79]:
import pickle

def save_pickle(file, path):
    """
    Save the file using pickle.
    
    Parameters:
        file: Any - The object to be saved.
        path: str - The path to save the file.
    """
    with open(path, "wb") as f:
        pickle.dump(file, f)

# Example usage:
save_pickle(model, "/Users/mohammedzaidsyed/Desktop/Diamond/MLOPS_Diamond/Model_savedpkl/Model_v/model.pkl")
save_pickle(dv, "/Users/mohammedzaidsyed/Desktop/Diamond/MLOPS_Diamond/Model_savedpkl/dv_v/dv.pkl")

In [80]:
# from config import PATH_TO_MODEL, PATH_TO_PREPROCESSOR
# Load production model
model_uri = f"models:/{mlflow_experiment_path}/production"
model = mlflow.sklearn.load_model(model_uri)
save_picked("/Users/mohammedzaidsyed/Desktop/Diamond/MLOPS_Diamond/Model_savedpkl/Model_v/model.pkl", model)

def load_pickle(path):
    with open(path, "rb") as f:
        file = pickle.load(f)
    return file

dv = load_pickle("/Users/mohammedzaidsyed/Desktop/Diamond/MLOPS_Diamond/Model_savedpkl/dv_v/dv.pkl")
model = load_pickle("/Users/mohammedzaidsyed/Desktop/Diamond/MLOPS_Diamond/Model_savedpkl/Model_v/model.pkl")


``mlflow.tracking.client.MlflowClient.get_latest_versions`` is deprecated since 2.9.0. Model registry stages will be removed in a future major release. To learn more about the deprecation of model registry stages, see our migration guide here: https://mlflow.org/docs/2.9.2/model-registry.html#migrating-from-stages



In [81]:
!mlflow server --host 127.0.0.1 --port 8080

* 'schema_extra' has been renamed to 'json_schema_extra'
[2024-04-03 17:38:11 +0200] [59617] [INFO] Starting gunicorn 21.2.0
[2024-04-03 17:38:11 +0200] [59617] [INFO] Listening at: http://127.0.0.1:8080 (59617)
[2024-04-03 17:38:11 +0200] [59617] [INFO] Using worker: sync
[2024-04-03 17:38:11 +0200] [59618] [INFO] Booting worker with pid: 59618
[2024-04-03 17:38:11 +0200] [59619] [INFO] Booting worker with pid: 59619
[2024-04-03 17:38:11 +0200] [59620] [INFO] Booting worker with pid: 59620
[2024-04-03 17:38:11 +0200] [59621] [INFO] Booting worker with pid: 59621
^C
[2024-04-03 17:41:16 +0200] [59617] [INFO] Handling signal: int
[2024-04-03 17:41:16 +0200] [59621] [INFO] Worker exiting (pid: 59621)
[2024-04-03 17:41:16 +0200] [59618] [INFO] Worker exiting (pid: 59618)
[2024-04-03 17:41:16 +0200] [59620] [INFO] Worker exiting (pid: 59620)
[2024-04-03 17:41:16 +0200] [59619] [INFO] Worker exiting (pid: 59619)
