In [1]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import json

def separate_features_target(df, target_column):
    """Split a DataFrame into a feature frame and a target.

    Args:
        df: pandas DataFrame holding both the features and the target.
        target_column: Label of the column to treat as the target.

    Returns:
        A tuple ``(X, y)`` where ``X`` is ``df`` without ``target_column``
        and ``y`` is ``df[target_column]``. ``df`` itself is not modified.
    """
    target = df[target_column]
    features = df.drop(columns=target_column)
    return features, target

def identify_features(X):
    """Partition the columns of ``X`` into numerical and categorical groups.

    Numerical columns are selected with the generic ``'number'`` dtype family
    rather than the exact ``int64``/``float64`` dtypes, so narrower or wider
    numeric columns (e.g. ``int32``, ``float32``) are not silently left out.
    Categorical columns cover both ``object`` and pandas ``category`` dtypes.
    Columns of any other dtype (e.g. booleans, datetimes) are in neither group.

    Args:
        X: pandas DataFrame of features.

    Returns:
        A tuple ``(numerical_features, categorical_features)`` of pandas
        ``Index`` objects holding the matching column labels, in the order
        they appear in ``X``.
    """
    numerical_features = X.select_dtypes(include='number').columns
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    return numerical_features, categorical_features

def handle_duplicates(df):
    """Return a copy of ``df`` with duplicate rows removed.

    Uses ``DataFrame.drop_duplicates`` with its default settings; the input
    frame is left untouched.
    """
    return df.drop_duplicates()

def create_preprocessor(numerical_features, categorical_features):
    """Build the column-wise preprocessing step for the model pipeline.

    Args:
        numerical_features: Column labels routed to the numeric branch
            (median imputation followed by standard scaling).
        categorical_features: Column labels routed to the categorical branch
            (most-frequent imputation followed by one-hot encoding that
            ignores categories unseen during fitting).

    Returns:
        An unfitted ``ColumnTransformer`` with a ``'num'`` and a ``'cat'``
        branch. No ``remainder`` is configured, so the transformer's default
        handling applies to columns listed in neither argument.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    return ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=numeric_steps), numerical_features),
            ('cat', Pipeline(steps=categorical_steps), categorical_features),
        ]
    )

def create_pipeline(preprocessor, classifier):
    """Chain a preprocessor and an estimator into a single sklearn Pipeline.

    Args:
        preprocessor: Transformer used as the ``'preprocessor'`` step.
        classifier: Estimator used as the final ``'classifier'`` step.

    Returns:
        An unfitted ``Pipeline`` with the two named steps, in that order.
    """
    steps = [
        ('preprocessor', preprocessor),
        ('classifier', classifier),
    ]
    return Pipeline(steps=steps)

def train_pipeline(pipeline, X_train, y_train):
    """Fit ``pipeline`` on the training data.

    Calls ``pipeline.fit(X_train, y_train)`` and returns nothing; callers keep
    using the same ``pipeline`` object afterwards, so they rely on ``fit``
    updating it in place.

    Args:
        pipeline: Object exposing a ``fit(X, y)`` method.
        X_train: Training features.
        y_train: Training targets.
    """
    pipeline.fit(X_train, y_train)

def evaluate_pipeline(pipeline, X_test, y_test):
    """Score a fitted pipeline on held-out data, print the score, and return it.

    Args:
        pipeline: Fitted object exposing ``score(X, y)``.
        X_test: Held-out features.
        y_test: Held-out targets.

    Returns:
        Whatever ``pipeline.score(X_test, y_test)`` returns; it is reported
        under the label "Model Accuracy".
    """
    score = pipeline.score(X_test, y_test)
    print('Model Accuracy: {}'.format(score))
    return score


def get_metrics(pipeline, X_test, y_test, average='binary'):
    """Compute accuracy, precision and recall of a fitted pipeline.

    Args:
        pipeline: Fitted object exposing ``predict(X)``.
        X_test: Held-out features.
        y_test: True labels for ``X_test``.
        average: Averaging mode forwarded to ``precision_score`` and
            ``recall_score``. The default ``'binary'`` keeps the original
            behaviour (two-class targets only); pass e.g. ``'macro'`` or
            ``'weighted'`` for multiclass targets. It should be a mode that
            yields a single number, since each value is rounded and later
            logged as a scalar metric.

    Returns:
        Dict with keys ``'accuracy'``, ``'precision'`` and ``'recall'``, each
        rounded to two decimals.
    """
    # Imported locally: these scorers are not part of the notebook's
    # top-level import cell.
    from sklearn.metrics import accuracy_score, precision_score, recall_score

    y_pred = pipeline.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average=average)
    recall = recall_score(y_test, y_pred, average=average)
    return {'accuracy': round(acc, 2), 'precision': round(prec, 2), 'recall': round(recall, 2)}



In [3]:
def create_experiment(experiment_name, metrics, pipeline, model_name,run_params=None,
                      tracking_uri="http://localhost:5000"):
    """Log one training run to MLflow and register the fitted pipeline.

    Args:
        experiment_name: MLflow experiment to log under. It is also used as
            the registered model name and as the prefix of the run name.
        metrics: Mapping of metric name to numeric value; each entry is
            logged with ``mlflow.log_metric``.
        pipeline: Fitted scikit-learn pipeline, logged under the artifact
            path ``"model"``.
        model_name: Value stored in the run's ``"model"`` tag.
        run_params: Optional mapping of parameter name to value; each entry
            is logged with ``mlflow.log_param``. ``None`` logs no parameters.
        tracking_uri: Address of the MLflow tracking server. Defaults to the
            local server this notebook was written against.

    Returns:
        None. The effects are the run, metrics, params, tag, and registered
        model version created on the tracking server.
    """
    import mlflow
    import mlflow.sklearn  # make the sklearn flavor explicitly available
    from datetime import datetime

    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment_name)

    # The date (dd-mm-yy) is appended directly to the experiment name with no
    # separator; kept as-is so new runs are named like existing ones.
    run_name = experiment_name + datetime.now().strftime("%d-%m-%y")

    with mlflow.start_run(run_name=run_name):
        if run_params is not None:
            for param_name, param_value in run_params.items():
                mlflow.log_param(param_name, param_value)

        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        # Passing registered_model_name logs the model and registers it as a
        # model version in one call.
        mlflow.sklearn.log_model(pipeline, "model", registered_model_name=experiment_name)
        mlflow.set_tag("model", model_name)
            

def train_and_log_with_mlflow(file_path, target_column, classifier, run_params,experiment_name):
    """Train a classifier on a CSV dataset and log the result to MLflow.

    Steps: read the CSV, drop duplicate rows, split features from target,
    hold out 20% of the rows (fixed seed 42), fit a preprocessing+classifier
    pipeline on the rest, compute metrics on the held-out rows, then log and
    register everything through ``create_experiment``.

    Args:
        file_path: Path of the CSV file to load.
        target_column: Name of the target column in the CSV.
        classifier: Unfitted scikit-learn estimator. Note that it is
            configured (``set_params``) and fitted in place.
        run_params: Mapping of estimator parameters to apply and to log, or
            ``None``/empty to keep the estimator's current settings.
        experiment_name: MLflow experiment (and registered model) name.

    Returns:
        None.
    """
    # Read the dataset and perform initial data processing
    df = handle_duplicates(pd.read_csv(file_path))
    X, y = separate_features_target(df, target_column)
    numerical_features, categorical_features = identify_features(X)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    preprocessor = create_preprocessor(numerical_features, categorical_features)

    # create_experiment accepts run_params=None, so tolerate it here as well
    # instead of failing on ``**None``. An empty mapping is a no-op either way.
    if run_params:
        classifier.set_params(**run_params)

    # Create and train the pipeline
    pipeline = create_pipeline(preprocessor, classifier)
    train_pipeline(pipeline, X_train, y_train)
    model_name = classifier.__class__.__name__

    # Evaluate the pipeline on the held-out rows
    metrics = get_metrics(pipeline, X_test, y_test)

    create_experiment(experiment_name, metrics, pipeline, model_name, run_params)


In [10]:
# Inputs for one training run: dataset, label column, estimator and its
# hyperparameters, and the MLflow experiment / registered-model name.
file_path = "train_data.csv"
target_column = "cid"
classifier = RandomForestClassifier()
run_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42,
}
experiment_name = "mlflow_workinGGGG"

# Train, evaluate, log to MLflow and register the fitted pipeline.
train_and_log_with_mlflow(
    file_path=file_path,
    target_column=target_column,
    classifier=classifier,
    run_params=run_params,
    experiment_name=experiment_name,
)

2024/02/02 10:29:06 INFO mlflow.tracking.fluent: Experiment with name 'mlflow_workinGGGG' does not exist. Creating a new experiment.
Successfully registered model 'mlflow_workinGGGG'.
2024/02/02 10:29:12 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: mlflow_workinGGGG, version 1
Created version '1' of model 'mlflow_workinGGGG'.


In [12]:
# Add an existing model to the registry using the ID of the run that logged it.
import mlflow

# register_model only needs the URI of the already-logged model artifact, so it
# is called directly. Wrapping it in mlflow.start_run() would create an extra,
# empty tracking run (named 'add_model') every time this cell is executed.
result = mlflow.register_model(
    "runs:/66b0f7aacdd440b582ce2102cb2b5f55/model",
    "creating_empty_registry"
)

Registered model 'creating_empty_registry' already exists. Creating a new version of this model...
2024/01/04 11:43:39 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: creating_empty_registry, version 1
Created version '1' of model 'creating_empty_registry'.


In [6]:
# create_registered_model creates an empty registered model: the name exists in
# the registry but no model version is associated with it yet (the returned
# object shows latest_versions=[]).
import mlflow
client = mlflow.tracking.MlflowClient()
client.create_registered_model("creating_empty_registry")

<RegisteredModel: aliases={}, creation_timestamp=1706849043036, description='', last_updated_timestamp=1706849043036, latest_versions=[], name='creating_empty_registry', tags={}>

In [11]:
# Repeat the data preparation steps of train_and_log_with_mlflow (same file,
# same seed, same test fraction) so that X_test is available in the notebook
# namespace for the prediction cells.
df = handle_duplicates(pd.read_csv(file_path))
X, y = separate_features_target(df, target_column)
numerical_features, categorical_features = identify_features(X)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
import mlflow.pyfunc

# Load one specific version of the registered model through the generic
# pyfunc interface, addressed as "models:/<name>/<version>".
model_name = "mlflow_workinGGGG"
model_version = 1
model_uri = f"models:/{model_name}/{model_version}"
model = mlflow.pyfunc.load_model(model_uri=model_uri)

# Predict on the held-out features prepared in the previous cell.
y_pred = model.predict(X_test)
print(y_pred)

[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0
 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 1 0 1 0 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0
 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0
 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0]


In [15]:
# Move version 1 of the registered model into the "Production" stage so it can
# be addressed by stage name instead of by version number.
# NOTE(review): the captured output of this cell looks like a truncated
# warning for this call; recent MLflow releases deprecate model stages in
# favour of aliases - confirm against the installed MLflow version.
client = mlflow.tracking.MlflowClient()
client.transition_model_version_stage(
    name="mlflow_workinGGGG",
    version=1,
    stage="Production"
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1706849952191, current_stage='Production', description='', last_updated_timestamp=1706850177556, name='mlflow_workinGGGG', run_id='04e4b924142244c5a6d9c86e30a40ee3', run_link='', source='file:///./artifactSS/2/04e4b924142244c5a6d9c86e30a40ee3/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

In [16]:
import mlflow.pyfunc

# Load the registered model by stage instead of by explicit version number,
# addressed as "models:/<name>/<stage>".
model_name = "mlflow_workinGGGG"
stage = 'Production'
model_uri = f"models:/{model_name}/{stage}"
model = mlflow.pyfunc.load_model(model_uri=model_uri)

# Predict on the held-out features; the result should match the prediction
# made with the version-addressed model, since version 1 is the one in
# Production.
y_pred = model.predict(X_test)
print(y_pred)

[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0
 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 1 0 1 0 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0
 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0
 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0]


  latest = client.get_latest_versions(name, None if stage is None else [stage])


##### To deploy the model, first set the tracking URI (the address where the MLflow server is running), then run the `mlflow models serve` command:
###### set MLFLOW_TRACKING_URI=http://localhost:5000
###### mlflow models serve --model-uri models:/mlflow_workinGGGG/Production -p 1234 --no-conda

In [55]:
import requests

# Serialise the held-out rows as a list of per-row dicts under the
# "dataframe_records" key, the payload shape sent to the served model.
data_dict = {"dataframe_records": X_test.to_dict(orient='records')}

# Scoring endpoint of the model started with `mlflow models serve -p 1234`.
endpoint_url = "http://localhost:1234/invocations"

# Without a timeout, requests waits indefinitely when the server accepts the
# connection but never answers, which would hang the notebook cell.
response = requests.post(endpoint_url, json=data_dict, timeout=30)

# Check the response
if response.status_code == 200:
    # The server answers with JSON; parse and show it.
    result = response.json()
    print(result)
else:
    print(f"Request failed with status code: {response.status_code}")
    print("Response content:")
    print(response.text)

{'predictions': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,

In [9]:
import boto3
from botocore.exceptions import NoCredentialsError

def upload_to_minio(local_file, bucket_name, minio_endpoint, minio_access_key, minio_secret_key, minio_secure=True, s3_file_name=None):
    """Upload a local file to a MinIO bucket through the S3 API.

    Args:
        local_file: Path of the file to upload (string or path-like object).
        bucket_name: Name of the destination bucket.
        minio_endpoint: URL of the MinIO server, e.g. ``http://localhost:9000``.
        minio_access_key: Access key used to sign the request.
        minio_secret_key: Secret key used to sign the request.
        minio_secure: Forwarded to boto3 as the client's ``verify`` argument.
        s3_file_name: Object name to store the file under. Defaults to the
            base name of ``local_file``.

    Returns:
        ``True`` when the upload call completed, ``False`` when boto3 reported
        that no credentials were available (a message is printed in that
        case).

    Raises:
        Any other error raised by boto3 during the upload is propagated
        unchanged, for example the ``S3UploadFailedError`` produced when the
        server rejects the request signature.
    """
    import os

    # Create an S3 client for MinIO
    s3 = boto3.client('s3',
                      endpoint_url=minio_endpoint,
                      aws_access_key_id=minio_access_key,
                      aws_secret_access_key=minio_secret_key,
                      verify=minio_secure)

    # Normalise to a plain string so path-like inputs work everywhere below.
    local_path = os.fspath(local_file)

    # If s3_file_name is not provided, use the local file name. normpath
    # strips a trailing separator first, so the derived name is never empty.
    if s3_file_name is None:
        s3_file_name = os.path.basename(os.path.normpath(local_path))

    try:
        # Upload the file
        s3.upload_file(local_path, bucket_name, s3_file_name)
    except NoCredentialsError:
        print("Credentials not available")
        return False

    print(f"File uploaded successfully to {bucket_name}/{s3_file_name}")
    return True

# MinIO connection details. The access and secret keys are read from the
# environment so that credentials are never stored in the notebook; set
# MINIO_ACCESS_KEY and MINIO_SECRET_KEY before running this cell.
# NOTE(review): the keys that were previously hardcoded here have been
# committed in plain text and should be treated as exposed (rotate them).
import os

minio_endpoint = 'http://localhost:9000'  # Replace with your MinIO server endpoint
minio_access_key = os.environ['MINIO_ACCESS_KEY']
minio_secret_key = os.environ['MINIO_SECRET_KEY']
bucket_name = 'artifact-store'
local_file_path = './train_data.csv'
s3_file_name = 'train_data1.csv'  # Specify a valid S3 object name

# Upload the file to MinIO
upload_to_minio(local_file_path, bucket_name, minio_endpoint, minio_access_key, minio_secret_key, s3_file_name=s3_file_name)


S3UploadFailedError: Failed to upload ./train_data.csv to artifact-store/train_data1.csv: An error occurred (SignatureDoesNotMatch) when calling the PutObject operation: The request signature we calculated does not match the signature you provided. Check your key and signing method.