In [19]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import mlflow
from sklearn.metrics import (
    accuracy_score,
    mean_absolute_error,
    mean_squared_error,
    root_mean_squared_error,
    r2_score,
)
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from datetime import date

today = date.today().__str__().replace("-", "")
experiment_name = f"salary_prediction_{today}"
experiment_id = None
try:
    experiment =mlflow.set_experiment(experiment_name)
    experiment_id = experiment.experiment_id
except:
    experiment_id = mlflow.create_experiment(experiment_name)
print(f"{experiment_id}")
mlflow.autolog()

2024/07/14 02:05:02 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


334702352494113998


In [20]:
dataset = pd.read_csv(
    r"C:\Users\Abhinav\Desktop\iitj_projects\Salary_prediction_end_to_end\artifacts\dataset.csv",
    low_memory=False,
)
dataset.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,remote_ratio,company_size,emp_residence_company_location
0,2023,Senior,Full Time,Others,85847,Full Remote,Large,ES_Others
1,2023,Middle,Contract,Others,30000,Full Remote,Small,US_US
2,2023,Middle,Contract,Others,25500,Full Remote,Small,US_US
3,2023,Senior,Full Time,Data Scientist,175000,Full Remote,Medium,CA_CA
4,2023,Senior,Full Time,Data Scientist,120000,Full Remote,Medium,CA_CA


In [21]:
cat_cols = [
    "work_year",
    "experience_level",
    "employment_type",
    "job_title",
    "remote_ratio",
    "company_size",
    "emp_residence_company_location",
]
num_cols = ["salary_in_usd"]

for col in cat_cols:
    dataset[col] = pd.Categorical(dataset[col])

for col in num_cols:
    dataset[col] = pd.to_numeric(dataset[col])

In [22]:
experience_level_map = {"Senior": 3, "Middle": 2, "Junior": 1, "Executive": 0}

company_size_map = {
    "Large": 2,
    "Medium": 1,
    "Small": 0,
}

dataset["experience_level"] = (
    dataset["experience_level"].map(experience_level_map).astype("int64")
)
dataset["company_size"] = dataset["company_size"].map(company_size_map).astype("int64")

In [23]:
dataset.dtypes

work_year                         category
experience_level                     int64
employment_type                   category
job_title                         category
salary_in_usd                        int64
remote_ratio                      category
company_size                         int64
emp_residence_company_location    category
dtype: object

In [24]:
X = dataset.drop(columns=["salary_in_usd"])
y = dataset[["salary_in_usd"]]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.shape, X_test.shape

((2067, 7), (517, 7))

In [25]:
num_cols = X_train.select_dtypes(include="number").columns
cat_cols = X_train.select_dtypes(exclude="number").columns

preprocessor = ColumnTransformer(
    [("encoder", OneHotEncoder(handle_unknown="ignore"), cat_cols)]
)

In [26]:
models = {
    "Linear Regression": LinearRegression(),
    "decision Tree": DecisionTreeRegressor(),
    "random forest": RandomForestRegressor(),
}

with mlflow.start_run(experiment_id=experiment_id
                      ,log_system_metrics=True):
    for name, model in models.items():
        pipeline_steps = [("processor", preprocessor)]
        pipeline_steps.append(("regressor", model))
        reg = Pipeline(pipeline_steps)
        reg.fit(X_train, y_train)
        preds = reg.predict(X_test)
        print(name, r2_score(y_test, preds))

2024/07/14 02:05:03 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.
 - mlflow (current: 2.5.0, required: mlflow==2.14.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


Linear Regression 0.287209783594087


 - mlflow (current: 2.5.0, required: mlflow==2.14.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


decision Tree 0.14857780466311354


  return fit_method(estimator, *args, **kwargs)
 - mlflow (current: 2.5.0, required: mlflow==2.14.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2024/07/14 02:09:46 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2024/07/14 02:09:46 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


random forest 0.2164569990266173


In [None]:
def train(in_alpha, in_l1_ratio):
    import logging
    import warnings

    import numpy as np
    import pandas as pd
    from sklearn.linear_model import ElasticNet
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split

    import mlflow
    import mlflow.sklearn
    from mlflow.models import infer_signature

    logging.basicConfig(level=logging.WARN)
    logger = logging.getLogger(__name__)

    def eval_metrics(actual, pred):
        rmse = np.sqrt(mean_squared_error(actual, pred))
        mae = mean_absolute_error(actual, pred)
        r2 = r2_score(actual, pred)
        return rmse, mae, r2

    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Read the wine-quality csv file from the URL
    csv_url = (
        "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    )
    try:
        data = pd.read_csv(csv_url, sep=";")
    except Exception as e:
        logger.exception(
            "Unable to download training & test CSV, check your internet connection. Error: %s", e
        )

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)

    # The predicted column is "quality" which is a scalar from [3, 9]
    train_x = train.drop(["quality"], axis=1)
    test_x = test.drop(["quality"], axis=1)
    train_y = train[["quality"]]
    test_y = test[["quality"]]

    # Set default values if no alpha is provided
    alpha = 0.5 if float(in_alpha) is None else float(in_alpha)

    # Set default values if no l1_ratio is provided
    l1_ratio = 0.5 if float(in_l1_ratio) is None else float(in_l1_ratio)

    # Useful for multiple runs (only doing one run in this sample notebook)
    with mlflow.start_run():
        # Execute ElasticNet
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        # Evaluate Metrics
        predicted_qualities = lr.predict(test_x)
        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        # Print out metrics
        print(f"Elasticnet model (alpha={alpha:f}, l1_ratio={l1_ratio:f}):")
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Infer model signature
        predictions = lr.predict(train_x)
        signature = infer_signature(train_x, predictions)

        # Log parameter, metrics, and model to MLflow
        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        mlflow.sklearn.log_model(lr, "model", signature=signature)

train(0.5,0.5)


 - mlflow (current: 2.5.0, required: mlflow==2.14.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


Elasticnet model (alpha=0.500000, l1_ratio=0.500000):
  RMSE: 0.7931640229276851
  MAE: 0.6271946374319586
  R2: 0.10862644997792614


 - mlflow (current: 2.5.0, required: mlflow==2.14.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


In [28]:
import sklearn

sklearn.__version__

'1.5.1'