In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
# Load the red-wine quality dataset (semicolon-delimited CSV) from the UCI archive.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "wine-quality/winequality-red.csv"
)
df = pd.read_csv(url, sep=";")

In [4]:
# Preview the first rows to sanity-check that the columns were parsed correctly.
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
# Summarise column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [6]:
# Separate the target column from the predictor columns.
y = df["quality"]
x = df.drop(columns=["quality"])

In [7]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
#training multiple models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Baseline: ordinary least-squares linear regression.
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)


# Random forest with default hyper-parameters. The seed is fixed (same value as
# the split and XGBoost) so that re-running the notebook reproduces the same
# predictions and metrics; an unseeded forest gives different numbers each run.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)


# Gradient-boosted trees via XGBoost, also seeded for reproducibility.
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)


In [9]:
#evaluation

def evaluate_model(name, y_true, y_pred):
  """Print a header for ``name`` followed by the MSE and R2 score of ``y_pred`` vs ``y_true``."""
  print(f"\n{name} Evaluation:")
  # Each metric is computed and printed in turn, MSE first and then R2.
  for label, metric in (("MSE:", mean_squared_error), ("R2_score:", r2_score)):
    print(label, metric(y_true, y_pred))


In [10]:
# Report test-set metrics for every fitted model, in the order they were trained.
for model_name, predictions in (
    ("Linear Regression", lr_pred),
    ("Random Forest", rf_pred),
    ("XGBoost", xgb_pred),
):
  evaluate_model(model_name, y_test, predictions)



Linear Regression Evaluation:
MSE: 0.39002514396395416
R2_score: 0.4031803412796231

Random Forest Evaluation:
MSE: 0.30688875000000004
R2_score: 0.5303963298913612

XGBoost Evaluation:
MSE: 0.3425526022911072
R2_score: 0.47582316398620605


In [13]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-3.1.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.1.0 (from mlflow)
  Downloading mlflow_skinny-3.1.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.0->mlflow)
  Downloading databricks_sdk-0.56.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.0->mlflow)
  Downloading opentelemetry_api-1.34.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-sdk<3,>=1.9.0 (from mlflow-skinny==3.1.0->mlflow)
  Downloading opentele

In [14]:
!pip install mlflow pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.11-py3-none-any.whl.metadata (9.4 kB)
Downloading pyngrok-7.2.11-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.11


In [15]:
import mlflow
import mlflow.sklearn

In [16]:
#manual logging example
from sklearn.metrics import mean_squared_error , r2_score

# Train a depth-limited random forest inside an MLflow run and log its
# test-set metrics and the fitted model to that run.
with mlflow.start_run(run_name="Random_forest-run"):
  rf = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
  rf.fit(X_train, y_train)
  preds = rf.predict(X_test)

  #metrics
  mse = mean_squared_error(y_test, preds)
  # Store the score under its own name. Assigning it to `r2_score` would replace
  # the imported sklearn function with a float at notebook scope and make every
  # later r2_score(...) call (e.g. inside evaluate_model) raise TypeError.
  r2 = r2_score(y_test, preds)

  mlflow.log_metric("mse", mse)
  mlflow.log_metric("r2_score", r2)

  mlflow.sklearn.log_model(rf,"random_forest_model")

  print(f"Logged Metrics : MSE ={mse}, R2={r2}")




Logged Metrics : MSE =0.36751452069813606, R2=0.4376262807350808


In [17]:
#!pip install mlflow pyngrok


!ngrok config add-authtoken 2yMN71VZpfZnirjAcZVpEpEYNNe_5bfjpZX5SwrSyW3NQhDgA

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [18]:
# Start MLflow UI server in the background on port 5000
# (the trailing "&" backgrounds the shell command so the cell does not block on it).
get_ipython().system_raw("mlflow ui --port 5000 &")


In [19]:
from pyngrok import ngrok
import os

# Export the address of the local MLflow server as the tracking URI.
os.environ.update({"MLFLOW_TRACKING_URI": "http://127.0.0.1:5000"})

# Open an ngrok tunnel to port 5000 and show the public address it was given.
mlflow_tunnel = ngrok.connect(5000)
print("MLflow UI:", mlflow_tunnel.public_url)


MLflow UI: https://8ef0-34-16-234-156.ngrok-free.app


## Push to GitHub

In [20]:
# Write the versions of all packages installed in this environment to requirements.txt.
!pip freeze > requirements.txt