<a href="https://colab.research.google.com/github/santiago2588/distillation_column_training/blob/main/Soluciones_colab/03_baseline_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Data wrangling
import pandas as pd
import numpy as np

In [2]:
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm import tqdm

In [3]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [5]:
!wget https://raw.githubusercontent.com/santiago2588/distillation_column_training/main/data/transformed_normalized_data.csv -O transformed_normalized_data.csv

--2025-07-07 13:45:06--  https://raw.githubusercontent.com/santiago2588/distillation_column_training/main/data/transformed_normalized_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18167 (18K) [text/plain]
Saving to: ‘transformed_normalized_data.csv’


2025-07-07 13:45:06 (17.1 MB/s) - ‘transformed_normalized_data.csv’ saved [18167/18167]



In [6]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv("transformed_normalized_data.csv")

# Display the first few rows to confirm it loaded correctly
df.head()

Unnamed: 0,PressureC1_diff,FlowC1,Temp1,Yield
0,-0.007811,1.484445,0.634587,69.400623
1,-1.868888,2.151224,-0.11668,66.532666
2,-0.020794,1.54816,-1.190829,71.102193
3,0.03435,2.084094,-1.198368,69.793481
4,-0.039157,1.250497,-0.978822,71.489516


In [7]:
# Define features (X) and target variable (y)
X = df.drop('Yield', axis=1)  # Features (all columns except 'Job Offer')
y = df['Yield']  # Target variable

In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Check the shape of the data
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (202, 3)
Testing data shape: (51, 3)


In [9]:
def get_metrics(y_true, y_pred):

    dict_metrics = {
        'MSE': mean_squared_error(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2 Score': r2_score(y_true, y_pred),

    }
    return dict_metrics

In [10]:
# Train linear regression
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)

# Predict
y_pred = lin_model.predict(X_test)

# Evaluate Metrics
metrics = get_metrics(y_test, y_pred)

# View Results
metrics

{'MSE': 2.5915634060208155,
 'MAE': 1.1139055895144714,
 'R2 Score': -0.27054244969558505}

In [11]:
# Train ridge regression
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

# Predict
y_pred = ridge_model.predict(X_test)

# Evaluate Metrics
metrics = get_metrics(y_test, y_pred)

# View Results
metrics

{'MSE': 2.5848925313278217,
 'MAE': 1.1128539282187329,
 'R2 Score': -0.2672719800422638}

In [12]:
# Train lasso regression
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)

# Predict
y_pred = ridge_model.predict(X_test)

# Evaluate Metrics
metrics = get_metrics(y_test, y_pred)

# View Results
metrics

{'MSE': 2.5848925313278217,
 'MAE': 1.1128539282187329,
 'R2 Score': -0.2672719800422638}