In [24]:
import yaml
import os

In [25]:
# Data wrangling
import pandas as pd
import numpy as np

In [26]:
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm import tqdm

In [27]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [28]:
# Load the configuration file
with open('../config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Access the settings
project_folder = config['projectFolder']
df_path = os.path.join(project_folder, config['normalizedDataFile'])
df = pd.read_csv(df_path)

# Display the first few rows
df.head()

Unnamed: 0,PressureC1_diff,FlowC1,Temp1,Yield
0,-0.007811,1.484445,0.634587,69.400623
1,-1.868888,2.151224,-0.11668,66.532666
2,-0.020794,1.54816,-1.190829,71.102193
3,0.03435,2.084094,-1.198368,69.793481
4,-0.039157,1.250497,-0.978822,71.489516


In [29]:
# Define features (X) and target variable (y)
X = df.drop('Yield', axis=1)  # Features (all columns except 'Job Offer')
y = df['Yield']  # Target variable

In [30]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Check the shape of the data
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

Training data shape: (202, 3)
Testing data shape: (51, 3)


In [31]:
def get_metrics(y_true, y_pred):

    dict_metrics = {
        'MSE': mean_squared_error(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'R2 Score': r2_score(y_true, y_pred),
        
    }
    return dict_metrics

In [32]:
# Train linear regression
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)

# Predict
y_pred = lin_model.predict(X_test)

# Evaluate Metrics
metrics = get_metrics(y_test, y_pred)

# View Results
metrics

{'MSE': 2.5915634060208155,
 'MAE': 1.1139055895144712,
 'R2 Score': -0.27054244969558505}

In [33]:
# Train ridge regression
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

# Predict
y_pred = ridge_model.predict(X_test)

# Evaluate Metrics
metrics = get_metrics(y_test, y_pred)

# View Results
metrics

{'MSE': 2.5848925313278217,
 'MAE': 1.1128539282187329,
 'R2 Score': -0.2672719800422638}

In [34]:
# Train lasso regression
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)

# Predict
y_pred = ridge_model.predict(X_test)

# Evaluate Metrics
metrics = get_metrics(y_test, y_pred)

# View Results
metrics

{'MSE': 2.5848925313278217,
 'MAE': 1.1128539282187329,
 'R2 Score': -0.2672719800422638}