In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

In [18]:
#load the data
combined_wine = pd.read_csv('combined_wine.csv')

In [19]:
#copy of the data
wine_data = combined_wine.copy()

In [20]:
# Add a synthetic price column based on quality (for demonstration purposes)
wine_data['price'] = wine_data['quality'] * 5 + np.random.normal(0, 2, len(wine_data))
wine_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type,price
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,24.841535
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red,28.473459
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red,26.029514
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red,27.833498
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,19.705086


In [21]:

# Features and target
X = wine_data[['quality', 'alcohol', 'volatile acidity', 'pH', 'type']]  # Include quality and other features
y = wine_data['price']  # Target variable: price

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the preprocessor
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), ['quality', 'alcohol', 'volatile acidity', 'pH']),  # Scale numeric features
    ('cat', OneHotEncoder(), ['type'])  # Encode categorical features
])

# Define the pipeline
pipeline = Pipeline([
    ('preprocessing', preprocessor),  # Apply preprocessing
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))  # Use Random Forest Regressor
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print evaluation results
print(f"Model Evaluation:")
print(f"Mean Absolute Error (MAE): ${mae:.2f}")
print(f"R² Score: {r2:.2f}")


# Add default values for other features required by the model
wine_data['alcohol'] = 12.0  # Default alcohol content
wine_data['volatile acidity'] = 0.4  # Default volatile acidity
wine_data['pH'] = 3.3  # Default pH
wine_data['type'] = 'red'  # Default wine type

# Predict prices for each unique quality
wine_data['predicted_price'] = pipeline.predict(wine_data)

# Display the results
print("Predicted Prices for Each Wine Quality:")
print(wine_data[['quality', 'predicted_price']])

Model Evaluation:
Mean Absolute Error (MAE): $1.71
R² Score: 0.79
Predicted Prices for Each Wine Quality:
      quality  predicted_price
0           5        25.897963
1           5        25.897963
2           5        25.897963
3           6        31.017931
4           5        25.897963
...       ...              ...
6492        6        31.017931
6493        5        25.897963
6494        6        31.017931
6495        7        36.423223
6496        6        31.017931

[6497 rows x 2 columns]
