In [32]:
import pandas as pd
import numpy as np

In [33]:
df = pd.read_excel("winequality-red.xlsx")

In [34]:
df.head()

  and should_run_async(code)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [35]:
df.columns

  and should_run_async(code)


Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb


# Load and map the dataset
label_mapping = {3: 0, 4: 1, 5: 2, 6: 3, 7: 4, 8: 5}
df['quality_mapped'] = df['quality'].map(label_mapping)

# Split the dataset
X = df.drop(columns=['quality', 'quality_mapped'])
y = df['quality_mapped']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert splits to DataFrames
train_df = X_train.copy()
train_df['quality'] = y_train

test_df = X_test.copy()
test_df['quality'] = y_test


In [37]:
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=6, random_state=42)
xgb_model.fit(X_train, y_train)

# Evaluate XGBoost model
y_pred = xgb_model.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred))
print("XGBoost Classification Report:\n", classification_report(y_test, y_pred))


XGBoost Accuracy: 0.696875
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00        10
           2       0.75      0.80      0.78       130
           3       0.68      0.73      0.70       132
           4       0.64      0.55      0.59        42
           5       0.00      0.00      0.00         5

    accuracy                           0.70       320
   macro avg       0.34      0.35      0.34       320
weighted avg       0.67      0.70      0.68       320



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [38]:
!pip install gymnasium



In [41]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class WineQualityEnv(gym.Env):
    def __init__(self, data, model):
        super(WineQualityEnv, self).__init__()
        self.data = data.reset_index(drop=True)
        self.model = model
        self.state_index = 0

        # Define observation space (wine features + XGBoost prediction)
        num_features = data.shape[1] - 1  # Exclude 'quality'
        # Modified to include the XGBoost prediction in the observation space
        self.observation_space = spaces.Box(
            low=np.concatenate([np.float32(data.drop(columns=['quality']).min().values), [0]]),  # Assuming prediction range starts at 0
            high=np.concatenate([np.float32(data.drop(columns=['quality']).max().values), [5]]), # Assuming prediction range ends at 5
            dtype=np.float32
        )

        # Define action space (quality levels)
        self.action_space = spaces.Discrete(6)

    def reset(self, seed=None):
        """Reset the environment to the initial state."""
        super().reset(seed=seed)
        self.state_index = 0
        state = self._get_state()  # Get the state
        # Return the state and an empty dictionary for info
        return state, {}

    def _get_state(self):
        """Get the current state."""
        if self.state_index < len(self.data):
            row = self.data.iloc[self.state_index]
            features = row.drop('quality').values
            xgb_prediction = self.model.predict([features])[0]
            observation = np.append(features, xgb_prediction).astype(np.float32)
            return observation
        return None

    def step(self, action):
      """Take an action and return the results."""
      if self.state_index >= len(self.data):
          return None, 0, True, False, {}  # Episode is terminated

      row = self.data.iloc[self.state_index]
      true_quality = row['quality']

      # Assign reward
      reward = 1 if action == true_quality else -1
      self.state_index += 1

      # Check if the episode is done
      terminated = self.state_index >= len(self.data) # Renamed done to terminated
      truncated = False
      next_state = self._get_state()
      return next_state, reward, terminated, truncated, {}

In [42]:
from stable_baselines3 import DQN
from stable_baselines3.common.env_checker import check_env

# Split train and test data into environments
train_env = WineQualityEnv(train_df, xgb_model)
test_env = WineQualityEnv(test_df, xgb_model)

# Check custom environment
check_env(train_env)

# Train the RL Agent
dqn_model = DQN("MlpPolicy", train_env, verbose=1, learning_rate=0.001, gamma=0.95, buffer_size=50000)
dqn_model.learn(total_timesteps=10000)

# Evaluate the RL Agent
def evaluate_agent(env, model, n_episodes=10):
    total_rewards = []  # To store rewards for each episode
    correct_actions = 0  # Count of correct predictions
    total_actions = 0  # Total number of actions taken by the agent

    for episode in range(n_episodes):
        obs, _ = env.reset()  # Reset the environment and get the initial state
        episode_reward = 0  # Initialize the reward for the episode
        done = False

        while not done:
            # The RL model predicts the next action
            action, _ = model.predict(obs, deterministic=True)

            # Perform the action in the environment
            obs, reward, done, _, _ = env.step(action)
            episode_reward += reward  # Add the reward for this action

            # Track total actions and correct actions
            total_actions += 1
            if reward == 1:  # Reward of 1 indicates a correct action
                correct_actions += 1

        total_rewards.append(episode_reward)  # Store the total reward for the episode

    # Calculate and print average reward
    avg_reward = np.mean(total_rewards)
    print(f"Average Reward per Episode: {avg_reward:.2f}")

    # Calculate and print accuracy
    accuracy = (correct_actions / total_actions) * 100
    print(f"Hybrid Model Accuracy: {accuracy:.2f}%")



  gym.logger.warn(
  gym.logger.warn(
  gym.logger.warn(
  gym.logger.warn(


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.28e+03 |
|    ep_rew_mean      | 238      |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 300      |
|    time_elapsed     | 17       |
|    total_timesteps  | 5116     |
| train/              |          |
|    learning_rate    | 0.001    |
|    loss             | 0.28     |
|    n_updates        | 1253     |
----------------------------------


In [43]:
# Evaluate the hybrid model
evaluate_agent(test_env, dqn_model, n_episodes=10)


Average Reward per Episode: 126.00
Hybrid Model Accuracy: 69.69%
