# loading  the white wine and red wine datasets

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
import numpy as np

print("hellow world")

hellow world


In [2]:
# Load the red and white wine datasets from CSV files
# 'red.csv' contains data about red wine and 'white.csv' contains data about white wine.
red_wine = pd.read_csv('red.csv')
white_wine = pd.read_csv('white.csv')


# Combine data from both files and add color column, fill NaN values wiht mean value

In [3]:
# Add a 'color' column to distinguish between wine types in the combined dataset
# Assign '0' for red wine and '1' for white wine to help in binary classification later
red_wine['color'] = 0  # 0 represents red wine
white_wine['color'] = 1  # 1 represents white wine

# Combine the red and white wine datasets into a single dataset for processing
# Ignore the original indices to create a new unified index
wine_data = pd.concat([red_wine, white_wine], ignore_index=True)

# Handle missing values by replacing them with the mean of their respective columns
# This ensures that missing data does not negatively affect the training of the model
wine_data.fillna(wine_data.mean(), inplace=True)

# Plot Histogram of data

In [None]:
# Iterate over each column (excluding 'color') to plot histograms
for column in wine_data.columns:
    if column != 'color':  # Skip the 'color' column
        plt.figure(figsize=(10, 6))
        
        # Plot the histogram for red wine
        red_wine[column].plot(kind='hist', bins=30, alpha=0.5, color='red', label='Red Wine')
        
        # Plot the histogram for white wine
        white_wine[column].plot(kind='hist', bins=30, alpha=0.5, color='blue', label='White Wine')
        
        # Add labels, title, and legend
        plt.title(f'Histogram of {column}')
        plt.xlabel(column)
        plt.ylabel('Frequency')
        plt.legend()
        
        # Save the plot as a PNG file
        plt.savefig(f'{column}_histogram.png')
        
        # Display the plot
        #plt.show()  #uncomment to see histograms here

# Prepare training data (X) and labels (Y), labels can be color or wine quality

In [12]:
# Define the target column for classification
# The target can either be 'color' for binary classification (red vs. white wine)
# or 'quality' for multi-class classification (wine quality on a numeric scale)
target_column = 'quality'  # Change to 'quality' if multi-class classification is desired

# Separate features (X) and target (y)
# The features are all columns except 'color' and 'quality', depending on the target
# The target is the column defined above, either 'color' or 'quality'
X = wine_data.drop(columns=['color', 'quality'])
y = wine_data[target_column]

# Standardize the features to bring them to the same scale
# This helps the neural network model to converge faster and more reliably
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets (80% train, 20% test)
# A random state is used for reproducibility of the results
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Create Classifier and apply to test data

In [17]:
# Create and train the support vector machine (SVM) classifier
# We use an SVM with a radial basis function (RBF) kernel and a random state for reproducibility
svm = SVC(kernel='poly', random_state=42) # changed kernel to rbf from poly
svm.fit(X_train, y_train)

# Make predictions on the test set using the trained model
y_pred = svm.predict(X_test)

# Print basic information about the trained SVM
print("\nSVM Training Details:")
print(f"Number of support vectors for each class: {svm.n_support_}")
print(f"Indices of support vectors: {svm.support_}")


SVM Training Details:
Number of support vectors for each class: [  24  173 1517 1984  851  157    4]
Indices of support vectors: [  61  205  235 ... 2355 3051 3742]


# Compare predicted labels with grount truth labels 

In [18]:
# Calculate evaluation metrics for the model's performance
# Metrics include accuracy, precision, recall, and a confusion matrix
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
conf_matrix = confusion_matrix(y_test, y_pred)

# Display the calculated metrics to understand the model's performance
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.55
Precision: 0.55
Recall: 0.55

Confusion Matrix:
[[  1   0   2   3   0   0   0]
 [  2   2  20  19   0   0   0]
 [  0   0 221 179   2   0   0]
 [  0   1 119 462  15   0   0]
 [  0   0   3 177  34   1   0]
 [  0   0   0  27   9   0   0]
 [  0   0   0   1   0   0   0]]

Classification Report:
              precision    recall  f1-score   support

           3       0.33      0.17      0.22         6
           4       0.67      0.05      0.09        43
           5       0.61      0.55      0.58       402
           6       0.53      0.77      0.63       597
           7       0.57      0.16      0.25       215
           8       0.00      0.00      0.00        36
           9       0.00      0.00      0.00         1

    accuracy                           0.55      1300
   macro avg       0.39      0.24      0.25      1300
weighted avg       0.55      0.55      0.51      1300

