In [None]:
import math
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import zscore
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the CSV; the relative path is resolved against the kernel's working directory.
csv_path = 'winequality-red.csv'
df = pd.read_csv(csv_path)

# Report the (rows, columns) tuple of the freshly loaded frame.
print("Shape of the CSV file:", df.shape)

# Last expression of the cell: rich preview of the first five rows.
df.head(5)

Shape of the CSV file: (1599, 12)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [None]:
# Binary label: 1 when the quality score is 7 or higher, otherwise 0.
is_good = (df['quality'] >= 7).astype('int64')

# Append the label as the last column and remove the raw score it was derived from.
df = df.assign(good_quality=is_good).drop(columns='quality')
print(df)

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0               7.4             0.700         0.00             1.9      0.076   
1               7.8             0.880         0.00             2.6      0.098   
2               7.8             0.760         0.04             2.3      0.092   
3              11.2             0.280         0.56             1.9      0.075   
4               7.4             0.700         0.00             1.9      0.076   
...             ...               ...          ...             ...        ...   
1594            6.2             0.600         0.08             2.0      0.090   
1595            5.9             0.550         0.10             2.2      0.062   
1596            6.3             0.510         0.13             2.3      0.076   
1597            5.9             0.645         0.12             2.0      0.075   
1598            6.0             0.310         0.47             3.6      0.067   

      free sulfur dioxide  

In [None]:
# Count the null cells in each column.
missing_per_column = df.isna().sum()

# Heading and per-column counts, each on its own line.
print("\nMissing values in the dataset:", missing_per_column, sep="\n")


Missing values in the dataset:
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
good_quality            0
dtype: int64


In [None]:
# Pairwise Pearson correlation between every pair of columns, label included.
correlation_matrix = df.corr(method='pearson')

# Heading followed by the full matrix as plain text.
print("\nCorrelation Matrix:", correlation_matrix, sep="\n")


Correlation Matrix:
                      fixed acidity  volatile acidity  citric acid  \
fixed acidity              1.000000         -0.256131     0.671703   
volatile acidity          -0.256131          1.000000    -0.552496   
citric acid                0.671703         -0.552496     1.000000   
residual sugar             0.114777          0.001918     0.143577   
chlorides                  0.093705          0.061298     0.203823   
free sulfur dioxide       -0.153794         -0.010504    -0.060978   
total sulfur dioxide      -0.113181          0.076470     0.035533   
density                    0.668047          0.022026     0.364947   
pH                        -0.682978          0.234937    -0.541904   
sulphates                  0.183006         -0.260987     0.312770   
alcohol                   -0.061668         -0.202288     0.109903   
good_quality               0.120061         -0.270712     0.214716   

                      residual sugar  chlorides  free sulfur dioxide

In [None]:
# --- Outlier removal ---------------------------------------------------------
TARGET_COL = 'good_quality'
Z_THRESHOLD = 3  # a row is dropped if any feature lies this many std devs (or more) from its mean

# Score the feature columns only. The binary label is not a measurement, and
# z-scoring it would discard every positive row as an "outlier" as soon as the
# positive class made up less than 10% of the data (|z| = sqrt((1 - p) / p) > 3).
feature_cols = df.columns.drop(TARGET_COL)
z_scores = zscore(df[feature_cols])
abs_z_scores = abs(z_scores)
filtered_entries = (abs_z_scores < Z_THRESHOLD).all(axis=1)

# Renumber the surviving rows 0..n-1 so the filtered frame, the label series and
# the scaled frame built below all carry the same index. Without this the label
# keeps the gapped original labels while the scaled frame gets a fresh
# RangeIndex, and any label-aligned operation between them pairs the wrong rows.
df = df[filtered_entries].reset_index(drop=True)

# --- Normalizing data using StandardScaler -----------------------------------
# NOTE: the scaler is fit on every remaining row, i.e. before any train/test
# split; rows later used for testing therefore contribute to the mean/std.
scaler = StandardScaler()
df_ = df[TARGET_COL]
df_scaled = pd.DataFrame(
    scaler.fit_transform(df[feature_cols]),
    columns=feature_cols,  # named explicitly rather than assuming the label is the last column
    index=df.index,
)

# Display the normalized dataframe
print("\nNormalized DataFrame:")
print(df_scaled.head())


Normalized DataFrame:
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0      -0.522732          1.063300    -1.353987       -0.652772  -0.183440   
1      -0.267721          2.147925    -1.353987        0.904079   1.270957   
2      -0.267721          1.424841    -1.138966        0.236857   0.874303   
3       1.899877         -1.467493     1.656300       -0.652772  -0.249549   
4      -0.522732          1.063300    -1.353987       -0.652772  -0.183440   

   free sulfur dioxide  total sulfur dioxide   density        pH  sulphates  \
0            -0.433282             -0.288643  0.726041  1.344308  -0.634982   
1             1.165126              1.002858  0.120457 -0.895273   0.397037   
2             0.023406              0.494085  0.241574 -0.461805   0.139032   
3             0.251750              0.728903  0.847158 -1.184251  -0.462979   
4            -0.433282             -0.288643  0.726041  1.344308  -0.634982   

    alcohol  
0 -1.011944  
1 -0.

In [None]:
# Features are the scaled frame; labels are the target series kept next to it.
X, y = df_scaled, df_

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def _fit_and_score(model):
    """Fit ``model`` on the training split and score it on the test split.

    Returns a ``(predictions, accuracy, f1)`` tuple: the predictions are for
    ``X_test`` and both metrics compare them against ``y_test``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return (
        predictions,
        accuracy_score(y_test, predictions),
        f1_score(y_test, predictions),
    )


def _print_scores(accuracy, f1):
    """Print the two metrics rounded to four decimal places."""
    print(f"Accuracy Score: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")


# (iii) Decision tree: fit, predict on the held-out rows, report.
dt_model = DecisionTreeClassifier(random_state=42)
dt_predictions, dt_accuracy, dt_f1 = _fit_and_score(dt_model)

print("Decision Trees Classifier:")
_print_scores(dt_accuracy, dt_f1)

# (iv) Random forest: same procedure on the same split.
rf_model = RandomForestClassifier(random_state=42)
rf_predictions, rf_accuracy, rf_f1 = _fit_and_score(rf_model)

print("\nRandom Forest Classifier:")
_print_scores(rf_accuracy, rf_f1)


Decision Trees Classifier:
Accuracy Score: 0.8831
F1 Score: 0.6133

Random Forest Classifier:
Accuracy Score: 0.8992
F1 Score: 0.5763
