# Assignment 3: Red wine quality


## Importing libraries and useful tools

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

## Loading the data

In [2]:
# Path to the red-wine quality CSV, relative to the notebook's working
# directory, so the notebook is not tied to one user's absolute
# "C:\Users\...\OneDrive\..." location.
# NOTE(review): assumes winequality-red.csv sits next to this notebook (the
# original absolute path pointed into the assignment_3 folder) - adjust
# DATA_PATH if the file lives elsewhere.
DATA_PATH = "winequality-red.csv"

# Load the dataset and preview the first rows.
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


## Creating a new column with binary values

In [3]:
# Wines whose 'quality' score is at or above this value are labelled 1 (good);
# all other wines are labelled 0.
GOOD_QUALITY_THRESHOLD = 7

# Replace the 'quality' score with the binary 'is_good_quality' target.
# The guard makes the cell safe to re-run: once 'quality' has been dropped,
# running the cell again leaves `data` unchanged instead of raising KeyError.
if 'quality' in data.columns:
    data = (
        data
        .assign(
            is_good_quality=lambda df: (
                df['quality'] >= GOOD_QUALITY_THRESHOLD
            ).astype(int)
        )
        .drop(columns='quality')
    )
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,is_good_quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0


## Performing data pre-processing

In [15]:
# Checking for missing values: count the nulls in every column and print the
# counts, so the outcome of the check is visible in the notebook output
# (previously the result was computed but never shown).
missing_values = data.isnull().sum()
print("Missing values per column:")
print(missing_values)
print()

# Creating a correlation matrix: pairwise correlation between all columns,
# including the binary 'is_good_quality' target.
correlation_matrix = data.corr()
print(correlation_matrix)


                      fixed acidity  volatile acidity  citric acid  \
fixed acidity              1.000000         -0.256131     0.671703   
volatile acidity          -0.256131          1.000000    -0.552496   
citric acid                0.671703         -0.552496     1.000000   
residual sugar             0.114777          0.001918     0.143577   
chlorides                  0.093705          0.061298     0.203823   
free sulfur dioxide       -0.153794         -0.010504    -0.060978   
total sulfur dioxide      -0.113181          0.076470     0.035533   
density                    0.668047          0.022026     0.364947   
pH                        -0.682978          0.234937    -0.541904   
sulphates                  0.183006         -0.260987     0.312770   
alcohol                   -0.061668         -0.202288     0.109903   
is_good_quality            0.120061         -0.270712     0.214716   

                      residual sugar  chlorides  free sulfur dioxide  \
fixed acidity    

In [16]:
# Separate the predictor columns (X) from the binary label (y).
target_column = 'is_good_quality'
X = data.drop(columns=target_column)
y = data[target_column]

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Standardize the features. The scaler learns its statistics from the
# training rows only and is then applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Preview the first scaled training rows with the original column labels.
scaled_preview = pd.DataFrame(X_train_scaled, columns=X.columns)
print(scaled_preview.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0       0.169536         -1.721071     0.459303       -0.454282  -0.503192   
1       2.446067         -0.401957     1.841055       -0.384209  -0.354641   
2      -0.647680          0.037747    -0.001281        0.036226  -0.078761   
3      -1.114661          0.752267    -0.973624       -0.314137  -0.906402   
4      -0.472562          0.257600    -0.666568       -0.244064  -0.227312   

   free sulfur dioxide  total sulfur dioxide   density        pH  sulphates  \
0             3.610972              0.909502 -0.401203  1.011807   1.226612   
1            -0.966099             -0.968038  1.099235 -2.106876   1.226612   
2            -0.381792              0.939785 -0.205494  0.492026   0.297271   
3             0.299899              0.031298 -1.020949  1.141752  -0.980573   
4             1.078975              0.394693 -0.259857 -0.222672  -0.399735   

    alcohol  
0  0.550057  
1 -0.205175  
2  0.550057  


## Applying the following classification algorithms to the given dataset:

    (i)  Decision Trees Classifier
    (ii) Random Forest Classifier
 

In [17]:
# Build both models up front, each with its own fixed seed.
dt_classifier = DecisionTreeClassifier(random_state=42)  # Decision Trees Classifier
rf_classifier = RandomForestClassifier(random_state=42)  # Random Forest Classifier

# Train each model on the scaled training features.
for model in (dt_classifier, rf_classifier):
    model.fit(X_train_scaled, y_train)

# Predict labels for the scaled test features.
y_pred_dt = dt_classifier.predict(X_test_scaled)
y_pred_rf = rf_classifier.predict(X_test_scaled)


## Evaluation

In [18]:
# Score the decision tree predictions against the held-out labels.
accuracy_dt, f1_dt = (
    accuracy_score(y_test, y_pred_dt),
    f1_score(y_test, y_pred_dt),
)

# Score the random forest predictions against the same labels.
accuracy_rf, f1_rf = (
    accuracy_score(y_test, y_pred_rf),
    f1_score(y_test, y_pred_rf),
)

# results: one (heading, accuracy, F1) entry per model, printed in order.
results = (
    ("Decision Trees Classifier:", accuracy_dt, f1_dt),
    ("\nRandom Forest Classifier:", accuracy_rf, f1_rf),
)
for heading, accuracy, f1 in results:
    print(heading)
    print(f"Accuracy: {accuracy:.5f}")
    print(f"F1 Score: {f1:.5f}")

Decision Trees Classifier:
Accuracy: 0.86042
F1 Score: 0.55629

Random Forest Classifier:
Accuracy: 0.88750
F1 Score: 0.55000
