In [None]:
import pandas as pd
# Read the CSV at this path into the DataFrame used by the rest of the notebook.
dataset_path = '/content/winequality-red.csv'
wine_data = pd.read_csv(dataset_path)

In [None]:
# Minimum 'quality' score for a wine to be labelled as good.
good_quality_threshold = 7

# Replace the 'quality' column with a binary 'is_good_quality' label
# (1 when quality >= good_quality_threshold, otherwise 0).
#
# The guard keeps this cell idempotent: once 'quality' has been dropped, running
# the cell again is a no-op instead of a KeyError on wine_data['quality'].
# To apply a different threshold, re-run the data-loading cell first.
if 'quality' in wine_data.columns:
    wine_data = wine_data.assign(
        is_good_quality=(wine_data['quality'] >= good_quality_threshold).astype(int)
    ).drop(columns='quality')

In [None]:
# Per-column count of missing (NaN/None) entries in wine_data.
null_mask = wine_data.isna()
missing_values = null_mask.sum()

In [None]:
# Pairwise Pearson correlation between every pair of columns in wine_data
# (Pearson is the pandas default; it is spelled out here for the reader).
correlation_matrix = wine_data.corr(method='pearson')

In [None]:
# Using the IQR method to detect and remove outliers.
#
# The fences are computed on the feature columns only. The binary
# 'is_good_quality' label must be excluded: when at least 75% of the rows are 0
# its Q1 and Q3 are both 0, the IQR is 0, and every positive row would be
# flagged as an "outlier" and dropped, leaving a single-class dataset.
outlier_features = wine_data.drop(columns=['is_good_quality'], errors='ignore')

Q1 = outlier_features.quantile(0.25)
Q3 = outlier_features.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is an outlier if ANY of its feature values falls outside its column's fences.
is_outlier_row = ((outlier_features < lower_fence) | (outlier_features > upper_fence)).any(axis=1)

# Keep all columns (label included) for the rows that passed the check.
wine_data_filtered = wine_data[~is_outlier_row]

In [None]:
#Using Min-Max Scaling for normalization
from sklearn.preprocessing import MinMaxScaler

# Only the feature columns are scaled; the class label is carried over unchanged
# so it stays an integer 0/1 column instead of being passed through the scaler.
target_column = 'is_good_quality'
scaled_columns = [col for col in wine_data_filtered.columns if col != target_column]

# NOTE(review): the scaler is fit on the whole filtered dataset, i.e. before the
# train/test split performed later in the notebook, so test rows influence the
# min/max used for scaling. Consider fitting on the training rows only.
scaler = MinMaxScaler()
wine_data_normalized = pd.DataFrame(
    scaler.fit_transform(wine_data_filtered[scaled_columns]),
    columns=scaled_columns,
)

# Re-attach the label positionally (the new frame has a fresh RangeIndex, so
# index alignment with wine_data_filtered must be bypassed) and restore the
# original column order.
if target_column in wine_data_filtered.columns:
    wine_data_normalized[target_column] = wine_data_filtered[target_column].to_numpy()
wine_data_normalized = wine_data_normalized[list(wine_data_filtered.columns)]

In [None]:
#Results
# Number of rows dropped by the outlier filter.
rows_before = len(wine_data)
rows_after = len(wine_data_filtered)

# Summary of the preprocessing steps; as the last expression of the cell it is
# what the notebook displays.
preprocessing_summary = {
    "Missing Values": missing_values,
    "Correlation Matrix": correlation_matrix,
    "Number of Outliers Removed": rows_before - rows_after,
    "Normalized Data Sample": wine_data_normalized.head(),
}
preprocessing_summary

{'Missing Values': fixed acidity           0
 volatile acidity        0
 citric acid             0
 residual sugar          0
 chlorides               0
 free sulfur dioxide     0
 total sulfur dioxide    0
 density                 0
 pH                      0
 sulphates               0
 alcohol                 0
 is_good_quality         0
 dtype: int64,
 'Correlation Matrix':                       fixed acidity  volatile acidity  citric acid  \
 fixed acidity              1.000000         -0.256131     0.671703   
 volatile acidity          -0.256131          1.000000    -0.552496   
 citric acid                0.671703         -0.552496     1.000000   
 residual sugar             0.114777          0.001918     0.143577   
 chlorides                  0.093705          0.061298     0.203823   
 free sulfur dioxide       -0.153794         -0.010504    -0.060978   
 total sulfur dioxide      -0.113181          0.076470     0.035533   
 density                    0.668047          0.02202

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# train and test sets split
# X holds every column except the label; y is the binary 'is_good_quality' label.
X = wine_data_normalized.drop('is_good_quality', axis=1)
y = wine_data_normalized['is_good_quality']

# 80/20 split with a fixed seed for reproducibility. stratify=y keeps the
# good/not-good proportion the same in the train and test sets, so the test set
# is not left with too few (or no) positive samples when the label is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Decision Trees Classifier
# fit() returns the estimator itself, so construction and training are chained.
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out test rows.
dt_predictions = dt_classifier.predict(X_test)

In [None]:
# Random Forest Classifier
# fit() returns the estimator itself, so construction and training are chained.
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out test rows.
rf_predictions = rf_classifier.predict(X_test)

In [None]:
# Evaluating
# Score each model's test-set predictions with the same two metrics, in the
# order (accuracy, F1).
dt_accuracy, dt_f1_score = (
    metric(y_test, dt_predictions) for metric in (accuracy_score, f1_score)
)
rf_accuracy, rf_f1_score = (
    metric(y_test, rf_predictions) for metric in (accuracy_score, f1_score)
)

scores_by_model = {
    "Decision Tree Classifier": (dt_accuracy, dt_f1_score),
    "Random Forest Classifier": (rf_accuracy, rf_f1_score),
}

# Last expression of the cell: nested dict displayed as the cell output.
{
    model_name: {"Accuracy": accuracy, "F1 Score": f1}
    for model_name, (accuracy, f1) in scores_by_model.items()
}

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


{'Decision Tree Classifier': {'Accuracy': 1.0, 'F1 Score': 0.0},
 'Random Forest Classifier': {'Accuracy': 1.0, 'F1 Score': 0.0}}