# In [None]:
import pandas as pd

# The red-wine dataset is read from a CSV file named 'winequality-red.csv'
file_path = 'winequality-red.csv'
wine_data = pd.read_csv(file_path)

# Step (i): Binary target -- 1 when quality is 7 or higher, otherwise 0.
# The original 'quality' column is then removed so that only the binary
# label remains in the frame.
wine_data['good_quality'] = wine_data['quality'].ge(7).astype(int)
wine_data.drop(columns=['quality'], inplace=True)

# Step (ii): Data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

# Clean the frame BEFORE deriving X and y: cleaning wine_data after the
# features/target have been extracted would leave X and y untouched.

# Drops rows with missing values
wine_data.dropna(inplace=True)

# Removes duplicate rows
wine_data.drop_duplicates(inplace=True)

# 'X' contains the features and 'y' contains the target variable 'good_quality'
X = wine_data.drop('good_quality', axis=1)
y = wine_data['good_quality']

# Split the dataset into training and testing sets
# (20% held out for testing; fixed random_state for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Pairwise correlations between all columns of the frame
correlation_matrix = wine_data.corr()

# Show the heading and the matrix, each on its own line
print("Correlation Matrix:", correlation_matrix, sep="\n")

# Dealing with outliers
# Using Z-score to remove outliers: a training row is kept only when every
# feature lies within 3 standard deviations of that feature's training mean.
# The absolute value is used so that unusually LOW values are removed too,
# not just unusually high ones.
z_scores = (X_train - X_train.mean()) / X_train.std()
X_train_no_outliers = X_train[(z_scores.abs() < 3).all(axis=1)]

# Keep the labels aligned with the surviving training rows; otherwise the
# models below would be fitted with a different number of samples in X and y.
y_train = y_train.loc[X_train_no_outliers.index]

# Normalize data using StandardScaler
# (fitted on the training rows only, then applied unchanged to the test set)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_no_outliers)
X_test_scaled = scaler.transform(X_test)

# Step (iii): Apply Classification Algorithms
# a) Decision Trees Classifier:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score

# Build the Decision Tree (fixed seed), train it on the scaled training data
# and predict the labels of the scaled test data
dt_model = DecisionTreeClassifier(random_state=10)
y_pred_dt = dt_model.fit(X_train_scaled, y_train).predict(X_test_scaled)

# Score the predictions against the true test labels
accuracy_dt = accuracy_score(y_test, y_pred_dt)
f1_score_dt = f1_score(y_test, y_pred_dt)

# Report the results
print("Decision Tree Model:")
for metric_name, metric_value in (("Accuracy:", accuracy_dt), ("F1 Score:", f1_score_dt)):
    print(metric_name, metric_value)

# b) Random Forest Classifier:
from sklearn.ensemble import RandomForestClassifier

# Build the Random Forest (fixed seed), train it on the scaled training data
# and predict the labels of the scaled test data
rf_model = RandomForestClassifier(random_state=10)
y_pred_rf = rf_model.fit(X_train_scaled, y_train).predict(X_test_scaled)

# Score the predictions against the true test labels
accuracy_rf = accuracy_score(y_test, y_pred_rf)
f1_score_rf = f1_score(y_test, y_pred_rf)

# Report the results
print("Random Forest Model:")
for metric_name, metric_value in (("Accuracy:", accuracy_rf), ("F1 Score:", f1_score_rf)):
    print(metric_name, metric_value)