In [2]:
import numpy as np
import pandas as pd
df = pd.read_csv('winequality-red.csv')
#Problem 1
# Create a new column 'good_quality' with binary values (0 or 1)
df['good_quality'] = df['quality'].apply(lambda x: 1 if x >= 7 else 0)

# Drop the original 'quality' column
df.drop('quality', axis=1, inplace=True)

# Display the updated DataFrame
print(df)

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0               7.4             0.700         0.00             1.9      0.076   
1               7.8             0.880         0.00             2.6      0.098   
2               7.8             0.760         0.04             2.3      0.092   
3              11.2             0.280         0.56             1.9      0.075   
4               7.4             0.700         0.00             1.9      0.076   
...             ...               ...          ...             ...        ...   
1594            6.2             0.600         0.08             2.0      0.090   
1595            5.9             0.550         0.10             2.2      0.062   
1596            6.3             0.510         0.13             2.3      0.076   
1597            5.9             0.645         0.12             2.0      0.075   
1598            6.0             0.310         0.47             3.6      0.067   

      free sulfur dioxide  

In [14]:
# Problem 2 Cleaning data
# Drop rows with missing values
df.dropna(inplace=True)
# Drop duplicate rows
df.drop_duplicates(inplace=True)
# Handling outliers 
# Apply outlier detection and removal to each numerical column
for column in df.select_dtypes(include='number').columns:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = (df[column] < lower_bound) | (df[column] > upper_bound)
    df = df[~outliers]

# Print the filtered data
print("Filtered Data:")
print(df)

Original Data:
      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0               7.4             0.700         0.00             1.9      0.076   
1               7.8             0.880         0.00             2.6      0.098   
2               7.8             0.760         0.04             2.3      0.092   
3              11.2             0.280         0.56             1.9      0.075   
5               7.4             0.660         0.00             1.8      0.075   
...             ...               ...          ...             ...        ...   
1593            6.8             0.620         0.08             1.9      0.068   
1594            6.2             0.600         0.08             2.0      0.090   
1595            5.9             0.550         0.10             2.2      0.062   
1597            5.9             0.645         0.12             2.0      0.075   
1598            6.0             0.310         0.47             3.6      0.067   

      free s

In [15]:
from sklearn.preprocessing import StandardScaler
# Select the columns you want to normalize (numeric columns)
numeric_columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar' , 'chlorides','free sulfur dioxide','total sulfur dioxide', 'density','pH','sulphates','alcohol']

# Create a StandardScaler object
scaler = StandardScaler()

# Fit and transform the selected columns
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# Display the normalized DataFrame
print(df)

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0         -0.472111          0.972346    -1.310602       -0.644802  -0.205727   
1         -0.190742          2.059990    -1.310602        0.944759   1.320685   
2         -0.190742          1.334894    -1.086277        0.263519   0.904391   
3          2.200894         -1.565491     1.829938       -0.644802  -0.275109   
5         -0.472111          0.730647    -1.310602       -0.871882  -0.275109   
...             ...               ...          ...             ...        ...   
1593      -0.894164          0.488948    -0.861953       -0.644802  -0.760786   
1594      -1.316218          0.368099    -0.861953       -0.417722   0.765626   
1595      -1.527245          0.065975    -0.749791        0.036438  -1.177080   
1597      -1.527245          0.640010    -0.637629       -0.417722  -0.275109   
1598      -1.456902         -1.384217     1.325209        3.215560  -0.830168   

      free sulfur dioxide  

In [16]:
correlation_matrix = df.corr()

# Print the correlation matrix
print("Correlation Matrix:")
print(correlation_matrix)

Correlation Matrix:
                      fixed acidity  volatile acidity  citric acid  \
fixed acidity              1.000000         -0.252886     0.630269   
volatile acidity          -0.252886          1.000000    -0.603405   
citric acid                0.630269         -0.603405     1.000000   
residual sugar             0.224448          0.022286     0.120686   
chlorides                  0.175252          0.097202     0.053683   
free sulfur dioxide       -0.140062         -0.057950    -0.056093   
total sulfur dioxide      -0.057713          0.044428     0.056887   
density                    0.624544          0.007888     0.306849   
pH                        -0.684223          0.209022    -0.445123   
sulphates                  0.146134         -0.274271     0.231390   
alcohol                   -0.043472         -0.171369     0.106898   
good_quality                    NaN               NaN          NaN   

                      residual sugar  chlorides  free sulfur dioxide 

In [20]:
# Problem 3 classifying using decision tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
target_column = 'good_quality'
# Split the data into training and testing sets
X = df.drop(target_column, axis=1)  # Features
y = df[target_column]  # Target variable

# Split the data into 70% training and 30% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#Creating a decision tree classifier
clf = DecisionTreeClassifier(random_state=42)

# Train the classifier on the training set
clf.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = clf.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# You can also print other metrics, like classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       258

    accuracy                           1.00       258
   macro avg       1.00      1.00      1.00       258
weighted avg       1.00      1.00      1.00       258



In [21]:
from sklearn.ensemble import RandomForestClassifier
target_column = 'good_quality'

X = df.drop(target_column, axis=1)  # Features
y = df[target_column]  # Target variable

# Split the data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# 5. Train the classifier on the training set
rf_classifier.fit(X_train, y_train)

# 6. Make predictions on the testing set
y_pred = rf_classifier.predict(X_test)

# 7. Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# You can also print other metrics, like classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       172

    accuracy                           1.00       172
   macro avg       1.00      1.00      1.00       172
weighted avg       1.00      1.00      1.00       172

