### Task1 - Wine Quality Dataset

### Task 2: Data Preprocessing
##### 1. Objective: Learn how to preprocess data for machine learning models.
##### 2. Description:
###### - Load a given dataset.
###### - Perform data cleaning (handle missing values, remove duplicates, etc.).
###### - Normalize or standardize the data.
###### - Split the data into training and testing sets.

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load the dataset
file_path = '/Users/riteshrohilla/Desktop/wine+quality/winequality-white.csv'
try:
    data = pd.read_csv(file_path, delimiter=';')
    print("Dataset loaded successfully")
except FileNotFoundError:
    print("File not found. Please check the file path.")
except pd.errors.ParserError:
    print("Error parsing the file. Please check the file format.")
except Exception as e:
    print(f"An error occurred: {e}")

Dataset loaded successfully


In [3]:
# Display initial dataset info
print("\nInitial data info:")
print(data.info())


Initial data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB
None


In [4]:
# Handle missing values
missing_values_count = data.isnull().sum().sum()
if missing_values_count > 0:
    print(f"\nMissing values found: {missing_values_count}")
    data = data.dropna()
    print("Missing values removed.")
else:
    print("\nNo missing values found.")


No missing values found.


In [5]:
# Remove duplicates
duplicate_rows_count = data.duplicated().sum()
if duplicate_rows_count > 0:
    print(f"\nDuplicate rows found: {duplicate_rows_count}")
    data = data.drop_duplicates()
    print("Duplicate rows removed.")
else:
    print("\nNo duplicate rows found.")


Duplicate rows found: 937
Duplicate rows removed.


In [6]:
# Normalize or standardize the data
scaler = MinMaxScaler()
try:
    # Assuming all columns except the target column are numeric for normalization
    numeric_columns = data.select_dtypes(include='number').columns
    data[numeric_columns] = scaler.fit_transform(data[numeric_columns])
    print("\nData normalized successfully")
except Exception as e:
    print(f"\nAn error occurred during normalization: {e}")


Data normalized successfully


In [8]:
target_column = 'quality'
if target_column in data.columns:
    X = data.drop(columns=[target_column])
    y = data[target_column]

    try:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        print("Data split into training and testing sets successfully")
    except Exception as e:
        print(f"An error occurred during train-test split: {e}")
else:
    print(f"Target column '{target_column}' not found in the dataset.")

Data split into training and testing sets successfully


### Task 3: Model Training and Evaluation
##### 1. Objective: Train and evaluate a machine learning model.
##### 2. Description:
###### - Choose a suitable algorithm (e.g., linear regression, decision tree, etc.).
###### - Train the model using the training dataset.
###### - Evaluate the model's performance using the testing dataset.
###### - Report the model's accuracy, precision, recall, and F1-score

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [11]:
# Load the dataset with the correct delimiter
file_path = '/Users/riteshrohilla/Desktop/wine+quality/winequality-white.csv'
try:
    data = pd.read_csv(file_path, delimiter=';')
    print("Dataset loaded successfully")
except FileNotFoundError:
    print("File not found. Please check the file path.")
except pd.errors.ParserError:
    print("Error parsing the file. Please check the file format.")
except Exception as e:
    print(f"An error occurred: {e}")

Dataset loaded successfully


In [12]:
# Display initial dataset info
print("Initial data info:")
print(data.info())

Initial data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB
None


In [13]:
# Check the first few rows to ensure correct loading
print(data.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.0              0.27         0.36            20.7      0.045   
1            6.3              0.30         0.34             1.6      0.049   
2            8.1              0.28         0.40             6.9      0.050   
3            7.2              0.23         0.32             8.5      0.058   
4            7.2              0.23         0.32             8.5      0.058   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 45.0                 170.0   1.0010  3.00       0.45   
1                 14.0                 132.0   0.9940  3.30       0.49   
2                 30.0                  97.0   0.9951  3.26       0.44   
3                 47.0                 186.0   0.9956  3.19       0.40   
4                 47.0                 186.0   0.9956  3.19       0.40   

   alcohol  quality  
0      8.8        6  
1      9.5        6  
2     10.1        6 

In [14]:
# Handle missing values
missing_values_count = data.isnull().sum().sum()
if missing_values_count > 0:
    print(f"Missing values found: {missing_values_count}")
    data = data.dropna()
    print("Missing values removed.")
else:
    print("No missing values found.")

No missing values found.


In [15]:
# Remove duplicates
duplicate_rows_count = data.duplicated().sum()
if duplicate_rows_count > 0:
    print(f"Duplicate rows found: {duplicate_rows_count}")
    data = data.drop_duplicates()
    print("Duplicate rows removed.")
else:
    print("No duplicate rows found.")

Duplicate rows found: 937
Duplicate rows removed.


In [16]:
# Normalize the Data
scaler = MinMaxScaler()
try:
    data_normalized = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
    print("Data normalized successfully")
except Exception as e:
    print(f"An error occurred during normalization: {e}")

Data normalized successfully


In [17]:
# Assuming 'quality' is the name of the column you want to predict
target_column = 'quality'
if target_column in data.columns:
    X = data_normalized.drop(columns=[target_column])
    y = data[target_column]

    # Convert the target variable to integer categories
    y = y.astype(int)
    
    # Check the distribution of the target variable
    print("Target variable distribution:")
    print(y.value_counts())

    try:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        print("Data split into training and testing sets successfully")
    except Exception as e:
        print(f"An error occurred during train-test split: {e}")
else:
    print(f"Target column '{target_column}' not found in the dataset.")

Target variable distribution:
quality
6    1788
5    1175
7     689
4     153
8     131
3      20
9       5
Name: count, dtype: int64
Data split into training and testing sets successfully


In [18]:
# Instantiate the classifier
clf = DecisionTreeClassifier(random_state=42)

In [19]:
# Train the classifier
clf.fit(X_train, y_train)
print("Model training completed")


Model training completed


In [20]:
# Predict on the testing dataset
y_pred = clf.predict(X_test)

In [21]:
# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Accuracy: 0.43757881462799497
Precision: 0.436958023902114
Recall: 0.43757881462799497
F1-Score: 0.43596999327010083


In [22]:
# Display a detailed classification report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, zero_division=1))


Classification Report:

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.25      0.19      0.22        31
           5       0.48      0.44      0.46       235
           6       0.49      0.52      0.51       358
           7       0.32      0.35      0.33       138
           8       0.12      0.12      0.12        26
           9       1.00      0.00      0.00         1

    accuracy                           0.44       793
   macro avg       0.38      0.23      0.23       793
weighted avg       0.44      0.44      0.44       793



### Task 4: Model Tuning and Optimization
##### 1. Objective: Improve the performance of a machine learning model.
##### 2. Description:
###### - Perform hyperparameter tuning (e.g., grid search, random search).
###### - Implement cross-validation to ensure the model's robustness.
###### - Compare the performance of the tuned model with the original model.
###### - Discuss any improvements or observations.

In [36]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier


In [37]:
# Load the dataset
file_path = '/Users/riteshrohilla/Desktop/wine+quality/winequality-white.csv'
try:
    data = pd.read_csv(file_path, delimiter=';')
    print("Dataset loaded successfully")
except FileNotFoundError:
    print("File not found. Please check the file path.")
except pd.errors.ParserError:
    print("Error parsing the file. Please check the file format.")
except Exception as e:
    print(f"An error occurred: {e}")

Dataset loaded successfully


In [38]:
# Display initial dataset info
print("\nInitial data info:")
print(data.info())


Initial data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB
None


In [39]:
# Handle missing values
missing_values_count = data.isnull().sum().sum()
if missing_values_count > 0:
    print(f"\nMissing values found: {missing_values_count}")
    data = data.dropna()
    print("Missing values removed.")
else:
    print("\nNo missing values found.")


No missing values found.


In [40]:
# Remove duplicates
duplicate_rows_count = data.duplicated().sum()
if duplicate_rows_count > 0:
    print(f"\nDuplicate rows found: {duplicate_rows_count}")
    data = data.drop_duplicates()
    print("Duplicate rows removed.")
else:
    print("\nNo duplicate rows found.")


Duplicate rows found: 937
Duplicate rows removed.


In [41]:
# Normalize the Data
scaler = MinMaxScaler()
try:
    data_normalized = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
    print("\nData normalized successfully")
except Exception as e:
    print(f"\nAn error occurred during normalization: {e}")


Data normalized successfully


In [42]:
# Split data into features (X) and target (y)
target_column = 'quality'
if target_column in data.columns:
    X = data_normalized.drop(columns=[target_column])
    y = data[target_column]
    
    # Convert the target variable to integer categories
    y = y.astype(int)
    
    # Check the distribution of the target variable
    print("\nTarget variable distribution:")
    print(y.value_counts())
    # Train-test split
    try:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        print("\nData split into training and testing sets successfully")
    except Exception as e:
        print(f"\nAn error occurred during train-test split: {e}")
else:
    print(f"\nTarget column '{target_column}' not found in the dataset.")


Target variable distribution:
quality
6    1788
5    1175
7     689
4     153
8     131
3      20
9       5
Name: count, dtype: int64

Data split into training and testing sets successfully


In [43]:
# Instantiate the classifier
clf = DecisionTreeClassifier(random_state=42)

In [44]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

In [46]:
# Perform Grid Search with Cross-Validation using StratifiedKFold
skf = StratifiedKFold(n_splits=3)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=skf, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [48]:
# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validation Score: {best_score}")

Best Parameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Cross-Validation Score: 0.51010101010101


In [49]:
# Train the classifier with the best parameters
best_clf = grid_search.best_estimator_
best_clf.fit(X_train, y_train)
print("Tuned model training completed")

Tuned model training completed


In [50]:
# Predict on the testing dataset
y_pred = best_clf.predict(X_test)

In [51]:
# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Accuracy: 0.5119798234552333
Precision: 0.5468505783057205
Recall: 0.5119798234552333
F1-Score: 0.47463570724819837


In [52]:
# Display a detailed classification report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, zero_division=1))


Classification Report:

              precision    recall  f1-score   support

           3       1.00      0.00      0.00         4
           4       1.00      0.00      0.00        31
           5       0.53      0.63      0.57       235
           6       0.51      0.65      0.57       358
           7       0.48      0.20      0.28       138
           8       1.00      0.00      0.00        26
           9       1.00      0.00      0.00         1

    accuracy                           0.51       793
   macro avg       0.79      0.21      0.20       793
weighted avg       0.55      0.51      0.47       793

