My name is Robert S.

I have chosed the Wine quality dataset from https://archive.ics.uci.edu/dataset/186/wine+quality
--> Cortez, P., Cerdeira, A., Almeida, F., Matos, T., & Reis, J. (2009). Wine Quality [Dataset]. UCI Machine Learning Repository. https://doi.org/10.24432/C56S3T.

The project will process information related to wine quality

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("wine_quality/winequality-red.csv", sep = ';')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
print("View existing columns:", df.columns.tolist())
print(f"Dataset shape: {df.shape}")

View existing columns: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']
Dataset shape: (1599, 12)


In [4]:
def standardize_column_names(df):
    """
    Standardize column names to be in snake_case format

    This function trims white spaces, replaces white spaces by underscores and converts to lower case. 
    It is essential for reproductibility and to reduce processing errors later on

    Args:
        df (pd.DataFrame): Original data frame.

    Returns:
        pd.DataFrame: Data frame with modified columns.
    """
    df.columns = [col.strip().lower().replace(' ', '_').replace('(', '').replace(')', '') 
                  for col in df.columns]
    return df

def handle_data_integrity(df, drop_duplicates=True):
    """
    Check and fix data integrity (remove duplicates and missing data)

    The dataset can have duplicate rows. The function identifies and eliminates the duplicates to prevent bias in the analysis

    Args:
        df (pd.DataFrame): The original data frame.
        drop_duplicates (bool):If set to True, it removes duplicates. Implicit is set to True.

    Returns:
        pd.DataFrame: Data frame with modified rows.
    """
    # check for missing data (in my case, Iris is already cleaned but it is good to do this)
    if df.isnull().values.any():
        df = df.dropna()
        
    # handle duplicates
    if drop_duplicates:
        df = df.drop_duplicates()
        
    return df.reset_index(drop=True)

In [5]:
df_clean = (df
            .pipe(standardize_column_names)
            .pipe(handle_data_integrity))

# vizualize result
print("View existing columns:", df_clean.columns.tolist())
print(f"Dataset shape: {df_clean.shape}")
df_clean.head()

View existing columns: ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'ph', 'sulphates', 'alcohol', 'quality']
Dataset shape: (1359, 12)


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5


In [6]:
def perform_eda_summary(df, target_column=None):
    """
    Performs a comprehensive Exploratory Data Analysis summary.
    
    This function provides:
    1. Basic shape and data type information.
    2. Descriptive statistics for numerical columns.
    3. Grouped analysis based on a target category (e.g., species).
    4. A correlation matrix to identify relationships between features.

    Args:
        df (pd.DataFrame): The cleaned DataFrame.
        target_column (str): The column to group by for categorical analysis.

    Returns:
        None: Prints the analysis to the console.
    """
    print("--- 1. DATASET OVERVIEW ---")
    print(f"Total Rows: {df.shape[0]} | Total Columns: {df.shape[1]}")
    print("\nColumn Data Types:")
    print(df.dtypes)
    
    print("\n--- 2. SUMMARY STATISTICS ---")
    # Using .describe().T for a cleaner, transposed view
    display(df.describe().T)
    
    if target_column and target_column in df.columns:
        print(f"\n--- 3. GROUPED ANALYSIS (by {target_column}) ---")
        # Calculating the mean for each feature per class
        display(df.groupby(target_column).mean())
    
    print("\n--- 4. CORRELATION MATRIX ---")
    # Calculating Pearson correlation for numerical features
    correlation = df.select_dtypes(include=['number']).corr()
    display(correlation)

In [7]:
# --- APPLYING THE FUNCTION ---
# Assuming 'df' is your cleaned DataFrame from Task 5
perform_eda_summary(df, target_column='species')

--- 1. DATASET OVERVIEW ---
Total Rows: 1599 | Total Columns: 12

Column Data Types:
fixed_acidity           float64
volatile_acidity        float64
citric_acid             float64
residual_sugar          float64
chlorides               float64
free_sulfur_dioxide     float64
total_sulfur_dioxide    float64
density                 float64
ph                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

--- 2. SUMMARY STATISTICS ---


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed_acidity,1599.0,8.319637,1.741096,4.6,7.1,7.9,9.2,15.9
volatile_acidity,1599.0,0.527821,0.17906,0.12,0.39,0.52,0.64,1.58
citric_acid,1599.0,0.270976,0.194801,0.0,0.09,0.26,0.42,1.0
residual_sugar,1599.0,2.538806,1.409928,0.9,1.9,2.2,2.6,15.5
chlorides,1599.0,0.087467,0.047065,0.012,0.07,0.079,0.09,0.611
free_sulfur_dioxide,1599.0,15.874922,10.460157,1.0,7.0,14.0,21.0,72.0
total_sulfur_dioxide,1599.0,46.467792,32.895324,6.0,22.0,38.0,62.0,289.0
density,1599.0,0.996747,0.001887,0.99007,0.9956,0.99675,0.997835,1.00369
ph,1599.0,3.311113,0.154386,2.74,3.21,3.31,3.4,4.01
sulphates,1599.0,0.658149,0.169507,0.33,0.55,0.62,0.73,2.0



--- 4. CORRELATION MATRIX ---


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
fixed_acidity,1.0,-0.256131,0.671703,0.114777,0.093705,-0.153794,-0.113181,0.668047,-0.682978,0.183006,-0.061668,0.124052
volatile_acidity,-0.256131,1.0,-0.552496,0.001918,0.061298,-0.010504,0.07647,0.022026,0.234937,-0.260987,-0.202288,-0.390558
citric_acid,0.671703,-0.552496,1.0,0.143577,0.203823,-0.060978,0.035533,0.364947,-0.541904,0.31277,0.109903,0.226373
residual_sugar,0.114777,0.001918,0.143577,1.0,0.05561,0.187049,0.203028,0.355283,-0.085652,0.005527,0.042075,0.013732
chlorides,0.093705,0.061298,0.203823,0.05561,1.0,0.005562,0.0474,0.200632,-0.265026,0.37126,-0.221141,-0.128907
free_sulfur_dioxide,-0.153794,-0.010504,-0.060978,0.187049,0.005562,1.0,0.667666,-0.021946,0.070377,0.051658,-0.069408,-0.050656
total_sulfur_dioxide,-0.113181,0.07647,0.035533,0.203028,0.0474,0.667666,1.0,0.071269,-0.066495,0.042947,-0.205654,-0.1851
density,0.668047,0.022026,0.364947,0.355283,0.200632,-0.021946,0.071269,1.0,-0.341699,0.148506,-0.49618,-0.174919
ph,-0.682978,0.234937,-0.541904,-0.085652,-0.265026,0.070377,-0.066495,-0.341699,1.0,-0.196648,0.205633,-0.057731
sulphates,0.183006,-0.260987,0.31277,0.005527,0.37126,0.051658,0.042947,0.148506,-0.196648,1.0,0.093595,0.251397
