My name is Robert S.

I have chosen the Wine Quality dataset from https://archive.ics.uci.edu/dataset/186/wine+quality
--> Cortez, P., Cerdeira, A., Almeida, F., Matos, T., & Reis, J. (2009). Wine Quality [Dataset]. UCI Machine Learning Repository. https://doi.org/10.24432/C56S3T.

The project will process information related to wine quality

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
# Load the red-wine CSV; the file is semicolon-delimited, so the separator
# has to be passed explicitly (the pandas default is a comma).
df = pd.read_csv("wine_quality/winequality-red.csv", sep = ';')
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [13]:
# Show the raw column labels and the (rows, columns) shape before cleaning.
print("View existing columns:", df.columns.tolist())
print(f"Dataset shape: {df.shape}")

View existing columns: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']
Dataset shape: (1599, 12)


In [14]:
def standardize_column_names(df):
    """
    Return a copy of ``df`` whose column names are in snake_case format.

    Each column label is converted to a string, stripped of leading and
    trailing white space, lower-cased, has its spaces replaced by
    underscores and its parentheses removed. Consistent names are essential
    for reproducibility and reduce processing errors later on.

    The input data frame is left untouched, so cells that still refer to the
    original column names keep working when they are re-run.

    Args:
        df (pd.DataFrame): Original data frame.

    Returns:
        pd.DataFrame: New data frame with the standardized column names.
    """
    # Work on a copy: assigning to ``df.columns`` directly would rename the
    # caller's frame as a hidden side effect.
    renamed = df.copy()
    # str() lets non-string labels (e.g. integer headers) pass through
    # instead of failing on ``.strip()``.
    renamed.columns = [
        str(col).strip().lower().replace(' ', '_').replace('(', '').replace(')', '')
        for col in df.columns
    ]
    return renamed

def handle_data_integrity(df, drop_duplicates=True):
    """
    Clean the rows of a data frame: drop missing data and duplicates.

    Rows containing at least one missing value are removed. Exact duplicate
    rows are removed as well (unless disabled), so repeated observations do
    not bias the analysis. The index of the result is renumbered from zero.

    Args:
        df (pd.DataFrame): The original data frame.
        drop_duplicates (bool): When True, duplicate rows are removed.
            Defaults to True.

    Returns:
        pd.DataFrame: Data frame with the cleaned rows and a fresh index.
    """
    # Only call dropna() when there is actually something missing.
    has_missing = df.isnull().values.any()
    cleaned = df.dropna() if has_missing else df

    if drop_duplicates:
        cleaned = cleaned.drop_duplicates()

    # Discard the old index so the rows are numbered 0..n-1 again.
    return cleaned.reset_index(drop=True)

In [15]:
# Clean the raw frame: standardize the column names first, then drop
# missing and duplicate rows.
df_clean = handle_data_integrity(standardize_column_names(df))

# visualize the result
print("View existing columns:", df_clean.columns.tolist())
print(f"Dataset shape: {df_clean.shape}")
df_clean.head()

View existing columns: ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'ph', 'sulphates', 'alcohol', 'quality']
Dataset shape: (1359, 12)


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,ph,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
