# Importing and preparing wine data

## Libraries and settings

In [24]:
# Libraries
import os
import re
import time
import fnmatch
import datetime
import numpy as np
import pandas as pd


# Ignore warnings
# NOTE(review): called without a category or module, this filter hides every
# warning raised for the rest of the session, including pandas deprecation
# and chained-assignment warnings.
import warnings
warnings.filterwarnings("ignore")

# Get current working directory
# (the data-loading cell below uses paths relative to this directory)
print(os.getcwd())

/Users/bastian/vs_code/wine_analysis


## Importing data

In [25]:
# Location of the raw data, relative to the project root
RAW_DATA_PATH = './data/raw/wine.csv'

# Switch to the project root (one level above the "notebooks" folder) only
# when the data file cannot be reached from the current directory.
# An unconditional os.chdir('../') climbs one more level every time this cell
# is re-run, after which read_csv fails with FileNotFoundError.
if not os.path.exists(RAW_DATA_PATH):
    os.chdir('../')

# Read the data to a pandas data frame
df = pd.read_csv(RAW_DATA_PATH, sep=',', encoding='utf-8')

# Show first records of data frame
df.head()

FileNotFoundError: [Errno 2] No such file or directory: './data/raw/wine.csv'

## Count number of rows and columns in the data frame

In [3]:
# Read the shape once and unpack it into its two components
shape = df.shape
n_rows, n_cols = shape

# Dimension (rows, columns)
print('Dimension:', shape)

# Row count, then column count
print('Number of rows:', n_rows)
print('Number of columns:', n_cols)

Dimension: (580, 7)
Number of rows: 580
Number of columns: 7


## Get data types (raw-format from web scraping)

In [4]:
# Get data types (note that in pandas, a string is referred to as 'object')
# Left as the bare last expression of the cell so the notebook renders the
# per-column dtype listing.
df.dtypes

web-scraper-order         object
web-scraper-start-url     object
name_raw                  object
price_raw                float64
country_raw               object
quantity_raw              object
value_raw                 object
dtype: object

## Extract and save relevant information from raw data using regular expressions (regex)

### Extract country

In [5]:


# Function to extract the country from 'country_raw'
def extract_country(country_raw):
    """Return the country part of a scraped 'country_raw' value.

    The country is the text before the first comma (e.g. 'Schweiz, 2023'
    -> 'Schweiz'). Surrounding whitespace is removed so that values such as
    ' Schweiz , 2023' still yield 'Schweiz' and can be matched by exact
    string comparisons and lookups afterwards.

    Parameters
    ----------
    country_raw : str or missing value
        Raw scraped text.

    Returns
    -------
    str or None
        The stripped country text, or None when the input is missing or
        contains no text before the first comma.
    """
    if pd.isna(country_raw):  # Handle NaN values
        return None
    match = re.match(r'^[^,]+', country_raw)  # Matches everything before the first comma
    if match is None:
        return None
    country = match.group(0).strip()
    # Whitespace-only text before the comma counts as "no country"
    return country or None

# Derive 'country' by running the extraction on every 'country_raw' value
df['country'] = df['country_raw'].map(extract_country)

# Compare raw and extracted values for the first records
preview_columns = ['country_raw', 'country']
print(df[preview_columns].head())


     country_raw  country
0        Italien  Italien
1         Rimuss   Rimuss
2  Schweiz, 2023  Schweiz
3        Italien  Italien
4  Schweiz, 2023  Schweiz


### Get data types of all variables including the new ones

In [6]:
# Re-inspect the dtypes now that the derived 'country' column has been added
df.dtypes

web-scraper-order         object
web-scraper-start-url     object
name_raw                  object
price_raw                float64
country_raw               object
quantity_raw              object
value_raw                 object
country                   object
dtype: object

## Count and identify missing values (if any)

In [7]:
# Per-column tally of missing values
print(df.isna().sum())

# First records that contain at least one missing value
has_missing = df.isna().any(axis=1)
df[has_missing].head()

web-scraper-order        0
web-scraper-start-url    0
name_raw                 0
price_raw                0
country_raw              0
quantity_raw             0
value_raw                0
country                  0
dtype: int64


Unnamed: 0,web-scraper-order,web-scraper-start-url,name_raw,price_raw,country_raw,quantity_raw,value_raw,country


## Count and identify duplicated values (if any)

In [8]:
# Number of rows that are exact repeats of an earlier row
print(df.duplicated().sum())

# Rows that repeat an earlier ('web-scraper-order', 'name_raw') combination
key_columns = ['web-scraper-order', 'name_raw']
df[df.duplicated(subset=key_columns)]

0


Unnamed: 0,web-scraper-order,web-scraper-start-url,name_raw,price_raw,country_raw,quantity_raw,value_raw,country


## Create additional variables from the wine's name

### Change strings in 'name_raw' to uppercase 

In [9]:
# Upper-case every wine name and write the result back to 'name_raw'
upper_names = df['name_raw'].str.upper()
df['name_raw'] = upper_names

# Preview the first ten converted names
print(upper_names.head(10), '\n')

0    NATURAPLAN BIO-PROSECCO DOC RAPHAEL DAL BO, EX...
1          RIMUSS PARTY TRAUBENSAFT WEISS, ALKOHOLFREI
2      ZÜRICH AOC STAATSSCHREIBER CUVÉE BLANC PRESTIGE
3    PROSECCO SUPERIORE DI VALDOBBIADENE CONEGLIANO...
4           AIGLE LES MURAILLES CHABLAIS AOC H. BADOUX
5                   CHAMPAGNE AOC CHARLES BERTIN, BRUT
6               PROSECCO DOC VIGNE DEI DOGI, EXTRA DRY
7                  ZÜRICH AOC FEDERWEISSER STAUFFACHER
8                 BLU SECCO DOSE VINO FRIZZANTE 6X20CL
9                     TOSCANA IGT ROSSO VILLA ANTINORI
Name: name_raw, dtype: object 



### Calculate length of strings in 'name_raw'

In [10]:
# Print the wine name stored under index label 0
first_name = df['name_raw'][0]
print(first_name)

# Character count of every name, stored as 'name_raw_len'
name_lengths = df['name_raw'].str.len()
df['name_raw_len'] = name_lengths
print(df['name_raw_len'], '\n')

NATURAPLAN BIO-PROSECCO DOC RAPHAEL DAL BO, EXTRA DRY
0      53
1      43
2      47
3      79
4      42
       ..
575    36
576    49
577    35
578    56
579    48
Name: name_raw_len, Length: 580, dtype: int64 



### Create new binary (0/1) variable 'bio' from the name

In [11]:
# Create a pattern which can be used to search the variable 'name_raw'
# NOTE(review): this is a plain substring match, so 'BIO' also matches longer
# words that merely contain these letters - confirm that is acceptable.
pattern = 'BIO|NATURAPLAN'

# Create new variable 'bio' as binary dummy (0/1) variable.
# na=False maps missing names to 0; without it str.contains returns NaN for
# them and the conversion to int fails.
df['bio'] = df['name_raw'].str.contains(pat=pattern, na=False).astype(int)
print(df['bio'].sum())

# Show values
df[['name_raw', 'country_raw', 'bio']]

40


Unnamed: 0,name_raw,country_raw,bio
0,"NATURAPLAN BIO-PROSECCO DOC RAPHAEL DAL BO, EX...",Italien,1
1,"RIMUSS PARTY TRAUBENSAFT WEISS, ALKOHOLFREI",Rimuss,0
2,ZÜRICH AOC STAATSSCHREIBER CUVÉE BLANC PRESTIGE,"Schweiz, 2023",0
3,PROSECCO SUPERIORE DI VALDOBBIADENE CONEGLIANO...,Italien,0
4,AIGLE LES MURAILLES CHABLAIS AOC H. BADOUX,"Schweiz, 2023",0
...,...,...,...
575,BOURGOGNE AOC CHARDONNAY LOUIS JADOT,"Frankreich, 2022",0
576,VALAIS AOC ROSÉ OEIL DE PERDRIX LE ROSEL 6X 75CL,"Schweiz, 2023",0
577,LEGARIS CRIANZA RIBERA DEL DUERO DO,"Spanien, 2021",0
578,RIPASSO DELLA VALPOLICELLA DOC SUPERIORE VIGNE...,"Italien, 2022",0


### Create new binary (0/1) variable 'non_alcoholic' from the name

In [12]:
# Create a pattern which can be used to search the variable 'name_raw'
pattern = 'ALKOHOLFREI|OHNE ALKOHOL'

# Create new variable 'non_alcoholic' as binary dummy (0/1) variable.
# na=False maps missing names to 0; without it str.contains returns NaN for
# them and the conversion to int fails.
df['non_alcoholic'] = df['name_raw'].str.contains(pat=pattern, na=False).astype(int)
print(df['non_alcoholic'].sum())

# Show values
df[['name_raw', 'non_alcoholic']]

40


Unnamed: 0,name_raw,non_alcoholic
0,"NATURAPLAN BIO-PROSECCO DOC RAPHAEL DAL BO, EX...",0
1,"RIMUSS PARTY TRAUBENSAFT WEISS, ALKOHOLFREI",1
2,ZÜRICH AOC STAATSSCHREIBER CUVÉE BLANC PRESTIGE,0
3,PROSECCO SUPERIORE DI VALDOBBIADENE CONEGLIANO...,0
4,AIGLE LES MURAILLES CHABLAIS AOC H. BADOUX,0
...,...,...
575,BOURGOGNE AOC CHARDONNAY LOUIS JADOT,0
576,VALAIS AOC ROSÉ OEIL DE PERDRIX LE ROSEL 6X 75CL,0
577,LEGARIS CRIANZA RIBERA DEL DUERO DO,0
578,RIPASSO DELLA VALPOLICELLA DOC SUPERIORE VIGNE...,0


### Create new binary (0/1) variable 'premium' from the name

In [13]:
# Create a pattern which can be used to search the variable 'name_raw'
pattern = 'PRESTIGE|EXKLUSIV|SUPERIORE|GRAND CRU|PREMIUM'

# Create new variable 'premium' as binary dummy (0/1) variable.
# na=False maps missing names to 0; without it str.contains returns NaN for
# them and the conversion to int fails.
df['premium'] = df['name_raw'].str.contains(pat=pattern, na=False).astype(int)
print(df['premium'].sum())

# Show values
df[['name_raw', 'premium']]

80


Unnamed: 0,name_raw,premium
0,"NATURAPLAN BIO-PROSECCO DOC RAPHAEL DAL BO, EX...",0
1,"RIMUSS PARTY TRAUBENSAFT WEISS, ALKOHOLFREI",0
2,ZÜRICH AOC STAATSSCHREIBER CUVÉE BLANC PRESTIGE,1
3,PROSECCO SUPERIORE DI VALDOBBIADENE CONEGLIANO...,1
4,AIGLE LES MURAILLES CHABLAIS AOC H. BADOUX,0
...,...,...
575,BOURGOGNE AOC CHARDONNAY LOUIS JADOT,0
576,VALAIS AOC ROSÉ OEIL DE PERDRIX LE ROSEL 6X 75CL,0
577,LEGARIS CRIANZA RIBERA DEL DUERO DO,0
578,RIPASSO DELLA VALPOLICELLA DOC SUPERIORE VIGNE...,1


### Create new categorical variable 'wine_category' based on name_raw

In [14]:
# Define patterns for each category in uppercase.
# Each value is a regular expression searched anywhere in the wine name.
# The GLÜHWEIN alternative 'GEWÜRZT' carries a negative lookahead so that it
# no longer matches the grape name 'GEWÜRZTRAMINER' (listed under WEISSWEIN);
# without it, every Gewürztraminer was relabelled as GLÜHWEIN by the loop below.
patterns = {
    'ROTWEIN': 'ROT|ROSSO|CABERNET|MERLOT|PINOT NOIR|SHIRAZ|MALBEC|ZINFANDEL|TEMPRANILLO|BAROLO|RIOJA|CHIANTI|NERO D\'AVOLA|SANGIOVESE|GRENACHE|CARMENERE|SYRAH',
    'WEISSWEIN': 'WEISS|CHARDONNAY|BLANC|BIANCO|BLANCHE|RIESLING|PINOT GRIGIO|GEWÜRZTRAMINER|VERDEJO|SAUVIGNON|MÜLLER-THURGAU|GRÜNER VELTLINER|ALBARIÑO|TREBBIANO',
    'SCHAUMWEIN': 'CHAMPAGNER|PROSECCO|CAVA|CRÉMANT|FRANCIACORTA|SEKT|SCHAUMWEIN|RIMUSS|ASTI|SPUMANTE|MOUSSEUX|BRUT|EXTRA DRY|SECO|SECCO|FRIZZANTE|SPARKLING|MOSCATO',
    'ROSÉWEIN': 'ROSÉ|PROVENÇAL ROSÉ|ZINFANDEL ROSÉ|CERASUOLO|ROSATO|BLUSH|VIN GRIS|PINK',
    'GLÜHWEIN': 'GLÜHWEIN|PUNSCH|HEISSER|WINTER|GEWÜRZT(?!RAMINER)|MULLED|SPICED WINE'
}

# Create a new column 'wine_category' initialized with 'UNCATEGORIZED'
df['wine_category'] = 'UNCATEGORIZED'

# Assign categories based on patterns.
# Categories are applied in dict order and each match overwrites the previous
# label, so a name matching several patterns ends up in the LAST matching
# category (e.g. a name containing both 'WEISS' and 'RIMUSS' becomes SCHAUMWEIN).
for category, pattern in patterns.items():
    df.loc[df['name_raw'].str.contains(pattern, case=False, na=False), 'wine_category'] = category

# Print count of each category
print(df['wine_category'].value_counts())

# Show the first few rows with the new category column
df[['name_raw', 'wine_category']].head()




wine_category
SCHAUMWEIN       220
UNCATEGORIZED    160
WEISSWEIN        100
ROTWEIN           40
GLÜHWEIN          40
ROSÉWEIN          20
Name: count, dtype: int64


Unnamed: 0,name_raw,wine_category
0,"NATURAPLAN BIO-PROSECCO DOC RAPHAEL DAL BO, EX...",SCHAUMWEIN
1,"RIMUSS PARTY TRAUBENSAFT WEISS, ALKOHOLFREI",SCHAUMWEIN
2,ZÜRICH AOC STAATSSCHREIBER CUVÉE BLANC PRESTIGE,WEISSWEIN
3,PROSECCO SUPERIORE DI VALDOBBIADENE CONEGLIANO...,SCHAUMWEIN
4,AIGLE LES MURAILLES CHABLAIS AOC H. BADOUX,UNCATEGORIZED


### Create new categorical variable based on wine prices

In [15]:
# Define price categories with three levels
def categorize_price_3(price):
    """Map a price to one of three price tiers.

    Parameters
    ----------
    price : number or missing value
        Price of the wine.

    Returns
    -------
    str or None
        'Budget' for prices up to and including 10.00, 'Mid-Range' for prices
        up to and including 30.00, 'Premium' for anything higher, and None
        when the price is missing.
    """
    # A missing price fails both comparisons below and would otherwise be
    # reported as 'Premium'.
    if pd.isna(price):
        return None
    if price <= 10.00:
        return 'Budget'
    if price <= 30.00:
        return 'Mid-Range'
    return 'Premium'

# Derive the price tier of every wine from 'price_raw'
price_tiers = df['price_raw'].map(categorize_price_3)
df['price_category'] = price_tiers

# Tally how many wines fall into each tier
price_category_distribution = df['price_category'].value_counts()

# Render the tally as the cell output
price_category_distribution


price_category
Mid-Range    280
Budget       240
Premium       60
Name: count, dtype: int64

### Create new numeric variable 'quantity_in_cl'

In [16]:
# Take the first run of digits found in 'quantity_raw' and store it as an integer
# NOTE(review): the unit is not inspected - this presumes every value is
# given in cl (e.g. '75cl'); confirm against the full data.
quantity_digits = df['quantity_raw'].str.extract(r'(\d+)', expand=False)
df['quantity_in_cl'] = quantity_digits.astype(int)

# Preview the raw text next to the parsed number
print(df[['quantity_raw', 'quantity_in_cl']].head())


  quantity_raw  quantity_in_cl
0         75cl              75
1         70cl              70
2         75cl              75
3         75cl              75
4         70cl              70


### Create new numeric variable 'price_per_10cl'

In [17]:
# Take the leading run of digits and dots from 'value_raw' (e.g. '1.59/10cl')
# and store it as a float
value_digits = df['value_raw'].str.extract(r'([\d\.]+)', expand=False)
df['price_per_10cl'] = value_digits.astype(float)

# Preview the raw text next to the parsed number
df[['value_raw', 'price_per_10cl']].head()

Unnamed: 0,value_raw,price_per_10cl
0,1.59/10cl,1.59
1,0.60/10cl,0.6
2,1.69/10cl,1.69
3,1.46/10cl,1.46
4,3.21/10cl,3.21


### Rename columns

In [18]:
# Drop the '_raw' suffix from four of the scraped columns
column_renames = {
    'name_raw': 'name',
    'price_raw': 'price',
    'quantity_raw': 'quantity',
    'value_raw': 'value'
}
df = df.rename(columns=column_renames)

# Show the resulting column index
df.columns


Index(['web-scraper-order', 'web-scraper-start-url', 'name', 'price',
       'country_raw', 'quantity', 'value', 'country', 'name_raw_len', 'bio',
       'non_alcoholic', 'premium', 'wine_category', 'price_category',
       'quantity_in_cl', 'price_per_10cl'],
      dtype='object')

### Country filter

In [19]:
# Show all unique countries.
# Printed explicitly: a notebook only renders the last expression of a cell,
# so a bare .unique() at this position would be evaluated and thrown away.
print(df['country'].unique())

# Count country values
df['country'].value_counts()

country
Italien        200
Schweiz        140
Frankreich     120
Rimuss          40
Spanien         40
Portugal        20
Deutschland     20
Name: count, dtype: int64

### Remove rows with Country = Rimuss

In [20]:
# Remove all wines whose extracted 'country' is 'Rimuss' (these rows carry
# 'Rimuss' in 'country_raw' instead of a country name).
# .copy() makes the filtered frame independent of the original, so columns
# added to it later are not written to a slice of another DataFrame.
df = df[df['country'] != 'Rimuss'].copy()

df['country'].value_counts()

country
Italien        200
Schweiz        140
Frankreich     120
Spanien         40
Portugal        20
Deutschland     20
Name: count, dtype: int64

### Add country code

In [21]:
# Lookup table: country name as scraped -> two-letter country code
country_code_mapping = dict([
    ('Italien', 'IT'),
    ('Schweiz', 'CH'),
    ('Frankreich', 'FR'),
    ('Spanien', 'ES'),
    ('Portugal', 'PT'),
    ('Deutschland', 'DE'),
])

# Translate every country through the lookup table and store the codes
country_codes = df['country'].map(country_code_mapping)
df['country_code'] = country_codes

# Preview country names next to their codes
df[['country', 'country_code']].head()


Unnamed: 0,country,country_code
0,Italien,IT
2,Schweiz,CH
3,Italien,IT
4,Schweiz,CH
5,Frankreich,FR


### Save data to file

In [22]:
# Write the prepared frame to the current working directory, without the index
output_file = 'wine_data_prepared.csv'
df.to_csv(output_file, index=False, sep=",", encoding='utf-8')

### Jupyter notebook --footer info-- 

In [23]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Horizontal rule printed above and below the environment summary
rule = '-----------------------------------'
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

print(rule)
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', timestamp)
print('Python Version:', python_version())
print(rule)

-----------------------------------
POSIX
Darwin | 24.1.0
Datetime: 2025-01-12 14:49:50
Python Version: 3.9.6
-----------------------------------
