In [1]:
import pandas as pd
from pathlib import Path

# File to Load (Remember to Change These)
wine_data = Path("Resources/WineData_raw.csv")
city_data = Path("Resources/CityData_raw.csv")

In [2]:
wine_df = pd.read_csv(wine_data)
wine_df.head(10)

Unnamed: 0.1,Unnamed: 0,country,points,price,province,title,variety,winery
0,0,Italy,87,,Sicily & Sardinia,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,87,15.0,Douro,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,87,14.0,Oregon,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,87,13.0,Michigan,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,87,65.0,Oregon,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
5,5,Spain,87,15.0,Northern Spain,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem
6,6,Italy,87,16.0,Sicily & Sardinia,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo
7,7,France,87,24.0,Alsace,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer,Trimbach
8,8,Germany,87,12.0,Rheinhessen,Heinz Eifel 2013 Shine Gewürztraminer (Rheinhe...,Gewürztraminer,Heinz Eifel
9,9,France,87,27.0,Alsace,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam


In [3]:
# Dropping null values from dataframe
wine_df.dropna(inplace=True)
#Deleting Unnamed column
del wine_df ["Unnamed: 0"]

In [4]:
wine_df.head()

Unnamed: 0,country,points,price,province,title,variety,winery
1,Portugal,87,15.0,Douro,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,87,14.0,Oregon,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,87,13.0,Michigan,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,87,65.0,Oregon,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
5,Spain,87,15.0,Northern Spain,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem


In [5]:
# Define a function to remove special characters
def remove_special_characters(df):
    # Using regex to replace special characters with empty string
    df = df.replace(r'[^A-Za-z0-9 ]+', '', regex=True)
    return df

# Apply the function to the dataframe
cleaned_wine_df = remove_special_characters(wine_df)

print(cleaned_wine_df)

         country  points  price        province  \
1       Portugal      87   15.0           Douro   
2             US      87   14.0          Oregon   
3             US      87   13.0        Michigan   
4             US      87   65.0          Oregon   
5          Spain      87   15.0  Northern Spain   
...          ...     ...    ...             ...   
129966   Germany      90   28.0           Mosel   
129967        US      90   75.0          Oregon   
129968    France      90   30.0          Alsace   
129969    France      90   32.0          Alsace   
129970    France      90   21.0          Alsace   

                                                    title            variety  \
1             Quinta dos Avidagos 2011 Avidagos Red Douro     Portuguese Red   
2             Rainstorm 2013 Pinot Gris Willamette Valley         Pinot Gris   
3       St Julian 2013 Reserve Late Harvest Riesling L...           Riesling   
4       Sweet Cheeks 2012 Vintners Reserve Wild Child ...         P

In [6]:
# Renaming "province" column to "region"
cleaned_wine_df = cleaned_wine_df.rename(columns={
    "province": "region"})

In [7]:
cleaned_wine_df.head()

Unnamed: 0,country,points,price,region,title,variety,winery
1,Portugal,87,15.0,Douro,Quinta dos Avidagos 2011 Avidagos Red Douro,Portuguese Red,Quinta dos Avidagos
2,US,87,14.0,Oregon,Rainstorm 2013 Pinot Gris Willamette Valley,Pinot Gris,Rainstorm
3,US,87,13.0,Michigan,St Julian 2013 Reserve Late Harvest Riesling L...,Riesling,St Julian
4,US,87,65.0,Oregon,Sweet Cheeks 2012 Vintners Reserve Wild Child ...,Pinot Noir,Sweet Cheeks
5,Spain,87,15.0,Northern Spain,Tandem 2011 Ars In Vitro TempranilloMerlot Nav...,TempranilloMerlot,Tandem


In [8]:
# Create a unique regionID for each unique region, and a wineID for each wine title
cleaned_wine_df['regionID'] = cleaned_wine_df['region'].astype('category').cat.codes
cleaned_wine_df['wineID'] = cleaned_wine_df['title'].astype('category').cat.codes

In [9]:
cleaned_wine_df.head()

Unnamed: 0,country,points,price,region,title,variety,winery,regionID,wineID
1,Portugal,87,15.0,Douro,Quinta dos Avidagos 2011 Avidagos Red Douro,Portuguese Red,Quinta dos Avidagos,108,82440
2,US,87,14.0,Oregon,Rainstorm 2013 Pinot Gris Willamette Valley,Pinot Gris,Rainstorm,266,82837
3,US,87,13.0,Michigan,St Julian 2013 Reserve Late Harvest Riesling L...,Riesling,St Julian,216,93497
4,US,87,65.0,Oregon,Sweet Cheeks 2012 Vintners Reserve Wild Child ...,Pinot Noir,Sweet Cheeks,266,95364
5,Spain,87,15.0,Northern Spain,Tandem 2011 Ars In Vitro TempranilloMerlot Nav...,TempranilloMerlot,Tandem,260,96092


In [10]:
# Reorder the dataframe
wine_order = ['title', 'wineID', 'variety', 'winery', 'region', 'regionID', 'country', 'points', 'price']

ordered_wine_df = cleaned_wine_df[wine_order]

In [11]:
ordered_wine_df

Unnamed: 0,title,wineID,variety,winery,region,regionID,country,points,price
1,Quinta dos Avidagos 2011 Avidagos Red Douro,82440,Portuguese Red,Quinta dos Avidagos,Douro,108,Portugal,87,15.0
2,Rainstorm 2013 Pinot Gris Willamette Valley,82837,Pinot Gris,Rainstorm,Oregon,266,US,87,14.0
3,St Julian 2013 Reserve Late Harvest Riesling L...,93497,Riesling,St Julian,Michigan,216,US,87,13.0
4,Sweet Cheeks 2012 Vintners Reserve Wild Child ...,95364,Pinot Noir,Sweet Cheeks,Oregon,266,US,87,65.0
5,Tandem 2011 Ars In Vitro TempranilloMerlot Nav...,96092,TempranilloMerlot,Tandem,Northern Spain,260,Spain,87,15.0
...,...,...,...,...,...,...,...,...,...
129966,Dr H Thanisch Erben MllerBurggraef 2013 Braune...,36491,Riesling,Dr H Thanisch Erben MllerBurggraef,Mosel,230,Germany,90,28.0
129967,Citation 2004 Pinot Noir Oregon,25847,Pinot Noir,Citation,Oregon,266,US,90,75.0
129968,Domaine Gresser 2013 Kritt Gewurztraminer Alsace,33057,Gewrztraminer,Domaine Gresser,Alsace,11,France,90,30.0
129969,Domaine Marcel Deiss 2012 Pinot Gris Alsace,33498,Pinot Gris,Domaine Marcel Deiss,Alsace,11,France,90,32.0


In [12]:
# Checking the data types
ordered_wine_df.dtypes

title        object
wineID        int32
variety      object
winery       object
region       object
regionID      int16
country      object
points        int64
price       float64
dtype: object

In [13]:
# Convert 'regionID', 'wineID and 'price' columns to int64 to match SQL ERD
ordered_wine_df['regionID'] = ordered_wine_df['regionID'].astype('int64')
ordered_wine_df['price'] = ordered_wine_df['price'].astype('int64')
ordered_wine_df['wineID'] = ordered_wine_df['wineID'].astype('int64')

In [14]:
ordered_wine_df.dtypes

title       object
wineID       int64
variety     object
winery      object
region      object
regionID     int64
country     object
points       int64
price        int64
dtype: object

In [15]:
ordered_wine_df.head()

Unnamed: 0,title,wineID,variety,winery,region,regionID,country,points,price
1,Quinta dos Avidagos 2011 Avidagos Red Douro,82440,Portuguese Red,Quinta dos Avidagos,Douro,108,Portugal,87,15
2,Rainstorm 2013 Pinot Gris Willamette Valley,82837,Pinot Gris,Rainstorm,Oregon,266,US,87,14
3,St Julian 2013 Reserve Late Harvest Riesling L...,93497,Riesling,St Julian,Michigan,216,US,87,13
4,Sweet Cheeks 2012 Vintners Reserve Wild Child ...,95364,Pinot Noir,Sweet Cheeks,Oregon,266,US,87,65
5,Tandem 2011 Ars In Vitro TempranilloMerlot Nav...,96092,TempranilloMerlot,Tandem,Northern Spain,260,Spain,87,15


In [16]:
city_df = pd.read_csv(city_data)
city_df.head(10)

Unnamed: 0,City,Lat,Lng,Avg_Temp_2023,Total_Prcp_2023
0,Douro,44.3834,-78.1995,8.15,889.5
1,Oregon,44.0001,-120.5014,,
2,Michigan,44.2503,-85.5003,8.718182,751.0
3,Alsace,48.5,7.5,12.983333,535.4
4,California,38.3004,-76.5074,15.381818,780.2
5,Mosel,50.7833,12.4667,10.633333,753.8
6,Mendoza Province,-34.5,-68.5,16.658333,337.5
7,Virginia,37.5481,-77.4467,16.175,1105.1
8,Beaujolais,29.7858,-90.7681,22.072727,1172.4
9,Bordeaux,44.8404,-0.5805,15.133333,1111.4


In [28]:
# Removing null values from dataframe
city_df.dropna(inplace=True)

In [29]:
# Renaming columns in the dataframe
city_df = city_df.rename(columns={
    "City": "region",
    "Avg_Temp_2023": "average temp (celsius)",
    "Total_Prcp_2023": "total precipitation (mm)"})

In [30]:
city_df.head()

Unnamed: 0,region,Lat,Lng,average temp (celsius),total precipitation (mm),regionID
0,Douro,44.3834,-78.1995,8.15,889.5,108.0
2,Michigan,44.2503,-85.5003,8.718182,751.0,216.0
3,Alsace,48.5,7.5,12.983333,535.4,11.0
4,California,38.3004,-76.5074,15.381818,780.2,51.0
5,Mosel,50.7833,12.4667,10.633333,753.8,230.0


In [31]:
#Create a mapping of regionID to match region ID in our wine dataframe
region_to_regionID = dict(zip(cleaned_wine_df['region'], cleaned_wine_df['regionID']))

city_df['regionID'] = city_df['region'].map(region_to_regionID)

In [32]:
city_df.head()

Unnamed: 0,region,Lat,Lng,average temp (celsius),total precipitation (mm),regionID
0,Douro,44.3834,-78.1995,8.15,889.5,108
2,Michigan,44.2503,-85.5003,8.718182,751.0,216
3,Alsace,48.5,7.5,12.983333,535.4,11
4,California,38.3004,-76.5074,15.381818,780.2,51
5,Mosel,50.7833,12.4667,10.633333,753.8,230


In [33]:
# Reorder dataframe columns
city_order = ['region', 'regionID', 'Lat', 'Lng', 'average temp (celsius)', 'total precipitation (mm)']

ordered_city_df = city_df[city_order]

In [34]:
ordered_city_df.head()

Unnamed: 0,region,regionID,Lat,Lng,average temp (celsius),total precipitation (mm)
0,Douro,108,44.3834,-78.1995,8.15,889.5
2,Michigan,216,44.2503,-85.5003,8.718182,751.0
3,Alsace,11,48.5,7.5,12.983333,535.4
4,California,51,38.3004,-76.5074,15.381818,780.2
5,Mosel,230,50.7833,12.4667,10.633333,753.8


In [35]:
# Export cleaned data to Resources folder for further analysis
ordered_city_df.to_csv('Resources/CityData_cleaned.csv', index=False)
ordered_wine_df.to_csv('Resources/WineData_cleaned.csv', index=False)