# Exploratory Data Analysis of Wine Dataset

## Load and Inspect the Data

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# CSV file to load; the path is resolved relative to the notebook's working directory.
file_path = '../../preprocessed_winemag-data copy.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Show the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0,id,country,description,designation,points,price,province,title,variety,winery
0,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
1,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
2,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
3,5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem
4,6,Italy,"Here's a bright, informal red that opens with ...",Belsito,87,16.0,Sicily & Sardinia,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo


## Data Description

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,id,points,price
count,86148.0,86148.0,86148.0
mean,65008.434729,88.729907,37.556403
std,37531.085008,3.05198,36.390439
min,1.0,80.0,4.0
25%,32448.5,87.0,18.0
50%,65192.5,89.0,28.0
75%,97470.5,91.0,45.0
max,129970.0,100.0,2013.0


In [3]:
# Number of distinct values in each column.
df.nunique()

id             86148
country           41
description    79477
designation    35750
points            21
price            330
province         402
title          78719
variety          630
winery         11762
dtype: int64

In [4]:
import pycountry_convert as pc

def country_to_continent(country_name):
    """Map a country name to the name of its continent.

    Parameters
    ----------
    country_name : str
        Country name as it appears in the dataset's ``country`` column.

    Returns
    -------
    str
        The continent name, or ``'Not found'`` when the name cannot be
        resolved (including non-string input such as a missing value).
    """
    # Dataset spellings that were previously only handled after the library
    # lookup had failed; resolving them up front keeps them off the error path.
    manual_overrides = {
        'US': 'North America',
        'England': 'Europe',
    }

    # Missing values (e.g. NaN) are not country names; report them as unresolved.
    if not isinstance(country_name, str):
        return 'Not found'

    if country_name in manual_overrides:
        return manual_overrides[country_name]

    try:
        country_alpha2 = pc.country_name_to_country_alpha2(country_name)
        continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        return pc.convert_continent_code_to_continent_name(continent_code)
    except KeyError:
        # Only a failed lookup is treated as "unknown"; a bare ``except`` would
        # also hide KeyboardInterrupt, SystemExit and genuine programming errors.
        return 'Not found'

# Add a 'continent' column by passing each row's country through country_to_continent.
# Note: this mutates df in place, so earlier displays of df do not show this column.
df['continent'] = df['country'].apply(country_to_continent)



In [5]:
# Number of rows per (continent, country, province, winery) combination,
# stored in a single 'count' column; the four keys form the row MultiIndex.
treemap_levels = ['continent', 'country', 'province', 'winery']
df_treemap = df.groupby(treemap_levels).size().to_frame('count')

def df_to_nested_dict(df):
    """Fold a MultiIndex-ed count frame into nested dictionaries.

    Each row's index tuple is read as a path: every element except the last
    becomes a nested dictionary key, and the last element maps to the row's
    ``'count'`` value converted to ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index entries are tuples and which has a ``'count'`` column.

    Returns
    -------
    dict
        Nested dictionaries mirroring the index levels, with integer leaves.
    """
    tree = {}
    for path, row in df.iterrows():
        # Split the index tuple into the branch keys and the final leaf key.
        *branch_keys, leaf_key = path
        node = tree
        for key in branch_keys:
            # Descend, creating the intermediate dictionary on first visit.
            node = node.setdefault(key, {})
        node[leaf_key] = int(row['count'])
    return tree

# Convert the grouped counts into a nested dictionary.
# (Nothing is printed here; the result is only stored in nested_dict.)
nested_dict = df_to_nested_dict(df_treemap)


In [18]:
# Display the grouped counts.
# NOTE(review): the output below has continent/country/province/winery as ordinary
# columns with a 0..N row index, whereas the cell that builds df_treemap leaves them
# in a MultiIndex -- presumably a reset_index() was run in a cell that is no longer
# in the notebook. Confirm before relying on Restart & Run All.
df_treemap

Unnamed: 0,continent,country,province,winery,count
0,Africa,Morocco,Guerrouane,Bernard Magrez,3
1,Africa,Morocco,Morocco,Ouled Thaleb,5
2,Africa,Morocco,Zenata,Ouled Thaleb,8
3,Africa,South Africa,Bot River,Beaumont,1
4,Africa,South Africa,Bot River,Wildekrans,1
...,...,...,...,...,...
13404,South America,Uruguay,Uruguay,Familia Traversa,2
13405,South America,Uruguay,Uruguay,Garzón,7
13406,South America,Uruguay,Uruguay,Gimenez Mendez,1
13407,South America,Uruguay,Uruguay,Narbona,3


In [50]:
def build_hierarchy(df, min_share=0.002):
    """Build a nested ``name``/``children``/``value`` tree of winery counts.

    The tree has four levels below the root: continent -> country ->
    province -> winery. Within each province, wineries whose share of the
    province total is below ``min_share`` are merged into a single
    ``'Other wineries'`` leaf.

    Parameters
    ----------
    df : pandas.DataFrame
        Winery counts with a ``'count'`` column. The keys ``'continent'``,
        ``'country'``, ``'province'`` and ``'winery'`` may be ordinary columns
        or index levels (as produced directly by ``groupby(...).size()``).
    min_share : float, default 0.002
        Minimum fraction of a province's total count a winery needs in order
        to get its own leaf.

    Returns
    -------
    dict
        ``{'name': 'Continents', 'children': [...]}`` where every leaf is
        ``{'name': <str>, 'value': <int>}``. All values are built-in ``int``
        so the result can be passed straight to ``json.dumps``.

    Raises
    ------
    KeyError
        If a required key is neither a column nor an index level of ``df``.
    """
    key_columns = ['continent', 'country', 'province', 'winery']

    # Accept the grouped frame as-is: move any key that still lives in the
    # index into a column, so the function no longer depends on a separate
    # reset_index() having been run beforehand.
    missing = [key for key in key_columns if key not in df.columns]
    if missing:
        unavailable = [key for key in missing if key not in df.index.names]
        if unavailable:
            raise KeyError(f"missing required column(s)/index level(s): {unavailable}")
        df = df.reset_index(level=missing)

    result = {'name': 'Continents', 'children': []}
    for continent, continent_df in df.groupby('continent'):
        continent_dict = {'name': continent, 'children': []}
        for country, country_df in continent_df.groupby('country'):
            country_dict = {'name': country, 'children': []}
            for province, province_df in country_df.groupby('province'):
                total_province_count = int(province_df['count'].sum())
                province_dict = {'name': province, 'children': []}
                other_winery_count = 0

                for winery, count in zip(province_df['winery'], province_df['count']):
                    count = int(count)  # plain int keeps the tree JSON-serialisable
                    # A zero total cannot yield a meaningful share; treat it as
                    # below the threshold instead of dividing by zero.
                    share = count / total_province_count if total_province_count else 0.0
                    if share >= min_share:
                        province_dict['children'].append({'name': winery, 'value': count})
                    else:
                        other_winery_count += count

                if other_winery_count > 0:
                    province_dict['children'].append(
                        {'name': 'Other wineries', 'value': other_winery_count}
                    )

                country_dict['children'].append(province_dict)
            continent_dict['children'].append(country_dict)
        result['children'].append(continent_dict)
    return result

# Build the nested name/children/value dictionary from the grouped winery counts.
# Despite the variable name this is a Python dict; it is serialised to JSON in a later cell.
hierarchy_json = build_hierarchy(df_treemap)

In [51]:
# Inspect the generated hierarchy. The full structure is large, so the rendered
# output is long; consider previewing only one branch when sharing the notebook.
hierarchy_json

{'name': 'Continents',
 'children': [{'name': 'Africa',
   'children': [{'name': 'Morocco',
     'children': [{'name': 'Guerrouane',
       'children': [{'name': 'Bernard Magrez', 'value': 3}]},
      {'name': 'Morocco', 'children': [{'name': 'Ouled Thaleb', 'value': 5}]},
      {'name': 'Zenata', 'children': [{'name': 'Ouled Thaleb', 'value': 8}]}]},
    {'name': 'South Africa',
     'children': [{'name': 'Bot River',
       'children': [{'name': 'Beaumont', 'value': 1},
        {'name': 'Wildekrans', 'value': 1}]},
      {'name': 'Breedekloof',
       'children': [{'name': 'Lions Drift', 'value': 2}]},
      {'name': 'Cape Agulhas', 'children': [{'name': 'Lomond', 'value': 1}]},
      {'name': 'Cape Peninsula',
       'children': [{'name': 'Cape Point Vineyards', 'value': 1}]},
      {'name': 'Cederberg', 'children': [{'name': 'Cederberg', 'value': 1}]},
      {'name': 'Coastal Region',
       'children': [{'name': 'A.A. Badenhorst Family Wines', 'value': 1},
        {'name': 'Avonda

In [52]:
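# NOTE(review): dead code. This looks like an earlier, index-based draft that was
# replaced by build_hierarchy above; consider deleting this cell.
# def create_hierarchy(data):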
#     # Creating the root of the hierarchy
#     hierarchy = {"name": "Continents", "children": []}
    
#     # Grouping by the highest level: Continent
#     for continent, continent_group in data.groupby("continent"):
#         continent_dict = {"name": continent, "children": []}
        
#         # Grouping by the next level: Country
#         for country, country_group in continent_group.groupby("country"):
#             country_dict = {"name": country, "children": []}
            
#             # Grouping by the next level: Province
#             for province, province_group in country_group.groupby("province"):
#                 province_dict = {"name": province, "children": []}
                
#                 # Processing each Winery
#                 for index, row in province_group.iterrows():
                    
#                     winery_dict = {"name": index[2], "value": int(row["count"])}
#                     province_dict["children"].append(winery_dict)
#                     print(winery_dict)
                
#                 country_dict["children"].append(province_dict)
            
#             continent_dict["children"].append(country_dict)

#         hierarchy["children"].append(continent_dict)
    
#     return hierarchy

# # Generate the hierarchical JSON structure
# hierarchy_json = create_hierarchy(df_treemap)

In [53]:
import json

# Destination of the exported hierarchy, relative to the working directory.
file_path = 'winery_hierarchy.json'

# Serialise up front: if the dictionary cannot be encoded, the error is raised
# before the output file is opened for writing.
json_data = json.dumps(hierarchy_json, indent=4)

# Persist the JSON text.
with open(file_path, mode='w') as file:
    file.write(json_data)

print(f"Data saved to {file_path}")


Data saved to winery_hierarchy.json
