**Objective: Calculate Chocolate Ratings based on Broad Bean Origin**

I wanted to try two different methods to meet the objective: (1) using predominantly lists, and (2) using predominantly dictionaries.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Method 1 - Solving using lists:**

In [None]:
# Load the chocolate-bar ratings CSV into a DataFrame
ratings_csv = '../input/chocolate-bar-ratings/flavors_of_cacao.csv'
df = pd.read_csv(ratings_csv)

# Show the dataset exactly as loaded (the notebook renders the last expression)
df

In [None]:
# Quick check: pull out only the rows whose bean origin is Cuba

# Load the dataset
df = pd.read_csv('../input/chocolate-bar-ratings/flavors_of_cacao.csv')
# Boolean mask that is True where the Broad Bean Origin column equals 'Cuba'
is_cuba = df['Broad Bean\nOrigin'].eq('Cuba')
df_cuba = df.loc[is_cuba]
# Display the filtered frame
df_cuba

In [None]:
## Importing and cleaning dataset

import re

df = pd.read_csv('../input/chocolate-bar-ratings/flavors_of_cacao.csv')
remove = [', ', ',', '. ', '&', '/', '\xa0', 'Â', 'and']

# Drop every row whose Broad Bean Origin contains one of the markers in
# `remove` (case-insensitive; rows with a missing origin are kept, na=False).
# Each marker is escaped so it is matched literally: as a raw regex, '. '
# means "any character followed by a space" and would also discard ordinary
# multi-word names such as 'Costa Rica'. 'and' is matched only as a whole
# word so that names merely containing those letters (e.g. 'Uganda') are kept.
blend_pattern = '|'.join(
    r'\band\b' if marker == 'and' else re.escape(marker) for marker in remove
)
df = df[~df['Broad Bean\nOrigin'].str.contains(blend_pattern, flags=re.I, regex=True, na=False)]


# General Overview:

# Distinct rating values left in the cleaned df
unique_ratings = df['Rating'].unique()
# Count the total number of unique ratings
count_rating = len(unique_ratings)
print('Count of Rating:', count_rating)
# Print out all the unique ratings across df
print(unique_ratings)

In [None]:
from collections import Counter

def mp(country, df=None):
    """
    Sum up the ratings of all chocolate bars from one bean origin.

    Parameters:
    country -- value to match exactly against the 'Broad Bean\\nOrigin' column
    df      -- optional DataFrame holding 'Broad Bean\\nOrigin' and 'Rating'
               columns; when omitted, the ratings CSV is read from disk, so
               callers looping over many countries can load it once and pass
               it in instead of re-reading the file on every call

    Returns a tuple (total_weight, total_count):
    total_weight -- sum over each distinct rating of rating * occurrences
    total_count  -- number of rows for the country
    Both are 0 when no row matches the country.
    """
    if df is None:
        df = pd.read_csv('../input/chocolate-bar-ratings/flavors_of_cacao.csv')

    ratings = df.loc[df['Broad Bean\nOrigin'] == country, 'Rating']

    # Counter keeps first-seen order, so the weights are accumulated in the
    # order in which each distinct rating first appears in the data.
    rating_counts = Counter(ratings.tolist())

    total_weight = 0
    total_count = 0
    for rating, occurrences in rating_counts.items():
        total_weight += occurrences * rating
        total_count += occurrences
    return (total_weight, total_count)

countries = ['India', 'Indonesia', 'Ghana', 'Peru', 'Mexico']

# For each origin: print the summed rating weight, the number of ratings,
# and the resulting average rounded to 2 decimals, followed by a blank gap.
for country in countries:
    total_weight, total_count = mp(country)
    print(f'Total Weight: {total_weight}')
    print(f'Total Count: {total_count}')
    mean_rating = round(total_weight / total_count, 2)
    print(f'Average Chocolate Rating for {country} : {mean_rating}')
    print('\n')

**Method 2 - Solving using a dictionary:**

In [None]:
from collections import defaultdict
from collections import Counter
import re

### Initialize dictionary to store the data

# Importing dataset
df2 = pd.read_csv('../input/chocolate-bar-ratings/flavors_of_cacao.csv')

# Cleaning
remove = [', ', ',', '. ', '&', '/', '\xa0', 'Â', 'and']

# Drop every row whose Broad Bean Origin contains one of the markers in
# `remove` (case-insensitive; rows with a missing origin are kept, na=False).
# Each marker is escaped so it is matched literally: as a raw regex, '. '
# means "any character followed by a space" and would also discard ordinary
# multi-word names such as 'Costa Rica'. 'and' is matched only as a whole
# word so that names merely containing those letters (e.g. 'Uganda') are kept.
blend_pattern = '|'.join(
    r'\band\b' if marker == 'and' else re.escape(marker) for marker in remove
)
df2 = df2[~df2['Broad Bean\nOrigin'].str.contains(blend_pattern, flags=re.I, regex=True, na=False)]

# Initialize dictionary to store ratings (key = origin, value = list of ratings)
# defaultdict automatically creates entries that don't exist when they are needed
# and initializes them with a default value; in this case, an empty list
# Ref: https://stackoverflow.com/a/64893091/14656169
data = defaultdict(list)

# For each row, append the rating to the list stored under its origin.
# The two columns are read by name rather than through the positional
# itertuples attribute `_9`, which silently breaks if the column order changes.
for origin, row_rate in zip(df2['Broad Bean\nOrigin'], df2['Rating']):
    data[origin].append(row_rate)

### Calculate Ratings (initializing for one country)

# Ratings collected for Cuba; their number is the divisor of the average
cuba_ratings = data['Cuba']
total_count = len(cuba_ratings)
print('|Rate |  Rate#  |Rate_Weight')

# Tally how many times each distinct rating value occurs
c = Counter(cuba_ratings)

# Accumulate rating * occurrences for every distinct rating, printing one
# table line per rating value
rate_weight = 0
total_rate_weight = 0
for key, occurrences in c.items():
    rate_weight = key * occurrences
    total_rate_weight += rate_weight
    print(f'|{key}    |  {occurrences}     | {rate_weight}')

average_rate = total_rate_weight / total_count
print(f'Total Weight: {total_rate_weight}')
print(f'Total Count: {total_count}')
print(f'Average Rate for Cuba: {average_rate}')

Next, I will try to put the calculation in a function.

In [None]:
from collections import defaultdict
from collections import Counter
import re

df2 = pd.read_csv('../input/chocolate-bar-ratings/flavors_of_cacao.csv')
remove = [', ', ',', '. ', '&', '/', '\xa0', 'Â', 'and']

# Drop every row whose Broad Bean Origin contains one of the markers in
# `remove` (case-insensitive; rows with a missing origin are kept, na=False).
# Each marker is escaped so it is matched literally: as a raw regex, '. '
# means "any character followed by a space" and would also discard ordinary
# multi-word names such as 'Costa Rica'. 'and' is matched only as a whole
# word so that names merely containing those letters (e.g. 'Uganda') are kept.
blend_pattern = '|'.join(
    r'\band\b' if marker == 'and' else re.escape(marker) for marker in remove
)
df2 = df2[~df2['Broad Bean\nOrigin'].str.contains(blend_pattern, flags=re.I, regex=True, na=False)]

def rating_calc(country, data):
    """
    Compute the average rating of one country, rounded to 2 decimals.

    country -- key to look up in data
    data    -- mapping from country to the list of its ratings

    How the average is built:
    Rating Weight  = sum of (distinct rating value * times it occurs)
    Total Count    = number of ratings stored for the country
    Average Rating = Rating Weight / Total Count

    returns Average Rating
    """
    ratings = data[country]

    # Occurrences of each distinct rating value
    occurrences_by_rating = Counter(ratings)

    weighted_sum = sum(
        rating * occurrences
        for rating, occurrences in occurrences_by_rating.items()
    )
    return round(weighted_sum / len(ratings), 2)

data = defaultdict(list)

# For each row, append the rating to the list stored under its origin.
# The two columns are read by name rather than through the positional
# itertuples attribute `_9`, which silently breaks if the column order changes.
for origin, row_rate in zip(df2['Broad Bean\nOrigin'], df2['Rating']):
    data[origin].append(row_rate)

# Map each origin to its average rating. The average is stored as a plain
# number (not wrapped in a one-element list) so that code consuming ave_dict
# compares and prints numbers, e.g. "rated 3.5" rather than "rated [3.5]".
ave_dict = {}
for bean_ori in data:
    rate_ave = rating_calc(bean_ori, data)
    ave_dict[bean_ori] = rate_ave
    print(f'Average Rating for {bean_ori}: {rate_ave}')

In [None]:
# https://stackoverflow.com/questions/8023306/get-key-by-value-in-dictionary (reference for top_country)
# Finding top Rated country

# Pick the key whose stored average is the largest; on ties the key that
# appears first in ave_dict wins
top_country = max(ave_dict, key=ave_dict.get)
max_rate = ave_dict[top_country]

print(f'Top Rated Chocolate Bean Origin: {top_country} rated {max_rate}')

In [None]:
### To get Top 5 Rated countries

from operator import itemgetter

# Collect an (origin, average rating) pair for every entry of ave_dict
rate_list = [(origin, ave_dict[origin]) for origin in ave_dict]

# Order a copy of the pairs by their second element (the average), lowest first
sort_ave = list(rate_list)
sort_ave.sort(key=itemgetter(1))

# Printing Top 5 countries: the last five pairs of the ascending list
print(sort_ave[-5:])

Conclusion:

This is my first ever kernel! :D
It is bare, it is basic... Bird by bird as Anne Lamott puts it :')

Some improvement ideas I can think of for now:
* Perhaps add a user input for printing out the top N countries
* Visualization to look at the rank
* I only used 2 columns in this kernel: Broad Bean Origin and Rating. Perhaps I could look at whether there is any correlation between ratings and other attributes

Some of the comments contain notes and references for my own understanding and future revision.
Always looking to improve, so if there's anything I should do or could have written better, do let me know :)