# Fun with Lego

In this folder there is a CSV file with all Lego sets from 1950 to 2017. Your homework is to use this file to answer the questions below. Just as for the in-class exercise, you should be able to solve everything without any non-standard libraries (except for the optional matplotlib exercise), but you are more than welcome to use any other libraries.

Answer the following questions and put your answer in a dictionary, with the first word of each line (e.g. 'all_pieces' or 'year_most') as the key and your answer as the value. Your homework must also include the code that you used to find the answers.

- 'all_pieces': If you had one of each of the sets, how many pieces of Lego would you have then?
- 'year_most': In which year was the highest number of sets released?
- 'average_pieces': What is the average number of pieces in all sets, rounded to 1 decimal?
- 'most_used_word': Which word is used most often in the names of the sets?

You can find more information about the dataset here:
https://www.kaggle.com/rtatman/lego-database

Optional matplotlib exercise:
- Plot the years from 1950-2017 on the x-axis and, for each year, the median number of pieces per set released in that year on the y-axis.

Optional themes exercises (for these, you will also need the themes.csv file):
Each set is part of a theme, and a theme can itself be part of a parent theme, which can in turn have a parent of its own. For example, the set 60141-1 is part of theme 80 (Police), which is part of theme 67 (Classic Town), which in turn is part of theme 50 (Town). Theme 50, however, is a top-level (parent) theme, and therefore there are no other themes 'above' it.
- Create a dictionary with all parent themes as keys, and a list of all their sub-themes as values. Here, you only need to distinguish between a parent theme and its sub-themes, regardless of how deeply they are nested. Thus, theme 50 would be a parent theme, and both theme 80 and theme 67 should be listed on the same level.
- Create a dictionary with all parent themes as keys and, as values, the number of sets that are part of each of them (directly or through a sub-theme). Here, you have to make sure that each set is only counted once!





In [None]:
import csv

file_path = 'legosets.csv'

# newline='' is what the csv module asks for when opening a file for reading,
# so that line breaks inside quoted fields reach the reader untouched.
with open(file_path, newline='') as file:
    reader = csv.reader(file)
    print(reader) # Note what this prints!
    # The first row is consumed as the header; every remaining row is data.
    header = next(reader)
    legosets = list(reader)

In [None]:
header

In [None]:
# Create a dictionary for our solutions.
# The cells below fill it in, using the first word of each question
# (e.g. 'all_pieces' or 'year_most') as the key.

solutions = {}

In [None]:
# Total number of pieces over all sets

# The fifth column of each row is read from the CSV as text, so it is
# converted to int before being added up.
pieces = sum(int(row[4]) for row in legosets)

solutions['all_pieces'] = pieces
solutions['all_pieces']

In [None]:
# Year with most releases
# I show 2 ways of doing this

# No imports:

# One entry per set: the value of its third column (the year, still a string).
years_all_sets = [row[2] for row in legosets]

print(years_all_sets)
print(max(years_all_sets)) # This just gives us the highest number, which is 2017!

# Tally the sets per year in a single pass, instead of calling
# years_all_sets.count() once for every distinct year.
sets_per_year = {}
for year in years_all_sets:
    sets_per_year[year] = sets_per_year.get(year, 0) + 1

max_year = None
max_count = 0

# Dicts keep insertion order, so if two years tie, the one that shows up first
# in the data wins (iterating over a set would give an arbitrary order).
for year, count in sets_per_year.items():
    if count > max_count:
        max_count = count
        max_year = year

print(max_count)
print(max_year)

# (You could also iterate with a for-loop with range(1950, 2017+1), but then
# what happens if we update the dataset with recent years?)

In [None]:
# with collections.Counter

from collections import Counter

# Counter maps every year to the number of times it occurs in the list.
set_counter = Counter(years_all_sets)

print('All years:')
print(set_counter)
print('###')

# most_common(1) gives a one-element list holding a (year, count) pair.
top_year = set_counter.most_common(1)
print(top_year)

# The year is still a string at this point, so convert it before storing.
solutions['year_most'] = int(top_year[0][0])


print('###')
print(f'Year with most releases: {solutions["year_most"]}')


In [None]:
solutions

In [None]:
# Average number of pieces

# Mean = total number of pieces / number of sets, rounded to one decimal.
total_pieces = solutions['all_pieces']
avg_pieces = round(total_pieces / len(legosets), 1)

solutions['average_pieces'] = avg_pieces

avg_pieces

In [None]:
# Most used word

# One way of doing it

# Glue all set names (second column) into one long string, each name followed
# by a space. str.join builds the result in one go instead of copying the
# growing string again on every `+=`.
all_words = ''.join(row[1] + ' ' for row in legosets)

print(len(all_words))

# split() without arguments splits on any run of whitespace.
word_list = all_words.split()

# Now we can use the same approach as before

word_counter = Counter(word_list)

print(word_counter.most_common(10))

two_most_common = word_counter.most_common(2)

# NOTE(review): this stores the runner-up, not the top entry - presumably
# because the most frequent token printed above is not a real word.
# Confirm against the output before relying on it.
solutions['most_used_word'] = two_most_common[1][0]

solutions

In [None]:
# matplotlib exercise:

# Median of pieces/year:



In [None]:
import statistics

In [None]:
# Create year:number of pieces/set dictionary
# Group the piece counts (fifth column) by year (third column) in one pass
# over the data, rather than re-scanning every set once per distinct year.
pieces_by_year = {}
for row in legosets:
    pieces_by_year.setdefault(int(row[2]), []).append(int(row[4]))

# Map each year (as int) to the median piece count of that year's sets.
year_pieces_dict = {
    year: statistics.median(counts) for year, counts in pieces_by_year.items()
}





In [None]:
# Print the medians in ascending order of year.
for year, median_pieces in sorted(year_pieces_dict.items()):
    print(f'Year: {year}, pieces: {median_pieces}')

In [None]:
import matplotlib.pyplot as plt

# x: the years in ascending order; y: the median stored for each of them.
xpoints = sorted(year_pieces_dict)
ypoints = [year_pieces_dict[year] for year in xpoints]

plt.plot(xpoints, ypoints)
plt.xlabel('Year')
plt.ylabel('Median number of pieces')
# Fixed y-ticks every 10 pieces, from 0 up to 120.
plt.yticks(range(0, 130, 10))
plt.show()

In [None]:
# Dict with parent themes:

# Create a dictionary with all parent themes as keys, and a list of all their sub-themes as values. 
# Here, you should only discern between a parent theme and any subtheme. Thus, theme 50 would be a parent theme, 
# and both theme 80 and 67 should be listed on the same level.

themes_file = 'themes.csv'

# newline='' is what the csv module asks for when opening a file for reading,
# so that line breaks inside quoted fields reach the reader untouched.
with open(themes_file, newline='') as file:
    reader = csv.reader(file)
    # NOTE: this rebinds `header`, which until now held the header of the sets file.
    header = next(reader)
    themes = list(reader)

header

In [None]:
# A row whose third column is empty has no parent, i.e. it is a top-level
# theme. Keep the first two columns of each such row as a two-item list.
parent_themes = [[row[0], row[1]] for row in themes if not row[2]]

parent_themes

In [None]:
def get_child_themes(parent_id):
    """Collect the second-column values of all rows below ``parent_id``.

    Walks the module-level ``themes`` rows depth-first: every row whose third
    column equals the id currently being looked at is recorded, and its own
    descendants are visited before the scan moves on to the next row.
    Progress messages are printed along the way.

    Returns a flat list, in visiting order, without any nesting.
    """
    names = []

    def descend(current_id):
        print(f'Function called with theme id {current_id}')
        for theme_row in themes:
            if theme_row[2] != current_id:
                continue
            print(f'Theme id: {current_id}, row with theme: {theme_row[0]}, {theme_row[1]}')
            names.append(theme_row[1])
            descend(theme_row[0])
            print('reached the end')

    descend(parent_id)
    return names

In [None]:
# Filled in below: for each parent theme, the list returned by get_child_themes.
all_subthemes = {}

In [None]:
# One-element list with just the first parent theme; not used by the loop
# below (presumably left over from trying the code on a single theme).
test_themes = [parent_themes[0]]
# Each entry of parent_themes is a two-item list: first and second column.
for parent_id, parent_name in parent_themes:
    all_subthemes[parent_name] = get_child_themes(parent_id)
    
    
    

In [None]:
all_subthemes

In [None]:
# Get number of sets of each main theme:

# Helper functions:

# Wrong solution - returns first instance of a word (take a look at 'Construction')
def get_theme_ids(theme):
    """Look up ids by theme NAME (kept as an example of what goes wrong).

    The lookup returns the first column of the first row in ``themes`` whose
    second column equals the name, so a name that occurs more than once
    always resolves to the same row.

    Returns a list whose first item is the id found for ``theme``, followed by
    one ``[name, id]`` pair per entry of ``all_subthemes[theme]``. An id is
    ``None`` when no row carries that name.
    """
    def first_id_for(name):
        return next((row[0] for row in themes if row[1] == name), None)

    ids = [first_id_for(theme)]
    ids.extend([child, first_id_for(child)] for child in all_subthemes[theme])
    return ids

# Show what the lookup returns for every key of all_subthemes.
for parent_name in all_subthemes:
    looked_up = get_theme_ids(parent_name)
    print(looked_up)
    print('\n\n###')

In [None]:
# Modified implementation of the recursive function used above:
def get_theme_ids(theme):
    """Return the id of parent theme ``theme`` followed by all ids below it.

    ``theme`` is matched against the second item of each ``parent_themes``
    entry; the first item of the matching entry is the starting id. The
    module-level ``themes`` rows are then walked depth-first by id (first
    column) and parent id (third column), so names never enter the lookup.

    Returns a flat list of ids: the parent first, then every descendant in
    the order a depth-first walk over the rows reaches them.

    Raises ValueError if ``theme`` is not the name of any entry in
    ``parent_themes``.
    """
    root_id = next((entry[0] for entry in parent_themes if entry[1] == theme), None)
    if root_id is None:
        raise ValueError(f'Unknown parent theme: {theme!r}')

    # Index the rows once by parent id, so the walk does not have to re-scan
    # the whole table for every theme it visits.
    children = {}
    for row in themes:
        children.setdefault(row[2], []).append(row[0])

    ids = []
    seen = set()
    stack = [root_id]
    while stack:
        current = stack.pop()
        # Guards against looping forever if the parent links ever form a cycle.
        if current in seen:
            continue
        seen.add(current)
        ids.append(current)
        # Pushed in reverse so the children are popped in their row order.
        stack.extend(reversed(children.get(current, [])))
    return ids

In [None]:
# Print every key of all_subthemes next to the ids found for it.
for parent_name in all_subthemes:
    ids_for_parent = get_theme_ids(parent_name)
    print(parent_name, ids_for_parent)
    

In [None]:
# From here, the code is the same:

In [None]:
def set_is_in_subtheme(legoset, theme_ids):
    """Tell whether the fourth column of ``legoset`` equals any of ``theme_ids``.

    ``legoset`` is one row of the sets data and ``theme_ids`` an iterable of
    ids to compare against. Returns True on the first match, otherwise False.
    """
    return any(theme_id == legoset[3] for theme_id in theme_ids)
        

In [None]:
# Parent theme name -> number of sets that belong to it or to any of its
# sub-themes.
theme_nums = {}
# Running total over all parent themes.
# NOTE: this rebinds `set_counter`, which earlier held the Counter of years.
set_counter = 0

for theme in all_subthemes:
    # A set gives constant-time membership tests. Every set row is compared
    # through its single fourth-column value, so it is counted at most once
    # per parent theme.
    theme_ids = set(get_theme_ids(theme))
    # NOTE: despite its name, num_sets is the list of matching rows.
    num_sets = [legoset for legoset in legosets if legoset[3] in theme_ids]
    theme_nums[theme] = len(num_sets)
    set_counter += len(num_sets)
    print(theme)
    print(f'Number of sets: {theme_nums[theme]}')
    print(num_sets)
    print('\n\n\n###')

# If every set was counted exactly once, this equals len(legosets).
print(f'Total number of sets counted: {set_counter}')

theme_nums
    