This is my solution for the project 'lego-analysis'.

#### Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

#### Read the necessary files

In [None]:
# load the LEGO sets table (path is relative to the current working directory)
df = pd.read_csv("datasets/lego_sets.csv")

#### Gathering some overall info about the data in the DataFrame

In [None]:
# print the column names, non-null counts and dtypes
df.info()

#### Cleaning up the data (according to the requirements)

Okay, let's drop the rows whose `set_num` is missing (NaN).

In [None]:
# keep only the rows that have a `set_num`
df = df.dropna(subset=["set_num"])

NOTE: There are still missing values in the 'num_parts' column but since it is not critical to our analysis we can leave it as it is.

### Question 1: What percentage of all licensed sets ever released were Star Wars themed?

- So let's take a look at the `theme_name` and `parent_theme` columns

In [None]:
# df[df['parent_theme'] == 'Star Wars']

Now, let's read out the **parent_themes.csv** file.

In [None]:
# let's call our new DataFrame 'license_df'; it is joined to the main
# DataFrame on its `name` column and presumably supplies the `is_licensed`
# flag used for filtering (confirm against the CSV header)
license_df = pd.read_csv("datasets/parent_themes.csv")

Now let's just merge the `license_df` DF with our main DF

In [None]:
# using the `.merge()` method: a row is kept when the set's `parent_theme`
# equals a theme's `name` (default inner join)
merged = df.merge(license_df, left_on="parent_theme", right_on="name")
# both frames have a `name` column, so pandas suffixes them `_x` / `_y`;
# `name_y` only repeats `parent_theme`, so we do not need it
merged.drop(columns="name_y", inplace=True)
# and we reset the `name_x` column (the set's own name) to `name`
merged.rename(columns={"name_x": "name"}, inplace=True)
# to keep only the licensed sets; `.copy()` makes `df` an independent frame,
# so adding columns to it later does not trigger SettingWithCopyWarning
df = merged[merged["is_licensed"]].copy()

Well, now let's get that percentage!

In [None]:
# show the DataFrame that now holds only the licensed sets
df

In [None]:
# `df` holds only licensed sets at this point, so its row count is the
# total number of licensed sets
all_sets = len(df)

In [None]:
# rows whose parent theme is exactly "Star Wars", and how many there are
star_wars_sets = df.loc[df["parent_theme"].eq("Star Wars")]
sw_count = len(star_wars_sets)

In [None]:
# distinct parent themes present in `df`
sets_name = set(df["parent_theme"])

In [None]:
# number of distinct parent themes collected in `sets_name`
print(len(sets_name))

In [None]:
# share of Star Wars sets among all sets in `df`, as a percentage;
# they need the result as an integer, so `int()` truncates the fraction
sw_share = sw_count / all_sets
the_force = int(sw_share * 100)
the_force

### Question 2: In which year was Star Wars **not** the most popular licensed theme (in terms of number of sets released that year)?

In [None]:
# df.groupby('year')['parent_theme'].describe()

Let's create a new column 'Count'

In [None]:
# Give every set a weight of 1 so that summing `Count` per group yields the
# number of sets in that group. `assign` returns a new frame instead of
# writing into `df` (a filtered selection of `merged`), which avoids pandas'
# SettingWithCopyWarning.
df = df.assign(Count=1)

In [None]:
# Number of sets per (year, parent_theme). Only `Count` is aggregated:
# summing every column would also try to "add up" the text columns, which
# concatenates strings or raises, depending on the pandas version.
count_df = df.groupby(["year", "parent_theme"])["Count"].sum().reset_index()

Okay, now we have the number of sets released per theme for each year. Let's find out how to pick the theme with the maximum count in each year.

In [None]:
# show the table aggregated by year and parent theme
count_df

We can use either one of the following methods.

In [None]:
# count_df.groupby(['year'], sort=False)['Count'].max()
# or
# idx = count_df.groupby(['year'], sort=False)['Count'].transform(max) == count_df['Count']
# count_df[idx]
# or
# count_df.sort_values('Count', ascending=False).drop_duplicates(['year'])
# or
# count_df.sort_values('Count', ascending=False).drop_duplicates(['year'], keep='first').reset_index()
# or
# count_df.sort_values('Count').groupby(['year']).tail(1).reset_index()

And finally save the result in a variable named `new_era`.

In [None]:
# NOTE(review): hard-coded answer, presumably read off the per-year maxima
# of `count_df` by hand; re-check it if the dataset changes.
new_era = 2017

Don't mind the following, as it is mind blowing!

In [None]:
# Flatten the aggregated table into (year, theme, count) tuples.
years = count_df["year"].to_list()
sets = count_df["parent_theme"].to_list()
count = count_df["Count"].to_list()
listed = list(zip(years, sets, count))

# For every distinct year keep the tuple with the highest count. The result
# is bound to a name and printed; as a bare expression in the middle of the
# cell it was computed and then silently thrown away.
top_per_year = [
    max((g for g in listed if g[0] == year), key=lambda k: k[2])
    for year in sorted({g[0] for g in listed})
]

# breaking down
# sorted({g[0] for g in listed}) -> the distinct years of the original list, in ascending order
# g for g in listed if g[0] == year -> loop over the original list and keep only the tuples of that year
# max(..., key=lambda k: k[2]) -> return the tuple with the largest count (third element) among them
print(top_per_year)
print(listed)

In [None]:
# def sorter(dataset):
#     output = []
#     all_themes = dict()
#     for year, theme, count in dataset:
#         if year not in all_themes:
#             all_themes[year] = []
#         all_themes[year].append((theme, count))
#     for year in all_themes:
#         biggest = 1997, "theme", 0
#         for theme, count in all_themes[year]:
#             while count > biggest[-1]:
#                 biggest = year, theme, count
#         output.append(biggest)
#     return output

In [None]:
# from itertools import groupby
# from operator import itemgetter

# res = [max(group, key=itemgetter(2)) for _, group in groupby(listed, itemgetter(0))]
# print(res)

# # or

# res = []
# for _, group in groupby(listed, itemgetter(0)):
#     res.append(max(group, key=itemgetter(2)))
# print(res)

In [None]:
# Bucket the (year, theme, count) tuples by year, then take the tuple with
# the largest count from every bucket.
dict_listed = {}
for entry in listed:
    dict_listed.setdefault(entry[0], []).append(entry)
resultat = [max(group, key=lambda rec: rec[2]) for group in dict_listed.values()]
print(resultat)

Some additional analysis...

#### Question: How many sets were produced each year? In which year were the most sets produced?

Remember our `merged` DF? Well, that's what we're going to use.

In [None]:
# first let's create a `count` column: every row (one set) gets the value 1,
# so that summing `count` over a group gives the number of sets in it
merged["count"] = 1

In [None]:
# Number of sets per year as a two-column frame (`year`, `count`). Only the
# `count` column is aggregated: summing every column would also try to add up
# the text columns, which concatenates strings or raises, depending on the
# pandas version.
all_sets_per_year = merged.groupby("year", as_index=False)["count"].sum()
# for idx, row in all_sets_per_year.iterrows():
#     print(f'{row["year"]} -> {row["count"]}')
all_sets_per_year

In [None]:
# Take both plot axes from the same aggregated frame so they always line up,
# as plain Python lists (no row-by-row `iterrows`, no second groupby pass).
years = all_sets_per_year["year"].to_list()
counts = all_sets_per_year["count"].to_list()
print(years)

In [None]:
# Line chart of the number of sets per year, using the "bmh" style.
plt.figure(figsize=(12, 5))
plt.style.use("bmh")

plt.plot(years, counts, "bx-")
plt.title("LEGO sets released by year (1950-2017)", pad=10)
# put a tick on every year divisible by 3, plus 2017
tick_years = [y for y in years if not y % 3]
tick_years.append(2017)
plt.xticks(tick_years, rotation=90)

plt.show()

In [None]:
# Bar chart of the same per-year totals.
plt.figure(figsize=(12, 5))
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases removed the old names, so use the new name when it exists
# and fall back to the old one on older Matplotlib versions.
bar_style = (
    "seaborn-v0_8-white"
    if "seaborn-v0_8-white" in plt.style.available
    else "seaborn-white"
)
plt.style.use(bar_style)
plt.bar(years, counts)
plt.show()