In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<img src="https://i.imgur.com/49FNOHj.jpg">

### As part of the DS Coding class, this analysis will answer the following questions:

What is the most enormous LEGO set ever created and how many parts did it have?

In which year were the first LEGO sets released and how many sets did the company sell when it first launched?

Which LEGO theme has the most sets? Is it Harry Potter, Ninjago, Friends or something else?

When did the LEGO company really take-off based on its product offering? How many themes and sets did it release every year?

Did LEGO sets grow in size and complexity over time? Do older LEGO sets tend to have more or fewer parts than newer sets?



Data from public lego-database

# Data Source
ReBrickable has compiled data on legos

Importing Data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

In [None]:
import os
# Print the full path of every file under /kaggle/input.
# NOTE(review): this repeats the listing loop from the notebook's first cell.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the colour and theme lookup tables.
# NOTE(review): colors.csv is read from 'lego-database' while themes.csv comes
# from 'examplelegodata' -- confirm the two dataset folders are intentional.
colors_csv = '/kaggle/input/lego-database/colors.csv'
themes_csv = '/kaggle/input/examplelegodata/themes.csv'
color_df = pd.read_csv(colors_csv)
themes_df = pd.read_csv(themes_csv)

In [None]:
# Preview the first rows of the colours table.
color_df.head()

In [None]:
# (rows, columns) of the colours table.
color_df.shape

In [None]:
# Column labels available in the colours table.
color_df.columns

In [None]:
# Number of distinct, non-null colour names.
distinct_names = color_df['name'].dropna().unique()
uniquecolors = len(distinct_names)
print(uniquecolors)

In [None]:
# Count the non-null entries of every column within each is_trans group.
trans_groups = color_df.groupby(by='is_trans')
trans_groups.count()

In [None]:
# Same breakdown as a single Series: how many colours fall in each is_trans value.
color_df['is_trans'].value_counts()

In [None]:
# Load the sets table.
sets_csv = '/kaggle/input/examplelegodata/sets.csv'
sets_df = pd.read_csv(sets_csv)

In [None]:
# (rows, columns) of the sets table.
sets_df.shape

In [None]:
# Column labels available in the sets table.
sets_df.columns

In [None]:
# Index label of the first row holding the smallest 'year' value.
sets_df['year'].idxmin()

In [None]:
# Show the earliest set. idxmin() returns an index *label*, so look the row up
# with .loc and derive it from the data instead of hardcoding position 9521,
# which silently points at the wrong row if the CSV is refreshed or reordered.
sets_df.loc[sets_df['year'].idxmin()]

Which year did Lego start production and what was the product?

In [None]:
# Order the sets chronologically and show the earliest ones.
sets_df.sort_values(by='year', ascending=True).head()

How many products were in production in the first year?

In [None]:
# All sets released in the first year present in the data. The year is derived
# from the data rather than hardcoded, so the cell stays correct if the
# dataset changes.
first_year = sets_df['year'].min()
sets_df[sets_df['year'] == first_year]

Which product has maximum number of parts?

In [None]:
# Rank sets from most to fewest parts and show the top of the list.
sets_df.sort_values(by='num_parts', ascending=False).head(5)

To count the number of sets produced each year, group by year and count()

In [None]:
# Per-year count of non-null entries in every column; 'set_num' then holds
# the number of sets per year.
year_groups = sets_df.groupby(by='year')
sets_by_year = year_groups.count()

In [None]:
# Number of sets for the earliest years in the data.
sets_by_year['set_num'].head()

In [None]:
# Line chart of sets per year across every year in the data.
years = sets_by_year.index
set_counts = sets_by_year['set_num']
plt.plot(years, set_counts, linewidth=3)

We drop the data points for 2020 and 2021 because the data for those years is incomplete.

Slice the last two data points off both the year index and the set_num values.

In [None]:
# Sets per year, excluding the last two years (2020 and 2021), which the
# notebook treats as incomplete.
# `label` must be a short string: the original passed the whole Series, which
# matplotlib would stringify into an unusable legend entry.
plt.plot(sets_by_year.index[:-2], sets_by_year.set_num[:-2],
         linewidth=3, label='Number of sets')
plt.xlabel('Year')
plt.ylabel('Number of sets')
plt.legend();

In [None]:
# Preview the first rows of the sets table.
sets_df.head()

In [None]:
# Preview the last rows of the sets table.
sets_df.tail()

## To find out how many themes per year we aggregate unique theme per year

In [None]:
# For each year, count how many distinct theme ids appear among its sets.
themes_per_year = sets_df.groupby('year').agg({'theme_id': 'nunique'})

In [None]:
# Distinct theme counts for the earliest years.
themes_per_year.head()

In [None]:
# Distinct theme counts for the most recent years.
themes_per_year.tail()

## Rename the column 'theme_id' to 'num_theme'

In [None]:
# The column now holds a count rather than an id, so name it 'num_theme'.
themes_per_year = themes_per_year.rename(columns={'theme_id': 'num_theme'})

In [None]:
# Check the renamed column on the earliest years.
themes_per_year.head()

In [None]:
# Check the renamed column on the most recent years.
themes_per_year.tail()

In [None]:
# Number of distinct themes per year.
plt.figure(figsize=(12,8))
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.ylim(0, 100)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Number of Themes', fontsize=14)
# Drop the last two years (2020 and 2021), matching the sets-per-year plot:
# the notebook treats both as incomplete, so slicing only one year off would
# keep a partial year. `label` must be a string, not the Series itself.
plt.plot(themes_per_year.index[:-2], themes_per_year.num_theme[:-2],
         linewidth=3, label='Number of themes')
plt.legend(fontsize=14);

## Plotting number of sets per year and number of themes per year on same graph

In [None]:

# Both series on a single y-axis. The last two years (2020 and 2021) are
# dropped from each, since the notebook treats both as incomplete.
plt.plot(themes_per_year.index[:-2], themes_per_year.num_theme[:-2])
plt.plot(sets_by_year.index[:-2], sets_by_year.set_num[:-2]);

In [None]:
# Themes and sets per year on separate y-axes that share one x-axis, so the
# two differently scaled series can be compared.
plt.figure(figsize=(12,8))
ax1 = plt.gca()    # left axis: number of themes
ax2 = ax1.twinx()  # right axis: number of sets, sharing ax1's x-axis
# Drop the last two years (2020 and 2021); the notebook treats both as
# incomplete, so both series are trimmed the same way.
ax1.plot(themes_per_year.index[:-2], themes_per_year.num_theme[:-2], color='blue')
ax2.plot(sets_by_year.index[:-2], sets_by_year.set_num[:-2], color='red')
# The x values are years, so label the axis 'Year' like the other plots.
ax1.set_xlabel('Year', fontsize=14)
ax1.set_ylabel('Number of Themes', color='blue', fontsize=14)
ax2.set_ylabel('Number of sets', color='red', fontsize=14);

Average number of parts used by Lego per year and how it has changed

In [None]:
# Mean number of parts per set for each year.
Avg_num_parts = sets_df.groupby('year').agg({'num_parts': 'mean'})
Avg_num_parts.head()

In [None]:
# Average parts per set for the most recent years.
Avg_num_parts.tail()

Creating a scatter plot to visualize the Avg_num_parts dataframe

In [None]:
# Average parts per set by year. The last two years (2020 and 2021) are
# dropped because the notebook treats both as incomplete.
plt.scatter(Avg_num_parts.index[:-2], Avg_num_parts.num_parts[:-2])
plt.xlabel('Year')
plt.ylabel('Average number of parts per set');

Display the database schema (link: https://i.imgur.com/Sg4lcjx.png) inside the Notebook.

<img src="https://i.imgur.com/Sg4lcjx.png" alt="Display Database Schema">

In [None]:
# Preview the first rows of the themes table.
themes_df.head()

In [None]:
# Preview the last rows of the themes table.
themes_df.tail()

In [None]:
# Per-year theme counts again, for comparison with the themes table.
themes_per_year.head()

In [None]:
# (rows, columns): one row per year in the data.
themes_per_year.shape

In [None]:
# Rows of the themes table whose name is exactly 'X-Men'.
themes_df.loc[themes_df['name'] == 'X-Men']

In [None]:
# Sets whose theme_id is 707.
# NOTE(review): presumably an id taken from the 'X-Men' lookup above -- confirm.
sets_df.loc[sets_df['theme_id'] == 707]

In [None]:
# Preview the last rows of the sets table.
sets_df.tail()

In [None]:
# Number of sets per theme_id, most frequent first; show the top five.
set_theme_count = sets_df.theme_id.value_counts()
set_theme_count.iloc[:5]

In [None]:
# First entries of the per-theme set counts.
set_theme_count.head()

Convert the series, set_theme_count into a dataframe

In [None]:
# Build the (id, set_count) frame straight from sets_df so the cell can be
# re-run. The original rebuilt `set_theme_count` from itself, so a second run
# received a DataFrame instead of the Series and raised.
theme_counts = sets_df['theme_id'].value_counts()
set_theme_count = pd.DataFrame({'id': theme_counts.index,
                                'set_count': theme_counts.values})

In [None]:
# Preview the themes table; its 'id' column is the merge key used next.
themes_df.head()

Merging the DataFrames themes_df and set_theme_count on id

In [None]:
# Attach each theme's set count to the themes table by matching on 'id'.
merged_df = themes_df.merge(set_theme_count, on='id')
merged_df.head()

In [None]:
# Merged rows whose theme name is exactly 'Star Wars'.
merged_df.loc[merged_df['name'] == 'Star Wars']

Sort the Dataframe into new df sorted by number of sets

In [None]:
# Order themes from most to fewest sets.
merged_sum_df = merged_df.sort_values('set_count', ascending=False)
merged_sum_df.head()

#### As can be seen from the chart below, LEGO has diversified into many products, such as key chains, gear, etc., which also include school bags. It would be interesting to see whether adding these non-LEGO items dilutes the brand or increases revenue.

In [None]:
# Bar chart of the ten themes with the most sets (merged_sum_df is already
# sorted by set_count, descending).
top_names = merged_sum_df.name.iloc[:10]
top_counts = merged_sum_df.set_count.iloc[:10]
plt.figure(figsize=(12,8))
plt.bar(top_names, top_counts)
plt.xticks(fontsize=14, rotation=45)  # rotate labels so long theme names fit
plt.yticks(fontsize=14)
plt.xlabel('Names of Themes', fontsize=14)
plt.ylabel('Number of Sets', fontsize=14)
plt.title('Top ten themes as per number of sets', fontsize=16)