In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Intro to Data Visualization
- In general, data visualizations fall into four types: Comparison, Distribution, Relationship, and Composition. 
- Here, some important graphs are shared using [plotly](https://plotly.com/). I hope you enjoy joining this competition. 
- Graph Reference: https://www.tapclicks.com/wp-content/uploads/How-to-Visualize-your-Data-with-Charts-and-Graphs.jpg


![](https://www.tapclicks.com/wp-content/uploads/How-to-Visualize-your-Data-with-Charts-and-Graphs.jpg)

## Understanding Data Types
See below. 

| Scale                    	| Nominal 	| Ordinal                 	| Interval      	| Ratio    	|
|--------------------------	|---------	|-------------------------	|---------------	|----------	|
| example                  	| "Color" 	| "Level of Satisfaction" 	| "Temperature" 	| "Height" 	|
| Labeled                  	| Yes     	| Yes                     	| Yes           	| Yes      	|
| Meaningful Order         	| No      	| Yes                     	| Yes           	| Yes      	|
| Measurable Differences   	| No      	| No                      	| Yes           	| Yes      	|
| True Zero Starting Point 	| No      	| No                      	| No            	|    Yes   	|


## Data Import & Preparation
- The dataset consists of questions and answers. 
- The questions are of two kinds: main and supplementary. 

In [None]:
import pandas as pd

# Raw 2021 survey export; row 0 of this frame is split off as `questions` in the following cells.
SURVEY_2021_CSV = "../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv"
df = pd.read_csv(SURVEY_2021_CSV)
df.head()

- Row 0 is better removed from the responses and kept as a separate object called `questions`. 

In [None]:
# Row 0 of the raw frame as a Series indexed by column name (Q1, Q2, ...).
questions = df.iloc[0]
questions

In [None]:
# Drop row 0 (already saved as `questions`) and renumber the remaining rows from 0.
# NOTE(review): this cell rebinds `df`, so re-running it without first re-running
# the read_csv cell removes one more row each time.
df = df.iloc[1:, :].reset_index(drop = True)
df.head()

In [None]:
# Number of rows per distinct Q25 answer (value_counts leaves out missing answers by default).
df['Q25'].value_counts()

## Plotly Structure
- Before you dive into the plotly ecosystem, you should read the article [Graph Objects in Python](https://plotly.com/python/graph-objects/).
- To be clear, Plotly Express is built on graph objects: every Plotly Express function returns an instance of `plotly.graph_objects.Figure`. It is, however, limited when creating [subplots of different types](https://plotly.com/python/mixed-subplots/), [dual-axis plots](https://plotly.com/python/multiple-axes/), and some [faceted plots](https://plotly.com/python/facet-plots/).  
    + It seems to me that the relationship between graph objects and Plotly Express is similar to the one between matplotlib and seaborn, isn't it? 
- So, if you are a newbie, I suggest setting Plotly Express aside at first and learning how to build figures with graph objects. 
- We will see how differently the two are coded. 

### Plotly Express
- The way to code is somehow similar with [seaborn](https://seaborn.pydata.org/)

In [None]:
# Toy long-format table: one row per (fruit, contestant) pair with the number eaten.
temp = pd.DataFrame(
    [
        ("Apples", "Alex", 2),
        ("Oranges", "Alex", 1),
        ("Bananas", "Alex", 3),
        ("Apples", "Jordan", 1),
        ("Oranges", "Jordan", 3),
        ("Bananas", "Jordan", 2),
    ],
    columns=["Fruit", "Contestant", "Number Eaten"],
)

temp

In [None]:
# plotly express
from plotly.offline import init_notebook_mode
import plotly.express as px

init_notebook_mode(connected=True)

# barmode="group" puts the contestants side by side; if barmode is not set the bars are stacked.
fig = px.bar(
    temp,
    x="Fruit",
    y="Number Eaten",
    color="Contestant",
    barmode="group",
)
fig.show()

In [None]:
import plotly.express as px

# Built-in long-form sample data. Ending the cell with the frame itself lets
# the notebook render it as a table instead of printing plain text.
long_df = px.data.medals_long()
long_df

In [None]:
import plotly.express as px

long_df = px.data.medals_long()

# One bar per nation, split by medal colour, with counts on the y axis.
fig = px.bar(
    long_df,
    x="nation",
    y="count",
    color="medal",
    title="Long-Form Input",
)
fig.show()

### Graph Objects
- A `go.Figure()` instance must be created before visualizing. 

In [None]:
import plotly.graph_objects as go

# Filter once per contestant and take BOTH x and y from the same rows, so each
# fruit label is paired with that contestant's own count. (Previously x held
# all six 'Fruit' rows while y held only the three filtered values.)
alex_rows = temp[temp['Contestant'] == "Alex"]
jordan_rows = temp[temp['Contestant'] == "Jordan"]

fig = go.Figure()
fig.add_trace(go.Bar(name = "Alex", x = alex_rows['Fruit'], y = alex_rows['Number Eaten'].values))
fig.add_trace(go.Bar(name = "Jordan", x = jordan_rows['Fruit'], y = jordan_rows['Number Eaten'].values))
fig.update_layout(barmode='group')
fig.show()

- But this is tedious to write, and the code is not very readable either. 
- So, let's call `fig.add_trace()` inside a for-loop.

In [None]:
# groupby("Contestant") yields one (key, sub-frame) pair per contestant;
# print both to see what each loop iteration receives.
for contestant, group in temp.groupby("Contestant"):
    print("contestant:", contestant, "\n")
    print("group:\n", group, "\n")

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
# Each (contestant, sub-frame) pair produced by groupby becomes one Bar trace.
for person, eaten in temp.groupby("Contestant"):
    bar = go.Bar(x=eaten['Fruit'], y=eaten['Number Eaten'], name=person)
    fig.add_trace(bar)
fig.update_layout(barmode='group')
fig.show()

## Figure Object
- Plotly supports more than about 50 types of charts, providing 2D and 3D visualizations, ternary plots, maps, etc. 
- We need to focus `Figure` Objects here. 
    + It mainly has two main sub-components - data & layout. 


In [None]:
import plotly.graph_objects as go
# An empty Figure shown with the "json" renderer, to inspect the figure's
# structure rather than draw a chart.
fig = go.Figure()
fig.show("json")

### Data & layout
- Data
    + It literally contains the data: x-values, y-values, circles, rectangles, lines, etc. 
    + Which values to supply depends on the chart type. 
        + for a scatter plot, it needs x and y values
        + for a map chart, it needs lat and lon values. 
- Layout
    + Everything except the data-relevant attributes belongs to this attribute. 
    + Many are related to styling elements, such as font size, location, and more. 
- If you want to know more about the structure, just run `fig.show("json")`. You may not see the result here, but you can see it in your own Kaggle notebook. Just try it. 

In [None]:
import plotly.graph_objects as go 
# Same inspection as before, now with one Scatter trace added to the figure.
fig = go.Figure()
fig.add_trace(go.Scatter(x = [1, 2, 3], y = [4, 5, 2]))
fig.show("json")

## Review Dataset
- If you look at the Kaggle survey dataset, the data is essentially nominal (categorical). 
- In this case, the main skill for understanding the data and extracting insights is the ability to build count-based pivot tables.
- For that, [value_counts() in the pandas library](https://pandas.pydata.org/docs/reference/api/pandas.Series.value_counts.html) will be used frequently. 

In [None]:
# Number of rows per distinct Q1 answer.
df['Q1'].value_counts()

## What to Visualize
- If you clearly read [that image](https://www.tapclicks.com/wp-content/uploads/How-to-Visualize-your-Data-with-Charts-and-Graphs.jpg) above, the main purpose of visualization with this dataset would be in both comparison and composition. It's difficult to visualize the `Relationship` & `Distribution`. 

### Bar Graph
- Let's Create Bar Graph. 


In [None]:
# Row count per Q1 answer; the answers become the bar labels, the counts the bar heights.
q1_df = df['Q1'].value_counts()

q1_bars = go.Bar(x=q1_df.index, y=q1_df.values)
fig = go.Figure(data=[q1_bars])

fig.show()

#### Styling Changes
- When you feel to change layout, then just add [update_layout()](https://plotly.com/python/creating-and-updating-figures/).
- Here, just add title, axis title, category order, specially customized manner.


In [None]:
q1_df = df['Q1'].value_counts()

CATEGORY_ORDER = ["18-21", "22-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60-69", "70+"]

# basic graph
fig = go.Figure(data=[go.Bar(x=q1_df.index, y=q1_df.values)])

# styling changes: one dict per layout component
x_axis_style = dict(
    title="your X-AXIS TITLE",
    linecolor="#21DBAA",
    categoryorder="array",          # order the x categories explicitly ...
    categoryarray=CATEGORY_ORDER,   # ... following CATEGORY_ORDER
)
y_axis_style = dict(title="your Y-AXIS TITLE", linecolor="#DB9021")

fig.update_layout(
    plot_bgcolor="white",
    font=dict(color="#909999"),
    title=dict(text="your TITLE text"),
    xaxis=x_axis_style,
    yaxis=y_axis_style,
)

fig.show()

#### Group Bar Chart
- Now, let's add gender metric to this graph.
- But we need to re-bin the answers into three groups, since the numbers for `Prefer not to say, Nonbinary, Prefer to self-describe` are relatively small. 


In [None]:
# Fold the three small answer groups into a single "etc" bucket.
small_groups = ['Prefer not to say', 'Nonbinary', 'Prefer to self-describe']
q1_q2_df = df.loc[:, ["Q1", "Q2"]].replace(dict.fromkeys(small_groups, 'etc'))
q1_q2_df['Q2'].value_counts()

- This code counts the rows in each Q2-Q1 group.
- Now, we need the number of respondents in each group. 
- This is an important step in preparing a new dataset before drawing the bar chart, because the dataset is usually made up of two categorical variables. 
- Now, let's add a new column called "Count". 

In [None]:
# Rows per (Q2, Q1) pair, flattened back into a regular frame with a "Count" column.
pair_sizes = q1_q2_df.groupby(['Q2','Q1']).size()
q1_q2_df = pair_sizes.reset_index(name = "Count")
q1_q2_df.head()

- Now, we need a graph using for-loop like before.
- It's very easy to draw. 

In [None]:
fig = go.Figure()
# One Bar trace per Q2 group: Q1 categories on x, their counts on y.
for q2_value, counts in q1_q2_df.groupby("Q2"):
    fig.add_trace(go.Bar(x=counts['Q1'], y=counts['Count'], name=q2_value))
fig.update_layout(barmode="group", plot_bgcolor="white")
fig.show()

## Ratio Graph
- A bar graph is good when readers only need to look at raw numbers. 
- But it is difficult to compare two countries with different social backgrounds appropriately. 
- In that case, ratios can be critical. 
- Let's look at Q3

In [None]:
# Row count per Q3 answer, drawn as one bar per answer.
q3_df = df['Q3'].value_counts()
fig = go.Figure(data=[go.Bar(x=q3_df.index, y=q3_df.values)])
fig.show()

- India outnumbers every other country.
- If you compare any other country with India on raw counts, India's numbers are larger in most cases. 

### Dataset 
- We need new dataset. 
- Replace() is great to re-group in this case. 
- It depends on how you customizes Q25. 

In [None]:
# Work on an explicit copy so the re-binning below never writes into (or
# silently misses) the original `df`. The previous version called
# `replace(..., inplace=True)` on `q3_q25['Q25']`, i.e. on a column pulled out of
# a selection of `df`; that pattern triggers SettingWithCopyWarning and, with
# pandas copy-on-write, does not update `q3_q25` at all.
q3_q25 = df[['Q3', 'Q25']].copy()

# Coarse Q25 bin -> the raw Q25 answers folded into it.
Q25_BINS = {
    '$0-1,999': ['$0-999', '1,000-1,999'],
    '$2,000-3,999': ['2,000-2,999', '3,000-3,999'],
    '$4,000-7,499': ['4,000-4,999', '5,000-7,499'],
    '$7,500+': ['25,000-29,999', '60,000-69,999',
                '30,000-39,999', '15,000-19,999', '70,000-79,999',
                '10,000-14,999', '20,000-24,999', '7,500-9,999',
                '100,000-124,999', '40,000-49,999', '50,000-59,999',
                '300,000-499,999', '200,000-249,999', '125,000-149,999',
                '250,000-299,999', '80,000-89,999', '90,000-99,999',
                '150,000-199,999', '>$1,000,000', '$500,000-999,999'],
}

# Flatten to raw answer -> bin label and apply it in a single pass. No bin label
# is itself a raw answer, so one pass matches the earlier step-by-step replaces.
q25_to_bin = {raw: label for label, raws in Q25_BINS.items() for raw in raws}
q3_q25['Q25'] = q3_q25['Q25'].replace(q25_to_bin)

- Many missing values exist, so I simply drop them. 

In [None]:
# Rebind instead of dropping in place: q3_q25 may be a selection of `df`, and
# in-place edits on such selections warn (SettingWithCopyWarning) in pandas.
q3_q25 = q3_q25.dropna(subset = ["Q25"])

- This code calculates count and percentage country by country
- Here, just select two countries India and USA. 

In [None]:
# Rows per (Q3, Q25) pair as a flat frame with a "Count" column.
q3_q25 = q3_q25.groupby(['Q3','Q25']).size().reset_index(name = "Count")

# India: keep its rows and add each row's fraction of India's total count.
india_df = (
    q3_q25.loc[q3_q25['Q3'].eq("India")]
    .reset_index(drop = True)
    .assign(percentage = lambda rows: rows["Count"] / rows["Count"].sum())
)
india_df.head()

In [None]:
# USA: keep its rows and add each row's fraction of the USA's total count.
usa_df = (
    q3_q25.loc[q3_q25['Q3'].eq("United States of America")]
    .reset_index(drop = True)
    .assign(percentage = lambda rows: rows["Count"] / rows["Count"].sum())
)
usa_df.head()

- Add new column "%", but we will not add "%" here but will add on plotly figure. 

In [None]:
# Percentages rounded to one decimal; the "%" sign itself is added on the figure.
india_df['%'] = np.round(india_df['percentage'] * 100, 1)
usa_df['%'] = np.round(usa_df['percentage'] * 100, 1)

# ignore_index=True gives the stacked frame a fresh 0..n-1 index. A bare
# reset_index() kept the old per-country row numbers as a stray "index" column.
india_usa_df = pd.concat([india_df, usa_df], ignore_index = True)
india_usa_df

- The code is similar as it was but add two parameters - text and textposition. 

In [None]:
fig = go.Figure()
# One Bar trace per country; `text` writes the rounded percentage on each bar.
for country_name, shares in india_usa_df.groupby("Q3"):
    bar_labels = shares['%'].astype(str) + "%"
    fig.add_trace(
        go.Bar(
            x=shares['Q25'],
            y=shares['%'],
            name=country_name,
            text=bar_labels,
            textposition='auto',
        )
    )
fig.update_layout(barmode="group", plot_bgcolor="white")
fig.show()

## Bar Chart with Programming Percent Year
- Load all datasets, 2019-2021. 

In [None]:
# Load the raw 2021, 2020 and 2019 survey exports.
# NOTE(review): these frames are read as-is, so a question-text row may still
# be present in each of them (it was only stripped from `df` earlier) — confirm.
df_2021 = pd.read_csv("../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv")
df_2020 = pd.read_csv("../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv")
df_2019 = pd.read_csv("../input/kaggle-survey-2019/multiple_choice_responses.csv")

- The question numbers vary from year to year. 
- For example, the programming-language question has a different number in 2019 than in 2020 and 2021. 
    + So, we first need to extract the programming-language answers for each year.
    + In each year, the list of languages is also a bit different. 
    + The languages most relevant to data science are Python, R, SQL, Julia, C, and C++. 

In [None]:
# Distinct values of the language column in each year
# (column Q19 in the 2019 file, Q8 in the 2020 and 2021 files).
print("2019:", df_2019['Q19'].unique().tolist())
print("2020:", df_2020['Q8'].unique().tolist())
print("2021:", df_2021['Q8'].unique().tolist())

In [None]:
programming_list = ["Python", "R", "SQL", "Java", "C", "Bash", "Javascript", "C++"]
programming_df = pd.Series(programming_list)

# Keep only the rows whose language answer (Q19 in 2019, Q8 afterwards) is in programming_list.
keep_2019 = df_2019['Q19'].isin(programming_df)
keep_2020 = df_2020['Q8'].isin(programming_df)
keep_2021 = df_2021['Q8'].isin(programming_df)

df_2019 = df_2019[keep_2019]
df_2020 = df_2020[keep_2020]
df_2021 = df_2021[keep_2021]

# Show which answers remain in each year.
for label, answers in (("2019:", df_2019['Q19']), ("2020:", df_2020['Q8']), ("2021:", df_2021['Q8'])):
    print(label, answers.unique().tolist())

- Good, it becomes much clear. 
- Now we will select some other questions, country, programmings, and Job. 

In [None]:
# 2019 stores the language answer as Q19; rename it to Q8 so all three years share column names.
q3_q5_q19_2019 = df_2019.loc[:, ['Q3', 'Q5', 'Q19']].rename(columns = {'Q19': 'Q8'})

shared_cols = ['Q3', 'Q5', 'Q8']
q3_q5_q8_2020 = df_2020.loc[:, shared_cols]
q3_q5_q8_2021 = df_2021.loc[:, shared_cols]

q3_q5_q19_2019.shape, q3_q5_q8_2020.shape, q3_q5_q8_2021.shape

- Let's add year feature to differentiate each column. 

In [None]:
# Tag every row with its survey year (stored as a string) so the frames can be stacked later.
for survey_year, yearly_frame in (
    ('2019', q3_q5_q19_2019),
    ('2020', q3_q5_q8_2020),
    ('2021', q3_q5_q8_2021),
):
    yearly_frame['year'] = survey_year

q3_q5_q19_2019.shape, q3_q5_q8_2020.shape, q3_q5_q8_2021.shape

- And finally we will all combined into one dataset. 
- This dataset is just an example, so you can create more new dataset with other questions robustly. 

In [None]:
# Stack the three yearly frames row-wise; each frame's original row labels are kept.
final_df = pd.concat([q3_q5_q19_2019, q3_q5_q8_2020, q3_q5_q8_2021])
final_df.head()

- Now, it's time to calculate count-ratio and save the calculated object differently. 
- My intention here is to draw multiple bar graphs from different datasets. 

In [None]:
# Rows per (year, Q8) pair as a flat frame with a "Count" column.
year_q5_q8 = final_df.groupby(['year', 'Q8']).size().reset_index(name = "Count")


def _q8_share(counts, year):
    """Rows of `counts` for one year, plus 'percentage' (fraction of that year's
    total Count) and '%' (the same value times 100, rounded to one decimal)."""
    one_year = counts[counts['year'] == year].reset_index(drop = True)
    one_year['percentage'] = one_year["Count"] / one_year["Count"].sum()
    one_year['%'] = np.round(one_year['percentage'] * 100, 1)
    return one_year


q8_2019 = _q8_share(year_q5_q8, "2019")
q8_2020 = _q8_share(year_q5_q8, "2020")
q8_2021 = _q8_share(year_q5_q8, "2021")

- Now, just add graphs and will show you bar graphs with different years. 

In [None]:
fig = go.Figure()

# One Bar trace per survey year; each bar is labelled with its rounded percentage.
for year_label, shares in (("2019", q8_2019), ("2020", q8_2020), ("2021", q8_2021)):
    fig.add_trace(
        go.Bar(
            x=shares['Q8'],
            y=shares['%'],
            name=year_label,
            text=shares['%'].astype(str) + "%",
            textposition='auto',
        )
    )

fig.show()

## Heatmap Graph
- Heatmap is a two-dimensional visual representation of data. This chart is encoded in colors and values, delivering new insight of information. 
- In most cases, Heatmap shows count in each box. 
- It's difficult to draw heatmap with annotation when using ONLY `graph_objects`, in most cases you must use for-loop. 
- But, Plotly already provides [figure factory](https://plotly.com/python/figure-factories/)

In [None]:
import plotly.figure_factory as ff

# z holds one inner list per y label (3 rows), each with one value per x label (5 columns).
z=[[1, 90, 30, 50, 1], [20, 1, 60, 80, 30], [30, 60, 1, 50, 20]]
x=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
y=['Morning', 'Afternoon', 'Evening']

# The figure factory builds the heatmap together with its per-cell annotations.
fig = ff.create_annotated_heatmap(z, x = x, y = y, colorscale = "Viridis")
fig.show()

- If you want to draw heatmap ONLY using `graph_objects`, then you are able to code like below. 
- But, for newbie, I would not recommend this code below. 

In [None]:
import plotly.graph_objects as go
from functools import reduce
from itertools import product

# Same sample grid as before: 3 rows (y labels) by 5 columns (x labels).
z=[[1, 90, 30, 50, 1], [20, 1, 60, 80, 30], [30, 60, 1, 50, 20]]
x=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
y=['Morning', 'Afternoon', 'Evening']

def get_anno_text(z_value):
    """Build one annotation dict per cell of a 2-D list of values.

    Args:
        z_value: list of row lists. The column count is taken from the first
            row, and the rows are concatenated with ``+`` (use ``.flat``
            instead when dealing with numpy arrays).

    Returns:
        A list of dicts in row-major order, each with white font colour, no
        arrow, the cell value as text, ``x`` set to the column index and ``y``
        set to the row index.
    """
    n_rows = len(z_value)
    n_cols = len(z_value[0])
    cell_values = reduce(lambda left, right: left + right, z_value)
    grid_positions = product(range(n_rows), range(n_cols))
    return [
        {
            'font': {'color': '#FFFFFF'},
            'showarrow': False,
            'text': str(value),
            'x': col_idx,
            'y': row_idx,
        }
        for (row_idx, col_idx), value in zip(grid_positions, cell_values)
    ]

# Plain Heatmap trace built from the sample grid.
fig = go.Figure(data=go.Heatmap(
                   z=z,
                   x=x,
                   y=y,
                   hoverongaps = True))

# Attach one text annotation per cell, as produced by get_anno_text.
fig.update_layout(annotations = get_anno_text(z))
fig.show()

- Now we know how to create a heatmap. But we want to see the relationship between two variables. How? 
- Let's select questions Q4 & Q1

In [None]:
# Quick look at the first rows of `df` before cross-tabulating.
df.head()

- Now, will transform into all counted dataset, filling missing values with 0

In [None]:
# Cross-tabulate Q4 (rows) against Q1 (columns); combinations with no rows become 0.
df.groupby(['Q4', 'Q1']).size().unstack().fillna(0).astype("int16")

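- The key step is the code below. It converts the count table into row-wise proportions (each row sums to about 1), saved as `z_data`. Note that this is a normalized frequency table rather than a true correlation matrix.

```python
z.apply(lambda x:np.round(x/x.sum(), 2), axis = 1).to_numpy()
```

- The x and y values must be lists. 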



In [None]:
import plotly.graph_objects as go
import plotly.figure_factory as ff

# Count table: Q4 answers as rows, Q1 answers as columns, absent pairs filled with 0.
z = df.groupby(['Q4', 'Q1']).size().unstack(fill_value=0).astype('int64')

# Row-wise shares: every cell divided by its row total, rounded to 2 decimals
# (a normalized frequency table, not a correlation matrix).
row_totals = z.sum(axis=1)
z_data = z.div(row_totals, axis=0).round(2).to_numpy()

x = z.columns.tolist()
y = z.index.tolist()

fig = ff.create_annotated_heatmap(z_data, x = x, y = y, colorscale = "Viridis")
fig.show()

## Multiple Plots

In [None]:
import pandas as pd
# Fresh read of the 2021 responses for this section.
df21 = pd.read_csv("../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv")
# Row 0 is split off as `questions`; it is looked up by column name below.
questions = df21.iloc[0, :].T
# Remaining rows; the index is not reset here, so row labels start at 1.
df21 = df21.iloc[1:, :]
df21.head()

- Let's check two questions. They are about cloud services.
    + One is about developers' satisfaction, the other about daily usage. 

In [None]:
# Print the row-0 text stored for the two columns of interest.
print(questions['Q28'])
print(questions['Q29_A_Part_1'])

- Now, one problem exists: the answers to supplementary (multi-part) questions are spread over several columns and must be combined into one dataset. So, let's create a function. 
    + Main reference: https://www.kaggle.com/ruchi798/kaggle-ml-ds-survey-analysis
    + It just adds an if-condition.

In [None]:
def sub_questions_count(question_num, part_num, text = False, data = None):
  """Count the most frequent answer of each sub-question column.

  Column names are built as ``Q<question_num>_Part_<j>`` for ``j`` in
  ``1 .. part_num - 1`` plus ``Q<question_num>_OTHER``. When ``text`` is "A" or
  "B" they become ``Q<question_num>_<text>_Part_<j>`` and
  ``Q<question_num>_<text>_OTHER``.

  Args:
    question_num: question number used in the column names.
    part_num: exclusive upper bound of the part index (parts 1 .. part_num - 1).
    text: "A" or "B" to select that question variant; any other value selects
      the un-suffixed column names.
    data: DataFrame to read the columns from. Defaults to the notebook-level
      ``df`` when omitted.

  Returns:
    DataFrame with the columns ``Category`` (most frequent non-null value of a
    sub-question column) and ``Count`` (how often it occurs), sorted by
    ``Count`` in descending order. Columns without any non-null value are
    skipped.

  Raises:
    KeyError: if a generated column name does not exist in the frame.
  """
  source = df if data is None else data

  if text in ["A", "B"]:
    prefix = 'Q' + str(question_num) + "_" + text
  else:
    prefix = 'Q' + str(question_num)
  part_questions = [prefix + '_Part_' + str(j) for j in range(1, part_num)]
  part_questions.append(prefix + '_OTHER')

  # category count
  categories = []
  counts = []
  for column in part_questions:
    answer_counts = source[column].value_counts()  # computed once per column
    if answer_counts.empty:
      # Nobody answered this part: there is no category to report.
      continue
    categories.append(answer_counts.index[0])
    # .iloc[0] is explicitly positional; answer_counts[0] would be treated as a
    # label lookup on this label-indexed Series (deprecated positional fallback).
    counts.append(answer_counts.iloc[0])

  combined_df = pd.DataFrame({'Category': categories, 'Count': counts})
  return combined_df.sort_values(['Count'], ascending = False)

- Let's Test
- If you look at it, the number of cloud users is not large compared to the number of responses (N=25973). 

In [None]:
# With part_num=4 the function reads Q29_A_Part_1 .. Q29_A_Part_3 plus Q29_A_OTHER
# (its range(1, part_num) stops before 4).
sub_questions_count(29, 4, "A")

- Now, there are Many Clouds system. But, AWS, GCP, and Azures are most used. 

In [None]:
# Number of rows per distinct Q28 answer in df21.
df21['Q28'].value_counts()

- Now, will make two plots with two questions differently. 
- And, bar graph will be used in this plot.
- As usual, it's always necessary to check tutorial and docs. 
    + Ref. https://plotly.com/python/subplots/
- It's very easy to build two graphs when using just `make_subplots()`

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Two small sample data sets, one per subplot.
x1 = [1, 2, 3]
y1 = [4, 5, 6]
x2 = [10, 20, 30]
y2 = [50, 60, 70]

fig = make_subplots(rows = 1, cols = 2,
                    subplot_titles=("Plot 1", "Plot 2"), # title of each graph area
                    column_widths = [0.7, 0.3]) # size control
fig.add_trace(
    go.Scatter(x = x1, y = y1), # Graph Area
    row=1, col = 1              # layout area
)

# The second panel shows the second data set. It previously re-plotted
# (x1, y1), which left x2 / y2 unused and made both panels identical.
fig.add_trace(
    go.Scatter(x = x2, y = y2), # Graph Area
    row=1, col=2                # layout area
)

# Graph Option Area
fig.update_layout(height=500, width=700,
                  title_text="Multiple Subplots with Titles") # Whole Graph Title Area

fig.show()

- Now we will compare four countries:
    + China, Japan, South Korea, and the USA. 
- Let's pick these four countries.
- We will use Q5. It contains many job titles, but we select only a few of them. 

In [None]:
# containing data related to Data Science Field Only 
df21 = df21[df21['Q5'].isin(['Student','Data Scientist','Software Engineer', 'Data Analyst', 'Machine Learning Engineer','Research Scientist'])]
df21['Q5'].value_counts()

- Let's check data shape if new data is correctly created.
- And Pick four countries

In [None]:
# Shorten two country names. `assign` returns a new frame, so the
# boolean-filtered selection held in df21 is never written to in place
# (direct column assignment on it raises SettingWithCopyWarning).
df21 = df21.assign(Q3 = df21['Q3'].replace(['United States of America', 'South Korea'], ['USA', 'Korea'])) # Change Name
country_list = ["USA", "China", "Japan", "Korea"]
countries_df = df21[df21['Q3'].isin(country_list)]
countries_df.shape

- Now, the country is fixed but other questions can be varied. 
- To draw, bar graph, we need count & percentage. 

In [None]:
# Rows per (Q3, Q28) pair as a flat frame with a "Count" column.
q3_q28 = countries_df.groupby(['Q3', 'Q28']).size().reset_index(name = "Count")

def get_pnt(data, country):
    """Return the rows of `data` for one country with share columns added.

    Args:
        data: frame with at least the columns 'Q3' and 'Count'.
        country: value of 'Q3' whose rows are kept.

    Returns:
        The matching rows, re-indexed from 0, with two extra columns:
        'percentage' (Count divided by the total Count of the kept rows) and
        '%' (that fraction times 100, rounded to one decimal).
    """
    is_country = data['Q3'] == country
    subset = data[is_country].reset_index(drop = True)
    total = subset["Count"].sum()
    subset['percentage'] = subset["Count"] / total
    subset['%'] = np.round(subset['percentage'] * 100, 1)
    return subset

# One share table per country: that country's Q28 counts plus 'percentage' and '%'.
usa_df = get_pnt(q3_q28, "USA")
china_df = get_pnt(q3_q28, "China")
japan_df = get_pnt(q3_q28, "Japan")
korea_df = get_pnt(q3_q28, "Korea")

- All comments, how to code, are written. But Docs are most important resource. Always check it out. 
    + Long text in your graph must be shorten, this task remains for you :P
- Analyzing is up to you. No Comment here. 
- It could be possible to draw more if you add year by year. 

In [None]:
import plotly.graph_objects as go
import plotly.figure_factory as ff

fig = make_subplots(
    rows=2,
    cols=2,
    shared_xaxes=True,   # Shared X Axes
    shared_yaxes=True,   # Shared Y Axes
    vertical_spacing=0.05,
    subplot_titles=("USA with Q28", "China with Q28", "Japan with Q28", "Korea with Q28"),  # title of each graph area
    column_widths=[0.5, 0.5],  # size control
)

# (share table, grid row, grid column) for each panel, in the same order as the subplot titles.
panels = [
    (usa_df, 1, 1),
    (china_df, 1, 2),
    (japan_df, 2, 1),
    (korea_df, 2, 2),
]
for shares, row_idx, col_idx in panels:
    bars = go.Bar(
        x=shares['Q28'],
        y=shares['%'],
        text=shares['%'].astype(str) + "%",
        textposition='auto',
    )
    fig.add_trace(bars, row=row_idx, col=col_idx)

fig.update_layout(height=1000, showlegend=False)

fig.show()

- Let's make heatmaps using subplots.
- But a for-loop must be written to annotate the text on the graph. 
- It is the same logic as when we created the heatmap before. 
- If you want to create heatmaps for more countries, you should write a user-defined function and use a for-loop. This task is up to you. 
    + Here, everything is simply hard-coded. 

In [None]:
# Korea: count table with Q1 as rows and Q4 as columns, then every row divided
# by its own total and rounded to 2 decimals (row-wise shares, not a correlation).
kor_corr_df = countries_df[countries_df['Q3'] == "Korea"].reset_index(drop = True)
kor_z = kor_corr_df.groupby(['Q1', 'Q4']).size().unstack().fillna(0).astype('int64')
kor_z_data = kor_z.apply(lambda x:np.round(x/x.sum(), 2), axis = 1).to_numpy() # row-wise shares
kor_x = kor_z.columns.tolist()
kor_y = kor_z.index.tolist()

# Japan: same computation on the Japan rows.
jap_corr_df = countries_df[countries_df['Q3'] == "Japan"].reset_index(drop = True)
jap_z = jap_corr_df.groupby(['Q1', 'Q4']).size().unstack().fillna(0).astype('int64')
jap_z_data = jap_z.apply(lambda x:np.round(x/x.sum(), 2), axis = 1).to_numpy() # row-wise shares
jap_x = jap_z.columns.tolist()
jap_y = jap_z.index.tolist()

# Build each annotated heatmap separately, then move their traces into one subplot grid.
fig1 = ff.create_annotated_heatmap(kor_z_data, x=kor_x, y=kor_y)
fig2 = ff.create_annotated_heatmap(jap_z_data, x=jap_x, y=jap_y)
# Re-point the Japan annotations at the second subplot's y axis.
for annot in fig2['layout']['annotations']:
    annot['yref'] = 'y2' # rows if cols > 1, then annot['xref'] = 'x2' 
fig = make_subplots(rows=2, cols=1, 
                   subplot_titles=("Korea", "Japan"),
                   vertical_spacing = 0.03,
                   shared_xaxes=True, 
                   shared_yaxes=True) # Shared Y Axes
fig.add_trace(fig1.data[0], row=1, col=1)
fig.add_trace(fig2.data[0], row=2, col=1)
# NOTE(review): applying fig1.layout presumably replaces the annotations that
# make_subplots created for subplot_titles — confirm the titles still render.
fig.update_layout(fig1.layout, height = 1000)
# Append the (re-pointed) Japan annotations after the Korea ones.
fig.layout.annotations += fig2.layout.annotations
fig.show()

## Grouped Bar Chart with Multiple Axes
- Code Explanation will be continued
- References: https://plotly.com/python-api-reference/generated/plotly.graph_objects.Scatter.html
> mode – Determines the drawing mode for this scatter trace. If the provided mode includes “text” then the text elements appear at the coordinates. Otherwise, the text elements appear on hover. If there are less than 20 points and the trace is not stacked then the default is “lines+markers”. Otherwise, “lines”.

In [None]:
# Shared x categories for both traces.
char_text = ["A group", "B group", "C group"]

fig = go.Figure()
# Bars are drawn against the first y axis.
fig.add_trace(
    go.Bar(
           name = "A company",
           x = char_text, 
           y = [30, 15, 20], 
           yaxis = "y1"))

# The line is drawn against a second y axis.
fig.add_trace(
    go.Scatter(
           name = "B company",
           x = char_text, 
           y = [200, 300, 210], 
           mode = 'lines+markers', # please check option here
           yaxis = "y2"))

# yaxis2 is overlaid on the first y axis and placed on the right-hand side.
fig.update_layout(yaxis  = dict(title = "A Company", showgrid = False),
                  yaxis2 = dict(title = "B Company", overlaying = "y", side = "right", showgrid = False), 
                  template = "plotly_white")

fig.show()


- First, we need a pivot-table about Q28 ~ Q1.

In [None]:
# Count table: Q1 answers as rows, Q28 answers as columns; absent pairs become 0.
pivot_q1_q28 = df21.groupby(['Q1', 'Q28']).size().unstack().fillna(0).astype("int16")
pivot_q1_q28

- Now, it's time to get each numeric value to input graph. 
    + But, must check column value, which exists whitespace. Could remove it if you need but I don't do it here. 

In [None]:
# List the exact column labels (Q28 answers) before selecting columns by name.
pivot_q1_q28.columns

- Here we add two graphs to build a dual-axes graph. 
    + One is a bar graph on the left y axis; the other is a line graph on the right y axis.
- To draw a line graph, use `go.Scatter(~, mode = "lines"|"lines+markers"|"markers")`
- In the layout options, the key is to set `range`. If you remove it, the position of the zero line would differ between the left axis (bar graph) and the right axis (line graph). 

In [None]:
# Count table of Q1 (rows) by Q28 (columns); the column labels used below
# include leading and trailing spaces.
pivot_q1_q28 = df21.groupby(['Q1', 'Q28']).size().unstack().fillna(0).astype("int16")
aws_num = pivot_q1_q28[' Amazon Web Services (AWS) '].values.tolist()
azure_num = pivot_q1_q28[' Microsoft Azure '].values.tolist()
aga_category = pivot_q1_q28.index.tolist()

fig = go.Figure()

# AWS counts as bars on the first (left) y axis.
fig.add_trace(
    go.Bar(
           name = "Amazon Web Services (AWS)",
           x = aga_category, 
           y = aws_num, 
           yaxis = "y1"))

# Azure counts as a line on the second (right) y axis.
fig.add_trace(
    go.Scatter(
           name = "Microsoft Azure",
           x = aga_category, 
           y = azure_num, 
           mode = 'lines+markers', # please check option here
           yaxis = "y2"))

# This version deliberately leaves `range` off yaxis2 (see the commented line).
fig.update_layout(yaxis  = dict(title = "Amazon Web Services (AWS)", showgrid = False),
                  yaxis2 = dict(title = "Microsoft Azure", overlaying = "y1", side = "right", showgrid = False, 
                                zeroline = False), 
                                # range=[min(azure_num), max(azure_num)]), # This code solves the different zero set but with same zero values.
                  template = "plotly_white")

fig.show()

In [None]:
# Count table of Q1 (rows) by Q28 (columns); absent pairs become 0.
pivot_q1_q28 = df21.groupby(['Q1', 'Q28']).size().unstack().fillna(0).astype("int16")
aws_num = pivot_q1_q28[' Amazon Web Services (AWS) '].tolist()
azure_num = pivot_q1_q28[' Microsoft Azure '].tolist()
aga_category = pivot_q1_q28.index.tolist()

# AWS as bars on the left axis, Azure as a line on the right axis.
aws_bars = go.Bar(
    name="Amazon Web Services (AWS)",
    x=aga_category,
    y=aws_num,
    yaxis="y1",
)
azure_line = go.Scatter(
    name="Microsoft Azure",
    x=aga_category,
    y=azure_num,
    mode='lines+markers',  # please check option here
    yaxis="y2",
)
fig = go.Figure(data=[aws_bars, azure_line])

left_axis = dict(title="Amazon Web Services (AWS)", showgrid=False)
right_axis = dict(
    title="Microsoft Azure",
    overlaying="y1",
    side="right",
    showgrid=False,
    zeroline=False,
    # Pin the right axis to the span of the Azure counts; this is the setting
    # that differs from the previous cell.
    range=[min(azure_num), max(azure_num)],
)
fig.update_layout(yaxis=left_axis, yaxis2=right_axis, template="plotly_white")

fig.show()

## Stacked Area Chart
- A 100% stacked area chart shows how the constituent parts of a whole have changed over time or across groups.
- The `stackgroup` parameter is used to add up the y values of the different traces in the same group. Traces in the same group fill up to the next trace of the group.
- Let's create a sample chart first. 

In [None]:
# Four x positions, one for each value in the series below. The original x had
# only three labels for four y values per trace, so the lengths did not match.
# NOTE(review): "2020" is assumed as the fourth label — adjust if another label was intended.
x = ["2017", "2018", "2019", "2020"]

# Each list is ONE stacked series across all x positions (the names are kept
# from the original cell). At every x position the three series sum to 100,
# so the stack fills the 0-100 range set below.
y_2017 = [40, 60, 40, 10]
y_2018 = [20, 10, 10, 60]
y_2019 = [40, 30, 50, 30]

fig = go.Figure()
for series in (y_2017, y_2018, y_2019):
    # Same stackgroup => the traces are stacked on top of each other.
    fig.add_trace(go.Scatter(
        x = x,
        y = series,
        mode = "lines",
        line = dict(width = 0.5),
        stackgroup = "one"
    ))

fig.update_layout(yaxis_range = (0, 100))

fig.show()

- We will use `final_df`, already defined when drawing the bar chart. 

In [None]:
# Reminder of the combined frame's columns before aggregating by year and Q5.
final_df.head()

In [None]:
year_val = final_df['year'].unique().tolist()

# Rows per (year, Q5) pair as a flat frame with a "Count" column.
year_q5 = final_df.groupby(['year', 'Q5']).size().reset_index(name = "Count")

# Restrict to the five job titles that are charted.
selected_jobs = ['Student', 'Data Scientist', 'Data Analyst', 'Data Engineer', 'Software Engineer']
year_q5 = year_q5[year_q5['Q5'].isin(selected_jobs)]


def _q5_share(counts, year):
    """Rows of `counts` for one year, plus 'percentage' (fraction of that year's
    total Count) and '%' (the same value times 100, rounded to one decimal)."""
    one_year = counts[counts['year'] == year].reset_index(drop = True)
    one_year['percentage'] = one_year["Count"] / one_year["Count"].sum()
    one_year['%'] = np.round(one_year['percentage'] * 100, 1)
    return one_year


q5_2019 = _q5_share(year_q5, "2019")
q5_2020 = _q5_share(year_q5, "2020")
q5_2021 = _q5_share(year_q5, "2021")

In [None]:
# Stack the per-year tables, then reshape to wide form: one row per year,
# one column per Q5 value, cells holding the rounded percentage ('%').
year_q5_df = pd.concat([q5_2019, q5_2020, q5_2021], ignore_index = True)
year_q5_final = pd.pivot(year_q5_df, index = "year", columns = "Q5", values = "%").reset_index()
year_q5_final

In [None]:
year_val = year_q5_final['year'].unique().tolist()

fig = go.Figure()

# One stacked area per job title, added bottom-to-top in this order.
job_titles = ["Data Analyst", "Data Engineer", "Data Scientist", "Software Engineer", "Student"]
for job in job_titles:
    fig.add_trace(
        go.Scatter(
            x=year_val,
            y=year_q5_final[job].tolist(),
            mode="lines",
            name=job,
            line=dict(width=0.5),
            stackgroup="one",
        )
    )

fig.update_layout(yaxis_range=(0, 100))

fig.show()

## Legend Control
- Let's look at legend. 
- The main reference is https://plotly.com/python/legend/ if you are newbie, then please check here. 
- But, I want to share some more detail here. 
- let's make a position on top - right instead of just right on graph, default. 

In [None]:
import plotly.graph_objects as go

# Two toy line traces so the legend has entries to show.
fig = go.Figure(data = [
    go.Scatter(x = [0, 1, 2], y = [0, 1, 2], name = "Trace 1"),
    go.Scatter(x = [0, 1, 2], y = [0, 3, 4], name = "Trace 2"),
])

# option 1 legend positioning: a horizontal legend whose bottom-right corner
# is anchored at x = 1, y = 1.
fig.update_layout(legend = {
    "orientation": "h",
    "xanchor": "right",
    "x": 1,
    "yanchor": "bottom",
    "y": 1,
})

fig.show()

- Now, add a legend title and set the legend font color to blue. 

In [None]:
import plotly.graph_objects as go

# Same two toy traces as before; only the legend styling changes.
fig = go.Figure(data = [
    go.Scatter(x = [0, 1, 2], y = [0, 1, 2], name = "Trace 1"),
    go.Scatter(x = [0, 1, 2], y = [0, 3, 4], name = "Trace 2"),
])

fig.update_layout(legend = {
    # option 2: font used for the legend entries
    "font": {"family": "Courier", "size": 12, "color": "blue"},
    # heading displayed with the legend
    "title": {"text": "Legend Title"},
})

fig.show()

- Suppose you want to rename a specific entry in the legend.
- To do that, you need to access the traces through `fig.data`. 

In [None]:
import plotly.graph_objects as go

# Two line+marker traces, each with its own marker colour and symbol.
fig = go.Figure(data = [
    go.Scatter(
        x = [0, 1, 2],
        y = [3, 1, 2],
        name = "Trace 1",
        mode = "lines+markers",
        marker = {"size": 9, "color": '#ff9f43', "symbol": 18},
    ),
    go.Scatter(
        x = [0, 1, 2],
        y = [0, 3, 4],
        name = "Trace 2",
        mode = "lines+markers",
        marker = {"size": 9, "color": 'blue', "symbol": 11},
    ),
])

# Show the trace objects held by the figure before renaming anything.
print(fig.data)

# Pair replacement names with traces in order; the list has a single entry,
# so only the first trace is renamed and "Trace 2" keeps its name.
t_name = ["Name 1"] # only change 1st name
for trace, new_name in zip(fig.data, t_name):
    trace.name = new_name

fig.show()

- To be continued... 

## Color Bar Control
- A color bar on a heatmap gives readers more information when two tables are compared. 
- `make_subplots()` can be used to draw the heatmaps side by side. 
- Let's create two sample datasets. 

In [None]:
# Sample table 1: five jobs (A-E) with one value per year column.
# The 2018 and 2017 columns are written as floats on purpose.
data_01 = pd.DataFrame({"JOB": list("ABCDE")}).assign(**{
    "2021": [5120, 5115, 4555, 8790, 2393],
    "2020": [3737, 3758, 3222, 6823, 2496],
    "2019": [4168, 4085, 4208, 4956, 2300],
    "2018": [8574.0, 253.0, 13222.0, 0.0, 1810.0],
    "2017": [3216.0, 4028.0, 3294.0, 0.0, 6177.0],
})

# Sample table 2: same jobs and year columns, but much smaller values.
data_02 = pd.DataFrame({"JOB": list("ABCDE")}).assign(**{
    "2021": [10, 500, 455, 890, 233],
    "2020": [20, 80, 322, 683, 246],
    "2019": [15, 60, 408, 496, 230],
    "2018": [50, 253.0, 200.0, 10.0, 110.0],
    "2017": [300, 1503.0, 294.0, 40.0, 617.0],
})

- Now draw the heatmaps. If you look at the numbers on the color bar, they are hard to read. 
    + That is because the two heatmaps cover very different value ranges, yet their color bars are drawn in the same place, so the tick labels overlap. 

In [None]:
# Two heatmaps side by side in a 1x2 grid with a shared y axis.
fig = make_subplots(
    rows=1, cols=2,
    shared_yaxes=True,
    print_grid=False,
    subplot_titles=("World Heatmap", "EastAsia Heatmap"),
)

# data_01 goes in the left cell and data_02 in the right one; both use the
# five year columns (positions 1-5) as z and the JOB column as row labels.
for col_no, table in enumerate((data_01, data_02), start=1):
    fig.add_trace(
        go.Heatmap(
            z=table.iloc[:, 1:6].to_numpy(),
            y=table['JOB'].to_numpy(),
            x=['2021','2020','2019','2018','2017'],
            hoverongaps=False,
            opacity=1.0,
            xgap=2.5,
            ygap=2.5,
        ),
        row=1, col=col_no,
    )

fig.show()

- To fix this problem, let's use the `coloraxis` parameter.
    + As you can see, both heatmaps now share a single color scale, so the numbers on the color bar are unified. 
    + Ref. https://plotly.com/python/reference/layout/coloraxis/

In [None]:
# Same 1x2 heatmap grid, but both traces point at the layout's "coloraxis"
# so a single colour scale and colour bar serve both panels.
fig = make_subplots(
    rows=1, cols=2,
    shared_yaxes=True,
    print_grid=False,
    subplot_titles=("World Heatmap", "EastAsia Heatmap"),
)

for col_no, table in enumerate((data_01, data_02), start=1):
    fig.add_trace(
        go.Heatmap(
            z=table.iloc[:, 1:6].to_numpy(),
            y=table['JOB'].to_numpy(),
            x=['2021','2020','2019','2018','2017'],
            hoverongaps=False,
            opacity=1.0,
            xgap=2.5,
            ygap=2.5,
            coloraxis="coloraxis",
        ),
        row=1, col=col_no,
    )

# Figure-level styling; the colour scale is configured once on the shared axis.
fig.update_layout(
    title='<b>World vs EastAsia</b>',
    title_font_size=22,
    margin=dict(t=120, l=100, r=10, b=150),
    height=550,
    width=1500,
    coloraxis=dict(showscale=True, colorscale="orrd"),
)

fig.show()