In [6]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.figure_factory as ff

import matplotlib as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Source CSV with the video game sales data, kept in a named constant so the
# location is easy to spot and change.
VGSALES_URL = 'https://raw.githubusercontent.com/obulygin/SkillFactory/main/vgsales.csv'

# Load the raw data and preview the first rows.
games_df = pd.read_csv(VGSALES_URL)
games_df.head()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,,


In [3]:
# Summarise the raw frame: per-column non-null counts and dtypes.
games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16719 entries, 0 to 16718
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             16717 non-null  object 
 1   Platform         16719 non-null  object 
 2   Year_of_Release  16450 non-null  float64
 3   Genre            16717 non-null  object 
 4   Publisher        16665 non-null  object 
 5   NA_Sales         16719 non-null  float64
 6   EU_Sales         16719 non-null  float64
 7   JP_Sales         16719 non-null  float64
 8   Other_Sales      16719 non-null  float64
 9   Global_Sales     16719 non-null  float64
 10  Critic_Score     8137 non-null   float64
 11  Critic_Count     8137 non-null   float64
 12  User_Score       10015 non-null  object 
 13  User_Count       7590 non-null   float64
 14  Developer        10096 non-null  object 
 15  Rating           9950 non-null   object 
dtypes: float64(9), object(7)
memory usage: 2.0+ MB


In [4]:
# Build the cleaned working frame from the raw one in a single chain.
# Every step returns a new object, so `games_df` is never mutated and no
# column is assigned onto a filtered slice (which would risk
# SettingWithCopyWarning).
data = (
    games_df
    # Keep only games whose release year is known.
    .loc[games_df['Year_of_Release'].notna()]
    .assign(
        # 'tbd' marks a score that is not available yet; treat it as missing.
        # The numeric score is multiplied by 10 -- presumably to put it on the
        # same scale as Critic_Score (confirm against the data dictionary).
        User_Score=lambda df: df['User_Score'].replace('tbd', np.nan).astype('float64') * 10,
        # Both count columns are already read as float64; the cast is kept
        # only as an explicit dtype guard.
        User_Count=lambda df: df['User_Count'].astype('float64'),
        Critic_Count=lambda df: df['Critic_Count'].astype('float64'),
        # Nullable integer dtype so years display as 2006 rather than 2006.0.
        Year_of_Release=lambda df: df['Year_of_Release'].astype('Int64'),
    )
    # Oldest releases first; rebuild a clean 0..n-1 index.
    .sort_values('Year_of_Release', ascending=True, ignore_index=True)
)
data.tail()

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
16445,Warhammer: The End Times - Vermintide,PS4,2016,Action,Games Workshop,0.01,0.02,0.0,0.01,0.04,,,,,,
16446,Phantasy Star Online 2 Episode 4: Deluxe Package,PS4,2017,Role-Playing,Sega,0.0,0.0,0.04,0.0,0.04,,,,,,
16447,Phantasy Star Online 2 Episode 4: Deluxe Package,PSV,2017,Role-Playing,Sega,0.0,0.0,0.01,0.0,0.01,,,,,,
16448,Brothers Conflict: Precious Baby,PSV,2017,Action,Idea Factory,0.0,0.0,0.01,0.0,0.01,,,,,,
16449,Imagine: Makeup Artist,DS,2020,Simulation,Ubisoft,0.27,0.0,0.0,0.02,0.29,,,,,Ubisoft,E


In [5]:
# Number of missing values in every column of the cleaned frame.
data.isna().sum()

Name                  2
Platform              0
Year_of_Release       0
Genre                 2
Publisher            32
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       8467
Critic_Count       8467
User_Score         8987
User_Count         8987
Developer          6543
Rating             6681
dtype: int64

In [7]:
# Per-column count of missing values, then the labels of the columns that
# have at least one gap.
null_data = data.isna().sum()
cols = null_data.loc[null_data.gt(0)].index

# 1 where a value is missing, 0 where it is present; rows keep the frame's
# current order.
nan_mask = data[cols].isna().astype('int')

fig = px.imshow(
    nan_mask,
    title='Heatmap of NaN',
    labels={'x': 'Columns', 'y': 'Rows sorted by year'},
)
fig.show()

In [8]:
# Fill the categorical gaps with each column's most frequent value, then drop
# the rows that have no game name.
data = (
    data
    .fillna({col: data[col].mode()[0] for col in ('Genre', 'Publisher')})
    .dropna(subset=['Name'])
)

In [None]:
# Histogram of critic scores with a marginal box plot above it.
fig = px.histogram(
    data,
    x='Critic_Score',
    nbins=100,
    marginal='box',  # type of the extra marginal plot (rug, box, violin)
    title='Distribution of critic score',
    height=300,
    width=500,
)

fig.show()

In [13]:
# Histogram built from User_Count (x) and Genre (y), one colour per genre.
fig = px.histogram(
    data_frame=data,
    x='User_Count',
    y='Genre',
    color='Genre',
    title='Distribution of user count by genre',
    height=500,
    width=700,
)
fig.show()

In [14]:
# Box plot of User_Score for each genre, one colour per genre.
# The title now names the plotted quantity (it previously read
# "Distribution of user by genre", omitting "score").
fig = px.box(
    data_frame=data,
    x='User_Score',
    y='Genre',
    color='Genre',
    title='Distribution of user score by genre',
    width=600,
    height=400
)

fig.show()

In [None]:
# A violin plot combines a histogram-like view of the distribution with a
# box plot. points='all' additionally draws every individual observation.
fig = px.violin(
    data_frame=data,
    x='User_Score',
    y='Genre',
    color='Genre',
    points='all',
    title='Distribution of user score by genre',
    height=700,
    width=700,
)

fig.show()

In [21]:
# Total global sales per release year, drawn as a line.
# Title typo fixed: "Dinamics" -> "Dynamics".
line_data = data.groupby('Year_of_Release', as_index=False)['Global_Sales'].sum()
fig = px.line(
    line_data,
    x='Year_of_Release',
    y='Global_Sales',
    title='Dynamics of video game sales',
    width=800,
    height=300
)

fig.show()

In [None]:
# Pie chart of genres: with only `names` given, each row counts once, so
# slice sizes reflect how many games belong to each genre.
fig = px.pie(
    data,
    names='Genre',
    hole=0.1,  # size of the hole in the centre (optional)
    title='Ratio of genres',
    width=700,
    height=500,
)

fig.show()

In [None]:
# Group by genre and sum Global_Sales to get total sales per genre;
# reset_index() turns the grouped Series back into a DataFrame with
# 'Genre' and 'Global_Sales' columns.
bar_data = data.groupby('Genre')['Global_Sales'].sum().reset_index()

# The title describes what is actually plotted (summed sales); the earlier
# 'Ratio of genres' text was carried over from the pie chart.
fig = px.bar(
    data_frame=bar_data,
    x='Genre',
    y='Global_Sales',
    color='Genre',
    height=500,
    width=700,
    title='Total global sales by genre'
)

fig.show()

Так как круговая диаграмма всегда отображает 100 %, при удалении из визуализации какого-либо элемента доли остальных пересчитываются, что приводит к некорректным наблюдениям; в столбчатой диаграмме такого не происходит.

Построим МНОГОУРОВНЕВУЮ (сгруппированную) столбчатую диаграмму: по оси X расположим годы релиза, по оси Y — сумму продаж, а внутри каждого года выведем отдельный столбец для каждой платформы, чтобы в итоге видеть за каждый год сумму продаж по платформам:

In [40]:
# Total global sales for every (platform, release year) pair, built as one
# chain so no column is assigned onto a filtered slice (which would risk
# SettingWithCopyWarning).
top_data = (
    data
    # Group by the two categorical keys and sum global sales.
    .groupby(['Platform', 'Year_of_Release'], as_index=False)['Global_Sales']
    .sum()
    # Shorter name for the aggregated column.
    .rename(columns={'Global_Sales': 'Sum'})
    # Keep only releases after 2000.
    .loc[lambda df: df['Year_of_Release'] > 2000]
    # Store the year as object so the x-axis treats it as discrete labels
    # rather than a continuous number.
    .assign(Year_of_Release=lambda df: df['Year_of_Release'].astype('object'))
)

# Grouped bars: one bar per platform within each year. The title fixes the
# "viedo" typo and says "sales", since the bars are sales sums, not game counts.
fig = px.bar(
    data_frame=top_data,
    x='Year_of_Release',
    y='Sum',
    color='Platform',
    barmode='group',
    title='Total video game sales by platform and year'
)
# Force a categorical axis with the years in ascending order.
fig.update_xaxes(type='category', categoryorder='category ascending')
fig.show()