# Importing Data and Required Library

In [1]:
import pandas as pd #pandas for working with dataframes
data = pd.read_csv('data.csv')  # load the sales dataset; 'data.csv' is resolved relative to the notebook's working directory

# Importing Bokeh Library

In [2]:
from bokeh.plotting import figure, show, output_file

# Data Cleaning

In [3]:
data.head()  # preview the first five rows to see the available columns and sample values

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [4]:
data.info()  # print each column's dtype and non-null count, plus the total number of entries

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
Rank            16598 non-null int64
Name            16598 non-null object
Platform        16598 non-null object
Year            16327 non-null float64
Genre           16598 non-null object
Publisher       16540 non-null object
NA_Sales        16598 non-null float64
EU_Sales        16598 non-null float64
JP_Sales        16598 non-null float64
Other_Sales     16598 non-null float64
Global_Sales    16598 non-null float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB


# Checking number of NULL values

In [5]:
data.isna().sum()  # per-column count of missing values

Rank              0
Name              0
Platform          0
Year            271
Genre             0
Publisher        58
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64

# Dropping rows with NULL values in the Year and Publisher columns.

In [6]:
# Rows lacking a Year or a Publisher are discarded: neither value can be sensibly imputed.
data = data.dropna(subset=['Year', 'Publisher'])

In [7]:
data.isna().sum()  # re-check: every column should now report zero missing values

Rank            0
Name            0
Platform        0
Year            0
Genre           0
Publisher       0
NA_Sales        0
EU_Sales        0
JP_Sales        0
Other_Sales     0
Global_Sales    0
dtype: int64

In [8]:
# Year was parsed as float64 because the column contained missing values when it was read;
# those rows have been dropped above, so a vectorised cast to int64 is safe here
# (astype avoids calling Python's int() once per row as .apply(int) does).
data['Year'] = data['Year'].astype('int64')

In [9]:
data.head()  # preview again to confirm Year is now shown as an integer

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


# Grouping on Year and Global_Sales

In [10]:
# Count the non-null entries of every remaining column for each (Year, Global_Sales) pair.
# groupby(...).count() already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
new = data.groupby(['Year', 'Global_Sales']).count()

In [11]:
new.head(10)  # show only the first rows instead of dumping the whole grouped frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Rank,Name,Platform,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales
Year,Global_Sales,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1980,0.24,1,1,1,1,1,1,1,1,1
1980,0.27,1,1,1,1,1,1,1,1,1
1980,0.34,1,1,1,1,1,1,1,1,1
1980,0.49,1,1,1,1,1,1,1,1,1
1980,0.77,1,1,1,1,1,1,1,1,1
1980,1.05,1,1,1,1,1,1,1,1,1
1980,1.15,1,1,1,1,1,1,1,1,1
1980,2.76,1,1,1,1,1,1,1,1,1
1980,4.31,1,1,1,1,1,1,1,1,1
1981,0.13,1,1,1,1,1,1,1,1,1


In [12]:
# Total Global_Sales per Year: a Series indexed by Year.
mno = data['Global_Sales'].groupby(data['Year']).sum()
mno  # display the yearly totals

Year
1980     11.38
1981     35.77
1982     28.86
1983     16.79
1984     50.36
1985     53.94
1986     37.07
1987     21.74
1988     47.22
1989     73.45
1990     49.39
1991     32.23
1992     76.16
1993     45.98
1994     79.17
1995     88.11
1996    199.15
1997    200.98
1998    256.47
1999    251.27
2000    201.56
2001    331.47
2002    395.52
2003    357.85
2004    414.01
2005    458.51
2006    521.04
2007    609.92
2008    678.90
2009    667.30
2010    600.29
2011    515.80
2012    363.49
2013    368.11
2014    337.03
2015    264.44
2016     70.90
2017      0.05
2020      0.29
Name: Global_Sales, dtype: float64

In [13]:
# Yearly totals for 2010 through 2015 (label-based slice on the Year index, both ends inclusive).
# Selecting by year label instead of by row position 30..35 keeps the selection correct
# even if some earlier year is absent from the data.
ext = mno.loc[2010:2015].tolist()

# Plotting Using Bokeh

In [14]:
# Render a horizontal bar chart of the extracted yearly totals into a standalone HTML file.
output_file('hbar.html')
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# deprecated in Bokeh 2.4 and removed in Bokeh 3.0.
p = figure(width=1000, height=500, title="Global_Sales by Year")
# One bar per year; bar length (`right`) is that year's total taken from `ext`.
p.hbar(y=[2010, 2011, 2012, 2013, 2014, 2015], height=0.8, left=0,
       right=ext, color="#02b8ab")
# Label both axes so the figure can be read on its own.
p.xaxis.axis_label = 'Global_Sales'
p.yaxis.axis_label = 'Year'
# Show faint minor grid lines on the y-grid only.
p.ygrid.minor_grid_line_color = 'navy'
p.ygrid.minor_grid_line_alpha = 0.1
show(p)

# The code above will generate an HTML file (`hbar.html`) containing the plot.