# Importing Data and Required Library

In [160]:
import pandas as pd #pandas for working with dataframes
import numpy as np #numpy for arrays
data = pd.read_csv('data.csv') #reading the data given
data.head() #checking head of data

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


# Importing Bokeh Library

In [161]:
from bokeh.plotting import figure, show, output_file

# Data Cleaning

In [162]:
data.info() #checking basic info of data regarding the features and number of entries

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
Rank            16598 non-null int64
Name            16598 non-null object
Platform        16598 non-null object
Year            16327 non-null float64
Genre           16598 non-null object
Publisher       16540 non-null object
NA_Sales        16598 non-null float64
EU_Sales        16598 non-null float64
JP_Sales        16598 non-null float64
Other_Sales     16598 non-null float64
Global_Sales    16598 non-null float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB


# Checking number of NULL values

In [163]:
data.isnull().sum() #it will show missing values

Rank              0
Name              0
Platform          0
Year            271
Genre             0
Publisher        58
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64

# Droping NULL values of Year and Publisher column.

In [164]:
data = data.dropna(axis=0, subset=['Year','Publisher']) #we cannot impute missing year value and publisher value

In [165]:
data.isnull().sum() #now there is no missing value in our data

Rank            0
Name            0
Platform        0
Year            0
Genre           0
Publisher       0
NA_Sales        0
EU_Sales        0
JP_Sales        0
Other_Sales     0
Global_Sales    0
dtype: int64

In [166]:
data['Year'] = data['Year'].apply(int)  #Converting all year values to integer values
data.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


# Preparing the Data

In [167]:
df = data[(data['Platform'] == 'PC') | (data['Platform'] == 'Wii') | (data['Platform'] == 'X360')]
df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
2,3,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
7,8,Wii Play,Wii,2006,Misc,Nintendo,14.03,9.2,2.93,2.85,29.02
8,9,New Super Mario Bros. Wii,Wii,2009,Platform,Nintendo,14.59,7.06,4.7,2.26,28.62


In [168]:
df1 = df[['Platform','EU_Sales','JP_Sales']].reset_index()[['Platform','EU_Sales','JP_Sales']]
df1 = df1.sort_values('Platform')
df1.head()

Unnamed: 0,Platform,EU_Sales,JP_Sales
3461,PC,0.0,0.0
2700,PC,0.03,0.0
2699,PC,0.04,0.0
2693,PC,0.04,0.0
2689,PC,0.04,0.0


In [169]:
plat = list(df['Platform'])
EU = list(df['EU_Sales'])
JP = list(df['JP_Sales'])

In [170]:
color = []

In [171]:
df1.count()

Platform    3462
EU_Sales    3462
JP_Sales    3462
dtype: int64

In [172]:
for _ in range(938):
    color.append("#008000")

In [173]:
for _ in range(1290):
    color.append("#FFA500")

In [174]:
for _ in range(1234):
    color.append("#0000FF")

In [175]:
len(color)

3462

In [None]:
#008000
#FFA500
#0000FF

# Plotting using Bokeh

In [199]:
import pandas as pd
import bokeh.plotting as bpl
import bokeh.models as bmo
from bokeh.palettes import d3
bpl.output_notebook()
#colors = ["#%02x%02x%02x" % (int(r), int(g), 150) for r, g in zip(50+2*x, 30+2*y)] #colors
N = 2500 #Global_Sales size
radii = np.random.random(size=N)
x = np.array(EU)
y = np.array(JP)
df = pd.DataFrame(
    {
        
        "cat": plat,
        "kpi1": EU,
        "kpi2": JP
    }
)
source = bpl.ColumnDataSource(df)


# use whatever palette you want...
palette = d3['Category10'][len(df['cat'].unique())]
color_map = bmo.CategoricalColorMapper(factors=df['cat'].unique(),
                                   palette=palette)
catt = df['cat']
TOOLS="hover,crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,tap,save,box_select,poly_select,lasso_select,"

# create figure and plot
p = figure(width=1000, height=600, tools=TOOLS,title="EU_Sales, JP_Sales Scatter Plot")
p.scatter(x, y, radius=radii, fill_color=color, fill_alpha=0.6, line_color=None,color={'field': catt, 'transform': color_map})
p.circle( x=0, y=0, radius=0, color='#008000', legend='PC')
p.circle( x=0, y=0, radius=0, color='#FFA500', legend='Wii')
p.circle( x=0, y=0, radius=0, color='#0000FF', legend='X360')

p.xaxis.axis_label = 'JP_Sales'
p.yaxis.axis_label = 'EU_Sales'
show(p)
output_file("Scatter_Plot.html") #file name



E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: cat [renderer: GlyphRenderer(id='ef1c08da-cafa-49d1-8b65-92504e503ed5', ...)]
E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: cat [renderer: GlyphRenderer(id='cf0fae13-4155-4ddf-88f2-a586ff677334', ...)]
E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: cat [renderer: GlyphRenderer(id='a9f9a5a0-c4a1-4d84-8c44-107e4a6dd390', ...)]
E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: cat [renderer: GlyphRenderer(id='8873052c-b1c4-44fa-992b-7fcda0151e11', ...)]
E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: cat [renderer: GlyphRenderer(id='4af1c012-3661-4bfa-9115-45d3c2a40c9e', ...)]
E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: cat [renderer: GlyphRenderer(id='9b1526af-ef6c-44bc-9a38-b6d8304966e2', ...)]
E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: cat [renderer: GlyphRenderer(id='f86031d5-ec53-4500-8a93-4c8cb9

# The code above will generate a html file containing the plot.