## A beginner's analysis of the Video Game Sales dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the video game sales CSV (path is relative to the notebook's working
# directory) into the DataFrame used by every later cell.
data_path = "../input/videogamesales/vgsales.csv"
df = pd.read_csv(data_path)

In [None]:
# Preview the first few rows to get a feel for the columns.
df.head()

In [None]:
# Show the data type pandas inferred for each column.
df.dtypes

In [None]:
# Summary statistics (count, mean, spread, quartiles) with default settings.
df.describe()

In [None]:
# Same summary, but covering every column rather than only the default subset.
df.describe(include='all')

In [None]:
# Print a concise overview of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Boolean mask with the same shape as df: True wherever a value is missing.
missing_data = df.isna()
missing_data.head()

In [None]:
# For each column, print its name, then how many values are missing (True)
# versus present (False), followed by a blank separator line.
for column in missing_data:
    print(column, missing_data[column].value_counts(), "", sep="\n")

In [None]:
# Look at the Year column on its own; the double brackets keep the result a
# one-column DataFrame rather than a Series.
df[['Year']]

In [None]:
# Drop every row that has a missing value in any column, modifying df in place.
df.dropna(inplace=True)

In [None]:
# Convert Year to integers and display the converted column.
# NOTE(review): this presumably relies on rows with a missing Year having been
# removed by the dropna() cell above — confirm before reordering cells.
df['Year'] = df['Year'].astype(int)
df['Year']

In [None]:
# Preview the first ten rows after cleaning.
df.head(10)

In [None]:
%matplotlib inline
import matplotlib as plt
from matplotlib import pyplot
import matplotlib.cm as cm


In [None]:
# Scatter plot of global sales against year.
# Each point gets its own random value in [0, 1), which the 'rainbow' colormap
# turns into a colour. The array is sized from the DataFrame itself so it always
# matches the number of plotted rows, instead of hard-coding the row count.
scatter_colors = np.random.random_sample(len(df))
pyplot.scatter(df['Year'], df['Global_Sales'], c=scatter_colors, cmap='rainbow')

In [None]:
# Keep only the publisher, year and global-sales columns, then preview 15 rows.
publishers = df[['Publisher','Year','Global_Sales']]
publishers.head(15)

In [None]:
# Total global sales per year, plotted with the default Series plot.
yers=df.groupby('Year')['Global_Sales'].sum()
yers.plot()

In [None]:
# Total the numeric columns for each publisher (one row per publisher, with
# Publisher kept as a regular column).
# numeric_only=True keeps newer pandas versions from also "summing" the text
# columns, which would concatenate their strings for every group.
publisher_sales = df.groupby(['Publisher'], as_index=False).sum(numeric_only=True)
publisher_sales

In [None]:
# Summary statistics of the per-publisher sales totals; these guide the choice
# of bin edges below.
publisher_sales['Global_Sales'].describe()

In [None]:
## learning binning
# Bin edges for the per-publisher totals: smallest total, mean total, a fixed
# cut-off, and the largest total.
# NOTE(review): 102.865446 is a hand-entered value whose origin is not shown in
# this notebook — confirm where it comes from (and that it stays above the mean).
bins = np.array([
    publisher_sales["Global_Sales"].min(),
    publisher_sales["Global_Sales"].mean(),
    102.865446,
    publisher_sales["Global_Sales"].max(),
])
bins

In [None]:
# Labels for the three sales bins, listed from the lowest interval to the highest.
group_names = ['average','better-sellers','best-sellers']

In [None]:
# Assign every publisher to one of the labelled bins; include_lowest=True makes
# the first interval closed on the left so the smallest total is not left out.
publisher_sales["Global_Sales_Binned"] = pd.cut(
    publisher_sales["Global_Sales"],
    bins=bins,
    labels=group_names,
    include_lowest=True,
)
publisher_sales[['Global_Sales', 'Global_Sales_Binned']].head(20)

In [None]:
# Count how many publishers fall into each bin.
publisher_sales["Global_Sales_Binned"].value_counts()

In [None]:
# Rows for the publishers that landed in the 'best-sellers' bin.
publisher_sales.loc[publisher_sales["Global_Sales_Binned"]=='best-sellers']

In [None]:
# Rows for the publishers that landed in the 'better-sellers' bin.
publisher_sales.loc[publisher_sales["Global_Sales_Binned"]=='better-sellers']

In [None]:
# Keep the 'best-sellers' rows in their own frame (reused by the horizontal bar
# chart further down) and list the publisher names.
best=publisher_sales.loc[publisher_sales["Global_Sales_Binned"]=='best-sellers']
best['Publisher']

In [None]:
%matplotlib inline
import matplotlib as plt
from matplotlib import pyplot
# Bar chart of how many publishers fall into each sales bin.
# value_counts() orders its result by frequency, not by bin, so the counts are
# re-ordered to follow group_names; otherwise a bar could be drawn under the
# wrong label whenever the frequencies are not already in that order.
bin_counts = publisher_sales["Global_Sales_Binned"].value_counts().reindex(group_names)
pyplot.bar(group_names, bin_counts.to_numpy())

# set x/y labels and plot title
pyplot.xlabel("Global Sales")
pyplot.ylabel("count")
pyplot.title("Global")

In [None]:
# Horizontal bar chart of total global sales for the best-selling publishers.
# Newer matplotlib releases ship the seaborn look as "seaborn-v0_8" and no
# longer accept the bare "seaborn" name, so use whichever one is installed.
_seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in pyplot.style.available else "seaborn"
pyplot.style.use(_seaborn_style)
pyplot.barh(best['Publisher'], best['Global_Sales'])

# set x label and plot title (sales run along the x-axis in a horizontal chart)
pyplot.xlabel("Global Sales in millions")
pyplot.title("best selling publishers by Global video game sales (1980-2020)")

### Finding the best-selling publisher for each year since 1980

In [None]:

# One row per game, reduced to its publisher, year and global sales; this is
# the input for the per-year publisher ranking below.
pubgrp= df[["Publisher","Year","Global_Sales"]]
pubgrp

In [None]:
# Build a Series holding, for each year, the total global sales of that year's
# highest-selling publisher.
# Step 1: total sales per (Year, Publisher). The aggregation is named by the
# string 'sum' because passing np.sum to agg() is deprecated in recent pandas.
mydf = pubgrp.groupby(['Year', 'Publisher']).agg({'Global_Sales': 'sum'})
# Step 2: keep the single largest total within each year. nlargest() prepends
# the grouping key to the index, which duplicates the Year level; dropping
# level 1 leaves a (Year, Publisher) index.
mydf = mydf.groupby('Year')['Global_Sales'].nlargest(1).reset_index(level=1, drop=True)

In [None]:
# Display the highest-selling publisher and its total global sales for each year.
mydf

In [None]:
# Pull the year out of every (year, publisher) index entry.
years = np.array([year for year, _ in mydf.index])
years

In [None]:
# Sales totals of the top publisher, looked up year by year, as a NumPy array.
sales = mydf.loc[years].to_numpy()
sales

In [None]:
# Pull the publisher name out of every (year, publisher) index entry.
pubs = np.array([publisher for _, publisher in mydf.index])
pubs

In [None]:
# Inspect the remaining index (the publisher level) for the first year's entry.
mydf.loc[years[0]].index

In [None]:



# Bar chart: for every year, the global sales of that year's top publisher.
# The style is selected before the figure is created so it applies to this
# figure. Newer matplotlib releases ship the seaborn look as "seaborn-v0_8" and
# no longer accept the bare "seaborn" name, so use whichever one is installed.
_seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in pyplot.style.available else "seaborn"
pyplot.style.use(_seaborn_style)

fig, ax = pyplot.subplots(figsize=(20, 10))
bars = ax.bar(years, mydf.loc[years].values)
ax.bar_label(bars, fmt='%.2f')  # print each bar's value above it

# One tick per year so every bar is labelled.
ax.set_xticks(years)

# Years run along the x-axis and sales along the y-axis.
ax.set_xlabel("Year")
ax.set_ylabel("Global Sales in millions")
ax.set_title("best selling publishers by Global video game sales (1980-2020)")

In [None]:
# Same data as a horizontal bar chart drawn through pandas; use_index=True
# labels each bar with its (year, publisher) index entry.
mydf.plot(
    kind='barh',
    figsize=(15, 12),
    use_index=True,
    title='Top publisher for 1980-2020',
    fontsize=14,
    colormap='jet',
)