In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
# Read the video game sales CSV into the notebook-wide frame `df`
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/videogamesales/vgsales.csv')
df.head(5)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Print each column's name, non-null count and dtype to spot missing values.
df.info()

In [None]:
# Number of distinct values in the Platform column.
len(df['Platform'].unique())

In [None]:
# Rows whose Year is 1980 or earlier.
df.loc[df['Year'] <= 1980]

In [None]:
# Summed Global_Sales for every Platform (rows) x Genre (columns) pair.
# The trailing dropna() removes any platform row that has a missing value
# in at least one genre column.
(
    df.pivot_table(
        values='Global_Sales',
        index='Platform',
        columns='Genre',
        aggfunc='sum',
    )
    .dropna()
)

In [None]:
# Summed Global_Sales per Publisher, largest total first.
(
    df.pivot_table(
        values='Global_Sales',
        index='Publisher',
        aggfunc='sum',
    )
    .sort_values('Global_Sales', ascending=False)
)

In [None]:
# Five platforms with the highest summed Global_Sales.
(
    df.pivot_table(
        values='Global_Sales',
        index=['Platform'],
        aggfunc='sum',
    )
    .sort_values('Global_Sales', ascending=False)
    .head(5)
)

In [None]:
# Summed Global_Sales per Year, most recent year first.
# Author's observation: global sales drop after 2010, with little to no
# data after 2016.
(
    df.pivot_table(
        values='Global_Sales',
        index=['Year'],
        aggfunc='sum',
    )
    .sort_index(ascending=False)
)

In [None]:
# Number of distinct game names among rows with Year == 2015.
len(df.loc[df['Year'] == 2015, 'Name'].unique())

In [None]:
# Number of distinct game names among rows with Year == 2010.
# Author's observation: the number of reported games dropped off quickly
# after peaking in 2009-2010.
len(df.loc[df['Year'] == 2010, 'Name'].unique())

In [None]:
# Keep only rows with Year <= 2010. The comparison is False for a missing
# Year, so any rows without a year are dropped here as well.
# NOTE: this rebinds `df`; every later cell sees the filtered frame.
df = df.loc[df['Year'] <= 2010]

In [None]:
# Same per-platform totals as before, recomputed on the filtered frame
# with groupby instead of pivot_table; shows the top five.
(
    df.groupby('Platform')[['Global_Sales']]
    .sum()
    .sort_values(by='Global_Sales', ascending=False)
    .head(5)
)

In [None]:
# Five (Platform, Genre) combinations with the highest summed Global_Sales.
(
    df.groupby(['Platform', 'Genre'])[['Global_Sales']]
    .sum()
    .sort_values(by='Global_Sales', ascending=False)
    .head(5)
)

In [None]:
# Five years with the highest summed Global_Sales.
# Author's observation: the best years for sales are 2006-2010.
(
    df.groupby('Year')[['Global_Sales']]
    .sum()
    .sort_values(by='Global_Sales', ascending=False)
    .head(5)
)

In [None]:
# Summed Global_Sales per game Name (rows sharing a Name are added
# together), then the five best-selling names.
df_games = df.groupby('Name')[['Global_Sales']].sum()
df_games.sort_values(by='Global_Sales', ascending=False).head(5)

In [None]:
# Summed Global_Sales per Year, stored in `df_year`, then the five
# highest-selling years. Note this sums sales; it does not count games
# and adds no column to `df`.
df_year = df.groupby('Year')[['Global_Sales']].sum()
df_year.sort_values(by='Global_Sales', ascending=False).head(5)

In [None]:
# Joint distribution of Global_Sales (x axis) and Year (y axis),
# with the marginal distribution of each drawn along its axis.
sns.jointplot(data=df, x='Global_Sales', y='Year')

In [None]:
# Histogram of rows per Year in 30 bins.
# sns.distplot is deprecated and scheduled for removal; sns.histplot is its
# replacement and draws a plain histogram (no KDE) by default, matching
# the previous kde=False call.
sns.histplot(df['Year'], bins=30, color='red')

In [None]:
# Pairwise correlation between the numeric columns, drawn as a heatmap.
# Non-numeric columns are excluded explicitly: since pandas 2.0
# DataFrame.corr() no longer drops them silently and raises on string
# columns instead.
# Author's observation: the only correlations are between the regional
# sales columns and global sales.
dfc = df.select_dtypes(include='number').corr()
sns.heatmap(dfc, cmap='coolwarm')

In [None]:
# Switch matplotlib to the 'ggplot' style (this also affects every plot
# drawn afterwards), then draw a semi-transparent area plot of the frame
# against its row index.
plt.style.use('ggplot')
df.plot(kind='area', alpha=0.4)
plt.show()

In [None]:
# Scatter of Global_Sales (x) against Year (y).
# A colormap only takes effect when `c` supplies values to colour by; the
# original call passed cmap alone, so it was ignored. Colour the points by
# Global_Sales so the 'seismic' colormap is actually applied.
df.plot.scatter(x='Global_Sales', y='Year', c='Global_Sales', cmap='seismic')
plt.show()

In [None]:
# Histogram of the Year column using pandas' default of 10 bins.
df['Year'].hist(bins=10)

In [None]:
# Number of distinct game names per Year, as a Series indexed by Year.
games_per_year = df.groupby('Year')['Name'].agg('nunique')
games_per_year

In [None]:
# Publisher vs. Year scatter, with marker colour and size showing how many
# distinct games were released in that point's year.
#
# games_per_year is indexed by Year, whereas df's columns are indexed by row
# label. seaborn aligns pandas vectors on their index, so passing
# games_per_year directly would pair counts with unrelated rows (and leave
# most rows without a hue/size). Map the per-year count onto each row first
# so every vector shares df's index.
games_per_row = df['Year'].map(games_per_year)
ax = sns.scatterplot(x=df['Year'], y=df['Publisher'],
                     hue=games_per_row, size=games_per_row,
                     sizes=(20, 400), legend=False)