# Import all necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Load data

In [None]:
# Read the video game sales CSV into a DataFrame.
# The path is relative to the notebook's working directory.
df = pd.read_csv('../input/videogamesales/vgsales.csv')

# Initial data check for data types and missing values

In [None]:
# Preview the first few rows to get a feel for the columns and their values
df.head()

In [None]:
# Number of rows and columns, returned as a (rows, columns) tuple
df.shape


In [None]:
# Check each column's data type and non-null count
df.info()

It looks like there are a few missing values in the Year and Publisher columns.

#  Percentage of missing Values

We can safely drop the rows with missing values, since only a couple of columns have missing values and they account for less than 2% of the rows.

In [None]:
# Missing entries per column, expressed as a percentage of the total row count
missing_values_perct = df.isna().sum().mul(100).div(len(df))
missing_values_perct

In [None]:
# Drop every row that contains at least one missing value.
# `df` is rebound to the result instead of using `inplace=True`, so the frame
# object that earlier cells displayed is not mutated behind the reader's back.
df = df.dropna()


In [None]:
# Confirm that no missing values remain: every per-column count should be 0
df.isna().sum()

Great! All missing values have been dropped.

# Data Visualizations 

# Top 100 Global Sales Publishers

In [None]:
# Pick the 100 best-selling games explicitly by Global_Sales rather than taking
# the first 100 rows, so the selection does not depend on the CSV being pre-sorted.
top100 = df.nlargest(100, 'Global_Sales')

plt.figure(figsize=(10,5))
# Keep the Axes in `ax`; chaining .set_title() onto the plot call would bind
# `ax` to the title's Text object instead.
ax = sns.swarmplot(x = 'Publisher', y = 'Global_Sales', data = top100, alpha = 0.8)
ax.set_title("Top 100 globally sold games")
# Trailing semicolon hides the tick-locations repr under the figure.
plt.xticks(rotation = 90);

Nintendo is the clear winner in global sales.

# Genres with the most games


In [None]:
plt.figure(figsize=(15,8))
sns.set_style('darkgrid')

# value_counts() sorts by frequency, so this orders the bars from the most to
# the least common genre.
genre_order = df['Genre'].value_counts().index

# Keep the Axes in `ax`; chaining .set_title() onto the plot call would bind
# `ax` to the title's Text object instead.
ax = sns.countplot(x = 'Genre', data = df, order = genre_order)
ax.set_title('Top Genre Games ')
# Trailing semicolon hides the tick-locations repr under the figure.
plt.xticks(rotation = 90);

The Action genre is by far the most loved and most released genre.

# Sales of each genre by region

In [None]:
# Total sales per genre for every column from NA_Sales through Other_Sales.
# numeric_only=True restricts the aggregation to numeric columns, so any
# non-numeric columns in the frame are not summed along the way (recent pandas
# versions no longer drop them automatically).
group_by_genre = df.groupby('Genre').sum(numeric_only=True).loc[:,'NA_Sales':'Other_Sales']

plt.figure(figsize=(15,10))
sns.set_style('darkgrid')

# Keep the Axes in `ax`; chaining .set_title() onto the plot call would bind
# `ax` to the title's Text object instead.
ax = sns.heatmap(group_by_genre, annot=True, fmt = '.1f')
# Trailing semicolon hides the Text repr under the figure.
ax.set_title('Comparision for each genre and region');


# Global sales by platform

In [None]:
plt.figure(figsize=(15,10))
sns.set_style('darkgrid')

# Each bar is seaborn's default aggregate (the mean) of Global_Sales over the
# top-100 games on that platform; ci=None turns off the error bars.
# Keep the Axes in `ax`; chaining .set_title() onto the plot call would bind
# `ax` to the title's Text object instead.
ax = sns.barplot(x = 'Platform', y = 'Global_Sales', data = top100, ci = None, palette = 'bright')
ax.set_title("Top 100 globally sold games on platform")
# Trailing semicolon hides the tick-locations repr under the figure.
plt.xticks(rotation = 90);



# North America vs. Europe sales

In [None]:
plt.figure(figsize=(15,10))
sns.set_style('darkgrid')

# One line per region, described as (sales column, line colour, legend label).
region_lines = [
    ('NA_Sales', 'red', 'North America'),
    ('EU_Sales', 'blue', 'Europe'),
]
for sales_col, line_color, region_label in region_lines:
    sns.lineplot(x = 'Year', y = sales_col, data = df, color = line_color, ci = None, label = region_label)

# Both lines share one y-axis, so label it generically.
plt.ylabel('Sales')
plt.title("Sales comparison North America VS Europe")






# Total sales by year (in millions)

In [None]:
# Sum Global_Sales per year; as_index=False keeps Year as a regular column.
yearly_sale = df.groupby('Year', as_index=False)['Global_Sales'].sum()
# Cast Year to int for the tick labels on the x-axis.
yearly_sale['Year'] = yearly_sale['Year'].astype(int)

plt.figure(figsize=(15,10))
sns.set_style('darkgrid')

sns.barplot(data = yearly_sale, x = 'Year', y = 'Global_Sales').set_title('Global total sale yearly')
plt.xticks(rotation =90)

It looks like video games were extremely popular between 2005 and 2010.