# **Data Analysis On Video Game Sales And Ratings**
Dataset from: https://www.kaggle.com/datasets/rush4ratio/video-game-sales-with-ratings

## **0. Imports**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## **1. Exploratory Data Analysis**

### **1.1 Overview**

In [None]:
# Load the Kaggle export (expected in the notebook's working directory) and
# report its basic dimensions.
video_games_df = pd.read_csv('Video_Games_Sales_as_at_22_Dec_2016.csv')

n_rows, n_cols = video_games_df.shape
print('Number of data points = {}'.format(n_rows))
print('Shape: {}'.format((n_rows, n_cols)))
print('Columns: {}'.format(video_games_df.columns.tolist()))

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
video_games_df.info()

# Preview the first rows; as the last expression it gets the rich table display.
video_games_df.head()

### **1.2 Missing Values**

In [None]:
def display_missing(df):
    """Print a missing-value report for every column of ``df``.

    For each column that contains at least one null, one line with the column
    name, the number of nulls and their share of all rows is printed. If no
    column has nulls, a single "no missing values" line is printed instead.
    The report always ends with a blank-line separator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    total_rows = df.shape[0]
    columns_with_gaps = 0

    for col in df.columns.tolist():
        # Count the nulls once per column and reuse the result.
        num_missing = df[col].isnull().sum()
        if not num_missing > 0:
            continue
        percent_missing = (num_missing / total_rows) * 100
        print('{} column missing values: {} ({:.2f}%)'.format(col, num_missing, percent_missing))
        columns_with_gaps += 1

    if columns_with_gaps == 0:
        print('No missing values in the dataset')
    print('\n')

# Report which columns of the loaded dataset contain missing values.
display_missing(video_games_df)

In [None]:
# Discard rows that lack any of these descriptive fields, then re-check what is
# still missing. The drop is done in place so the frame object stays the same.
required_columns = ['Name', 'Genre', 'Publisher', 'Year_of_Release']
video_games_df.dropna(subset=required_columns, inplace=True)
display_missing(video_games_df)

### **1.3 Unique Values**

In [None]:
# Compare the number of rows with the number of distinct values per column.
row_count = len(video_games_df)
print('Number of data points = {}'.format(row_count))
print('Number of unique values: ')
video_games_df.nunique()

### **1.4 Data Types**

In [None]:
# List the dtype pandas assigned to each column.
video_games_df.dtypes

In [None]:
# Tally the distinct User_Score values (the following cell treats the string
# 'tbd' in this column as missing).
video_games_df['User_Score'].value_counts()

In [None]:
# Rows without a Year_of_Release were dropped earlier, so the column can be
# stored as an integer.
video_games_df['Year_of_Release'] = video_games_df['Year_of_Release'].astype(np.int64)

# 'tbd' is a placeholder, not a score: turn it into NaN and make the column
# numeric. Series.replace matches whole values only and, unlike the .str
# accessor, also works when the column is already float, so this cell can be
# re-run safely.
video_games_df['User_Score'] = (
    video_games_df['User_Score']
    .replace('tbd', np.nan)
    .astype(np.float64)
)

print(video_games_df.dtypes)

video_games_df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
video_games_df.describe()

### **1.5 Ratings**

In [None]:
# Count how many rows carry each Rating value (value_counts skips NaN by default).
video_games_df['Rating'].value_counts()

In [None]:
# Show the rows whose Rating is 'RP'.
video_games_df.loc[video_games_df.Rating == 'RP']

In [None]:
# Manually set the rating of the row with index label 14272 to 'E10+'.
# NOTE(review): presumably this is the 'RP' row shown by the previous cell -
# confirm against the dataset version in use.
rp_row_label = 14272

# Assigning through .loc with a label that is not in the index would silently
# append a new, otherwise empty row, so fail loudly instead.
if rp_row_label not in video_games_df.index:
    raise KeyError('Row label {} not found in video_games_df'.format(rp_row_label))

video_games_df.loc[rp_row_label, 'Rating'] = 'E10+'

In [None]:
# Ordinal encoding of the four remaining rating labels. The codes are kept as
# strings because inv_rating_order (code -> label) is used with string keys
# when the labels are restored for plotting.
rating_order = {'E':'1.0', 'E10+':'2.0', 'T':'3.0', 'M':'4.0'}
inv_rating_order = {v: k for k, v in rating_order.items()}

# Only encode while the column still holds labels: mapping an already encoded
# (float) column would turn every rating into NaN, and the original .str
# accessor would raise on it, so this guard makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(video_games_df['Rating']):
    # Merge the rare labels into the category they are grouped with. A dict
    # passed to Series.replace matches whole values, not substrings.
    video_games_df['Rating'] = video_games_df['Rating'].replace(
        {'EC': 'E', 'K-A': 'E', 'AO': 'M'}
    )
    print('Unique Ratings: {}'.format(video_games_df['Rating'].unique()))

    # Labels that are not in rating_order (including NaN) become NaN.
    video_games_df['Rating'] = video_games_df['Rating'].map(rating_order).astype(np.float64)

print('Unique Ratings: {}'.format(video_games_df['Rating'].unique()))

# 

### **1.6 Correlations**

In [None]:
sns.set_theme(style="darkgrid")
fig, axs = plt.subplots(nrows=1, figsize=(9, 9))

# Correlate numeric columns only: from pandas 2.0 on, DataFrame.corr() raises
# when the frame still contains string columns. select_dtypes keeps this
# working on older pandas as well. The matrix is computed once and reused.
c = video_games_df.select_dtypes(include='number').corr()

sns.heatmap(c, annot=True, square=True, cmap='coolwarm', ax=axs)
plt.show()

## **2. Further Analysis**

### **2.1 Sales**

In [None]:
# Regional and global sales columns, summarised together.
sales_features = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']

video_games_df.loc[:, sales_features].describe()

The mean of global sales lies between the 75th percentile and the maximum, which shows that the distribution is strongly right-skewed: a small number of very high-selling titles pull the average well above the median.

In [None]:
# Histogram of global sales; the y axis is capped at 1400 so the bars stay readable.
fig, ax = plt.subplots(figsize=(15, 5))
sns.histplot(data=video_games_df, x='Global_Sales', ax=ax)
ax.set_title('Distribution of Global Game Sales')
ax.set_ylim(bottom=0, top=1400)
plt.show()

In [None]:
# Zoom in on the bulk of the distribution: titles with Global_Sales below 3.
video_games_less_than_3 = video_games_df[video_games_df['Global_Sales'] < 3]

plt.figure(figsize=(15,5))
# NOTE(review): Global_Sales is understood to be in millions of units sold,
# not dollars - confirm against the dataset description. The title was
# corrected from "$3 million" accordingly.
plt.title('Distribution of Global Game Sales (Less Than 3 Million Units)')
sns.histplot(x='Global_Sales', data=video_games_less_than_3)
plt.xlim(xmin=0,xmax=3)
plt.show()

### **2.2 Ratings**

In [None]:
# Critic and user score columns with their review counts, summarised together.
ratings_features = ['Critic_Score', 'Critic_Count', 'User_Score', 'User_Count']

video_games_df.loc[:, ratings_features].describe()

In [None]:
# Histogram of user scores.
fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(data=video_games_df, x='User_Score', ax=ax)
ax.set_title('Distribution of User Scores')
plt.show()

In [None]:
# Histogram of critic scores.
fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(data=video_games_df, x='Critic_Score', ax=ax)
ax.set_title('Distribution of Critic Scores')
plt.show()

### **2.3 Genre**

In [None]:
plt.figure(figsize=(12,5))
plt.title('Number of Games In Each Genre')

# Sort the bars from the most to the least common genre.
genre_order = video_games_df['Genre'].value_counts().index
ax = sns.countplot(y="Genre", data=video_games_df, order=genre_order)

# With y="Genre" the bars are horizontal, so the counts are on the x axis.
# Labelling the y axis 'Number of Games' (as before) overwrote the genre label.
ax.set_xlabel('Number of Games')
plt.show()

#### **Genre and Sales**

In [None]:
plt.figure(figsize=(12,5))
plt.title("Total Global Sales of Video Games by Genre")

# numeric_only=True sums numeric columns only. From pandas 2.0 on the default
# no longer drops string columns, so without it sum() concatenates their text.
genre_sum_df = video_games_df.groupby('Genre', as_index=False).sum(numeric_only=True)

# Draw the bars from the highest to the lowest total.
sns.barplot(data=genre_sum_df, x='Global_Sales', y='Genre', order=genre_sum_df.sort_values('Global_Sales', ascending=False).Genre)
plt.show()

In [None]:
# One horizontal box per genre showing the spread of global sales.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=video_games_df, x='Global_Sales', y='Genre', ax=ax)
ax.set_title("Distribution of Global Sales of Video Games by Genre")
plt.show()

In [None]:
plt.figure(figsize=(12,5))
plt.title("Average Global Sales of Video Games by Genre")

# numeric_only=True averages numeric columns only. From pandas 2.0 on, mean()
# raises a TypeError when string columns are left in the groups.
genre_mean_df = video_games_df.groupby('Genre', as_index=False).mean(numeric_only=True)

# Draw the bars from the highest to the lowest average.
sns.barplot(data=genre_mean_df, x='Global_Sales', y='Genre', order=genre_mean_df.sort_values('Global_Sales', ascending=False).Genre)
plt.show()

### **2.4 ESRB Rating**

In [None]:
# Turn the numeric rating codes back into labels for plotting. inv_rating_order
# is keyed by the string form of the code (e.g. '1.0'), hence astype(str).
# The guard makes the cell safe to re-run: decoding an already decoded column
# would map every label to NaN.
if pd.api.types.is_numeric_dtype(video_games_df['Rating']):
    video_games_df['Rating'] = video_games_df['Rating'].astype(str).map(inv_rating_order)

plt.figure(figsize=(12,4))
plt.title('Number of Games With Each ESRB Rating')
ax = sns.countplot(y="Rating", data=video_games_df, order=video_games_df['Rating'].value_counts().index)

# With y="Rating" the bars are horizontal, so the counts are on the x axis.
ax.set_xlabel('Number of Games')
plt.show()

#### **Ratings and Sales**

In [None]:
plt.figure(figsize=(12,4))
plt.title("Total Global Sales of Video Games by Rating")

# numeric_only=True sums numeric columns only. From pandas 2.0 on the default
# no longer drops string columns, so without it sum() concatenates their text.
rating_sum_df = video_games_df.groupby('Rating', as_index=False).sum(numeric_only=True)

# Draw the bars from the highest to the lowest total.
sns.barplot(data=rating_sum_df, x='Global_Sales', y='Rating', order=rating_sum_df.sort_values('Global_Sales', ascending=False).Rating)
plt.show()

In [None]:
# One horizontal box per rating showing the spread of global sales.
fig, ax = plt.subplots(figsize=(12, 4))
sns.boxplot(data=video_games_df, x='Global_Sales', y='Rating', ax=ax)
ax.set_title("Distribution of Global Sales of Video Games by Rating")
plt.show()

In [None]:
plt.figure(figsize=(12,4))
plt.title("Average Global Sales of Video Games by Rating")

# numeric_only=True averages numeric columns only. From pandas 2.0 on, mean()
# raises a TypeError when string columns are left in the groups.
rating_mean_df = video_games_df.groupby('Rating', as_index=False).mean(numeric_only=True)

# Draw the bars from the highest to the lowest average.
sns.barplot(data=rating_mean_df, x='Global_Sales', y='Rating', order=rating_mean_df.sort_values('Global_Sales', ascending=False).Rating)
plt.show()

### **2.5 Platform**

In [None]:
plt.figure(figsize=(12,6))
plt.title('Number of Games on Each Platform')

# Sort the bars from the platform with the most games to the one with the fewest.
platform_order = video_games_df['Platform'].value_counts().index
ax = sns.countplot(y="Platform", data=video_games_df, order=platform_order)

# With y="Platform" the bars are horizontal, so the counts are on the x axis.
# Labelling the y axis 'Number of Games' (as before) overwrote the platform label.
ax.set_xlabel('Number of Games')
plt.show()

#### **Platform and Sales**

In [None]:
plt.figure(figsize=(12,6))
plt.title("Total Global Sales of Video Games by Platform")

# numeric_only=True sums numeric columns only. From pandas 2.0 on the default
# no longer drops string columns, so without it sum() concatenates their text.
platform_sum_df = video_games_df.groupby('Platform', as_index=False).sum(numeric_only=True)

# Draw the bars from the highest to the lowest total.
sns.barplot(data=platform_sum_df, x='Global_Sales', y='Platform', order=platform_sum_df.sort_values('Global_Sales', ascending=False).Platform)
plt.show()

In [None]:
# One vertical box per platform showing the spread of global sales.
fig, ax = plt.subplots(figsize=(14, 6))
sns.boxplot(data=video_games_df, x='Platform', y='Global_Sales', ax=ax)
ax.set_title("Distribution of Global Sales of Video Games by Platform")
plt.show()

In [None]:
plt.figure(figsize=(12,6))
plt.title("Average Global Sales of Video Games by Platform")

# numeric_only=True averages numeric columns only. From pandas 2.0 on, mean()
# raises a TypeError when string columns are left in the groups.
platform_mean_df = video_games_df.groupby('Platform', as_index=False).mean(numeric_only=True)

# Draw the bars from the highest to the lowest average.
sns.barplot(data=platform_mean_df, x='Global_Sales', y='Platform', order=platform_mean_df.sort_values('Global_Sales', ascending=False).Platform)
plt.show()

#### **Multiplatform Games**

In [None]:
# A title counts as multiplatform when its Name occurs on more than one row;
# keep=False flags every occurrence, not only the repeats.
is_repeated_name = video_games_df['Name'].duplicated(keep=False)
multiplatform_titles = video_games_df[is_repeated_name]

# Number of rows per repeated title.
multiplatform_titles['Name'].value_counts()

In [None]:
# Show all rows named 'Need for Speed: Most Wanted' as an example of a repeated title.
video_games_df.loc[video_games_df.Name == 'Need for Speed: Most Wanted']

In [None]:
plt.figure(figsize=(12,6))
plt.title('Number of Multiplatform Games on Each Platform')

# Sort the bars from the platform with the most multiplatform games to the fewest.
multiplatform_order = multiplatform_titles['Platform'].value_counts().index
ax = sns.countplot(y="Platform", data=multiplatform_titles, order=multiplatform_order)

# With y="Platform" the bars are horizontal, so the counts are on the x axis.
# Labelling the y axis 'Number of Games' (as before) overwrote the platform label.
ax.set_xlabel('Number of Games')
plt.show()

In [None]:
plt.figure(figsize=(12,6))
plt.title("Total Sales of Multiplatform Video Games by Platform")

# numeric_only=True sums numeric columns only. From pandas 2.0 on the default
# no longer drops string columns, so without it sum() concatenates their text.
# Note: this rebinds platform_sum_df, replacing the all-games aggregate that
# used the same name.
platform_sum_df = multiplatform_titles.groupby('Platform', as_index=False).sum(numeric_only=True)

# Draw the bars from the highest to the lowest total.
sns.barplot(data=platform_sum_df, x='Global_Sales', y='Platform', order=platform_sum_df.sort_values('Global_Sales', ascending=False).Platform)
plt.show()

In [None]:
plt.figure(figsize=(12,6))
plt.title("Average Global Sales of Multiplatform Video Games by Platform")

# numeric_only=True averages numeric columns only. From pandas 2.0 on, mean()
# raises a TypeError when string columns are left in the groups.
# Note: this rebinds platform_mean_df, replacing the all-games aggregate that
# used the same name.
platform_mean_df = multiplatform_titles.groupby('Platform', as_index=False).mean(numeric_only=True)

# Draw the bars from the highest to the lowest average.
sns.barplot(data=platform_mean_df, x='Global_Sales', y='Platform', order=platform_mean_df.sort_values('Global_Sales', ascending=False).Platform)
plt.show()

### **2.6 Year of Release**

In [None]:
# One bar per release year; the counts are on the y axis here.
fig, ax = plt.subplots(figsize=(18, 6))
sns.countplot(data=video_games_df, x="Year_of_Release", ax=ax)
ax.set_title('Number of Games Released Each Year')
ax.set_ylabel('Number of Games')
plt.show()

#### **Year of Release and Sales**

In [None]:
# numeric_only=True sums numeric columns only. From pandas 2.0 on the default
# no longer drops string columns, so without it sum() concatenates their text.
year_sum_df = video_games_df.groupby('Year_of_Release', as_index=False).sum(numeric_only=True)

plt.figure(figsize=(12,4))
# The data is grouped by release year; the previous title said "by Genre".
plt.title("Total Global Sales of Video Games by Year of Release")
ax = sns.lineplot(x="Year_of_Release", y='Global_Sales', data=year_sum_df)
plt.ylabel('Global Sales')
plt.show()

In [None]:
# numeric_only=True averages numeric columns only. From pandas 2.0 on, mean()
# raises a TypeError when string columns are left in the groups.
year_mean_df = video_games_df.groupby('Year_of_Release', as_index=False).mean(numeric_only=True)

plt.figure(figsize=(12,4))
# The data is grouped by release year; the previous title said "by Genre".
plt.title("Average Global Sales of Video Games by Year of Release")
ax = sns.lineplot(x="Year_of_Release", y='Global_Sales', data=year_mean_df)
plt.ylabel('Global Sales')
plt.show()