# **Video Games - Exploratory Data Analysis**

Dataset - [Video Games](https://www.kaggle.com/datasets/shivamvadalia27/video-games)

In [1]:
# imports

import pandas as pd
from numpy import nan

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from plotly.offline import iplot, init_notebook_mode
# set up plotly's notebook mode once, before any of the iplot() calls further down
init_notebook_mode(connected=True)

In [2]:
# read the games CSV from the Kaggle input directory
DATA_PATH = '/kaggle/input/video-games/games.csv'
df = pd.read_csv(DATA_PATH)

# preview the first five rows
df.head(5)

Unnamed: 0,id,name,released,added,playtime,reviews_count,ratings_count,rating
0,3498,Grand Theft Auto V,2013-09-17,19385,73,6421,6332,4.47
1,3328,The Witcher 3: Wild Hunt,2015-05-18,18567,46,6122,6029,4.66
2,4200,Portal 2,2011-04-18,17494,11,5310,5263,4.62
3,5286,Tomb Raider (2013),2013-03-05,15290,10,3686,3660,4.05
4,4291,Counter-Strike: Global Offensive,2012-08-21,15174,65,3267,3236,3.57


In [3]:
# summarise the frame: row count, column names, non-null counts and dtypes

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20580 entries, 0 to 20579
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             20580 non-null  int64  
 1   name           20580 non-null  object 
 2   released       19894 non-null  object 
 3   added          20580 non-null  int64  
 4   playtime       20580 non-null  int64  
 5   reviews_count  20580 non-null  int64  
 6   ratings_count  20580 non-null  int64  
 7   rating         20580 non-null  float64
dtypes: float64(1), int64(5), object(2)
memory usage: 1.3+ MB


In [4]:
# count the missing values in each column

df.isna().sum()

id                 0
name               0
released         686
added              0
playtime           0
reviews_count      0
ratings_count      0
rating             0
dtype: int64

In [5]:
# drop every row that contains a missing value; per the counts above only
# `released` has nulls (686 rows). inplace=True mutates `df` itself.

df.dropna(inplace=True)

In [6]:
# count rows that exactly repeat an earlier row (all columns compared)

df.duplicated().sum()

580

In [7]:
# drop the repeated rows counted above, keeping the first occurrence of each;
# inplace=True mutates `df` itself

df.drop_duplicates(inplace=True)

In [8]:
# parse the release-date strings once and store them back as datetimes
released_dates = pd.to_datetime(df['released'])
df['released'] = released_dates

# derive the release year from the parsed dates
df['year'] = released_dates.dt.year

In [9]:
# `released` is no longer needed once `year` exists; the remaining count
# columns get shorter names for the rest of the analysis

df = (
    df
    .drop(columns='released')
    .rename(columns={
        'added': 'favorites',
        'reviews_count': 'reviews',
        'ratings_count': 'ratings#',
    })
)

In [10]:
# per-column count of cells that are exactly zero

df.eq(0).sum()

id              0
name            0
favorites       0
playtime     2729
reviews      1020
ratings#     1035
rating       6325
year            0
dtype: int64

In [11]:
# Treat a zero in these columns as "no data recorded" rather than a real
# measurement. The replacement is limited to the columns that actually contain
# zeros (see the counts above) so that id / favorites / year can never be
# blanked out or silently converted to float.
zero_as_missing = ['playtime', 'reviews', 'ratings#', 'rating']

df = (
    df
    # {column: 0} replaces the value 0 only inside the named columns
    .replace({col: 0 for col in zero_as_missing}, nan)
    # remove the rows that now hold a missing value
    .dropna()
    # the NaN round-trip turned the integer columns into floats; restore them
    .astype({'playtime': int, 'reviews': int, 'ratings#': int})
)

In [12]:
# summary statistics of the numeric columns after cleaning
df.describe()

Unnamed: 0,id,favorites,playtime,reviews,ratings#,rating,year
count,10566.0,10566.0,10566.0,10566.0,10566.0,10566.0,10566.0
mean,91944.77068,874.626254,4.471134,124.881412,123.552527,3.192697,2013.592656
std,174195.677555,1484.556323,14.064481,331.716624,328.222443,0.757678,6.367903
min,2.0,45.0,1.0,6.0,4.0,1.0,1979.0
25%,11382.25,170.0,1.0,12.0,11.0,2.67,2011.0
50%,19379.5,361.0,3.0,27.0,26.0,3.29,2015.0
75%,49283.75,877.0,4.0,85.0,83.0,3.79,2018.0
max,949868.0,19385.0,900.0,6421.0,6332.0,4.8,2025.0


In [13]:
# one horizontal box plot per numeric feature, stacked in a single column

box_specs = [
    ('favorites', 'Favorites'),
    ('playtime', 'Playtime'),
    ('reviews', 'Reviews'),
    ('ratings#', '# of Ratings'),
    ('rating', 'Rating'),
]

fig = make_subplots(rows=len(box_specs), cols=1)

# (column, legend label) pairs are placed top to bottom, one subplot row each
for row, (column, label) in enumerate(box_specs, start=1):
    fig.add_trace(go.Box(x=df[column], name=label, boxmean=True), row=row, col=1)

fig.update_layout(title='Measures of Data(Central Tendency, Dispersion & Position)', height=600)

iplot(fig)

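**The count-like features (favorites, playtime, reviews, number of ratings) are strongly right-skewed: their means sit well above their medians. `rating` is the exception — its mean (3.19) is slightly below its median (3.29).**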

In [14]:
# number of games per release year, with the count printed on each bar
fig = px.histogram(df, x='year', text_auto=True)

fig.update_layout(
    title='# of video games released by year',
    xaxis=dict(categoryorder='category descending'),
)

iplot(fig)

In [15]:
# keep only the games released after 2005 (i.e. 2006 onwards)

df = df.loc[df['year'].gt(2005)]

In [16]:
# keep only the games whose number of ratings is above the 95th percentile
# NOTE(review): this comment used to say "90 percentile (more than 566)" while
# the code filtered at quantile(.95); the 566 figure is unverified — display
# ratings_cutoff to confirm the actual threshold.

ratings_cutoff = df['ratings#'].quantile(.95)
df = df[df['ratings#'] > ratings_cutoff]

In [17]:
# rating cut-offs at the 40th, 60th and 80th percentiles, computed once
rating = df['rating']
p40, p60, p80 = (rating.quantile(q) for q in (.40, .60, .80))

# high rated games: rating at or above the 80th percentile (4.31 to 4.69)
high = df[rating >= p80]

# avg rated games: rating from the 60th up to (not including) the 80th percentile (4.1 to 4.3)
avg = df[(rating >= p60) & (rating < p80)]

# low rated games: rating from the 40th up to (not including) the 60th percentile (3.9 to 4.09)
low = df[(rating >= p40) & (rating < p60)]

In [18]:
# For each rating tier, keep the games that appear in BOTH the top 30 by
# favorites and the top 30 by playtime. merge() with no keys is an inner join
# on all shared columns, so only rows present in both selections survive.
TOP_N = 30

dfs = [
    tier.nlargest(TOP_N, 'favorites').merge(tier.nlargest(TOP_N, 'playtime'))
    for tier in (high, avg, low)
]

In [19]:
# stack the per-tier selections (high, avg, low) and renumber the rows from 0
top_games = pd.concat(dfs).reset_index(drop=True)

In [20]:
# show the selected games with a playtime of 15 or more
top_games.loc[top_games['playtime'].ge(15)]

Unnamed: 0,id,name,favorites,playtime,reviews,ratings#,rating,year
0,3498,Grand Theft Auto V,19385,73,6421,6332,4.47,2013
1,3328,The Witcher 3: Wild Hunt,18567,46,6122,6029,4.66,2015
2,5679,The Elder Scrolls V: Skyrim,14654,46,4375,4332,4.42,2011
3,28,Red Dead Redemption 2,14059,19,4616,4524,4.59,2018
4,2551,Dark Souls III,9555,28,2449,2431,4.4,2016
5,3636,The Last Of Us Remastered,8476,16,2759,2732,4.69,2014
6,4806,Mass Effect 2,8469,21,2774,2755,4.48,2010
9,41494,Cyberpunk 2077,11303,25,2281,2199,4.11,2020
10,3192,Metal Gear Solid V: The Phantom Pain,10885,24,2376,2359,4.15,2015
11,3287,Batman: Arkham Knight,10566,20,2425,2407,4.24,2015


##  Findings

Top 20 video games out of 20K records, based on favorites, playtime and ratings

Most of the video games in the dataset were released between roughly 2014 and 2018.