# Vedio Game Dataset Analysis

In [1]:
# Importing pandas
import pandas as pd


**Read csv file**

In [56]:
df=pd.read_csv('VideoGamesSales.csv')
df

Unnamed: 0,Rank,Name,Platform,Year,Month,Genre,Publisher,Country,City,State,Region,NA_Sales,Global_Sales,NA_Profit,Global_Profit
0,1,Wii Sports,Wii,2010,Jan,Sports,Nintendo,United States,Fairfield,California,West,$41.49,82.74,12.447,24.822
1,2,Super Mario Bros.,NES,2010,Feb,Platform,Nintendo,United States,Edmonds,Washington,West,$29.08,40.24,8.724,12.072
2,3,Mario Kart Wii,Wii,2010,Mar,Racing,Nintendo,United States,Louisville,Kentucky,South,$15.85,35.82,4.755,10.746
3,4,Wii Sports Resort,Wii,2010,Apr,Sports,Nintendo,United States,Round Rock,Texas,Central,$15.75,33.00,4.725,9.900
4,5,Pokemon Red/Pokemon Blue,GB,2010,May,Role-Playing,Nintendo,United States,Nashville,Tennessee,South,$11.27,31.37,3.381,9.411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5904,5890,Crazy Taxi: Catch a Ride,GBA,2017,Sep,Racing,THQ,Australia,Sydney,New South Wales,West,0.21,0.30,0.063,0.090
5905,5891,MySims Party,DS,2017,Oct,Simulation,Electronic Arts,Australia,Sydney,New South Wales,West,0.15,0.30,0.045,0.090
5906,5892,Harry Potter and the Order of the Phoenix,X360,2017,Nov,Action,Electronic Arts,Australia,Sydney,New South Wales,West,0.24,0.30,0.072,0.090
5907,5893,Skylanders: SuperChargers,PS4,2017,Dec,Action,Activision,Australia,Sydney,New South Wales,West,0.17,0.30,0.051,0.090


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5909 entries, 0 to 5908
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Rank           5909 non-null   int64  
 1   Name           5909 non-null   object 
 2   Platform       5909 non-null   object 
 3   Year           5909 non-null   int64  
 4   Month          5909 non-null   object 
 5   Genre          5909 non-null   object 
 6   Publisher      5897 non-null   object 
 7   Country        5909 non-null   object 
 8   City           5909 non-null   object 
 9   State          5909 non-null   object 
 10  Region         5882 non-null   object 
 11  NA_Sales       5909 non-null   object 
 12  Global_Sales   5909 non-null   float64
 13  NA_Profit      5909 non-null   float64
 14  Global_Profit  5909 non-null   float64
dtypes: float64(3), int64(2), object(10)
memory usage: 692.6+ KB


# Cleaning And Transforming Data

In [5]:
# REMOVING DUPLICATES
df[df.duplicated()].count()

Rank             16
Name             16
Platform         16
Year             16
Month            16
Genre            16
Publisher        16
Country          16
City             16
State            16
Region           16
NA_Sales         16
Global_Sales     16
NA_Profit        16
Global_Profit    16
dtype: int64

In [6]:
df=df.drop_duplicates()
df[df.duplicated()].count()

Rank             0
Name             0
Platform         0
Year             0
Month            0
Genre            0
Publisher        0
Country          0
City             0
State            0
Region           0
NA_Sales         0
Global_Sales     0
NA_Profit        0
Global_Profit    0
dtype: int64

**Removing Null Values**

In [18]:
df.isnull().sum()

Rank              0
Name              0
Platform          0
Year              0
Month             0
Genre             0
Publisher        12
Country           0
City              0
State             0
Region           27
NA_Sales          0
Global_Sales      0
NA_Profit         0
Global_Profit     0
dtype: int64

In [19]:
# Filling null with va;ues 
df['Region'].fillna('North',inplace=True)
df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Region'].fillna('North',inplace=True)


Rank              0
Name              0
Platform          0
Year              0
Month             0
Genre             0
Publisher        12
Country           0
City              0
State             0
Region            0
NA_Sales          0
Global_Sales      0
NA_Profit         0
Global_Profit     0
dtype: int64

In [23]:
# dropping null values
df=df.dropna()
df.isnull().sum()

Rank             0
Name             0
Platform         0
Year             0
Month            0
Genre            0
Publisher        0
Country          0
City             0
State            0
Region           0
NA_Sales         0
Global_Sales     0
NA_Profit        0
Global_Profit    0
dtype: int64

**Spellings Checks**

In [24]:
df.loc[:,'Country']=df['Country'].replace({'United State':'United States'})
df['Country'].head(15)

0     United States
1     United States
2     United States
3     United States
4     United States
5               USA
6               USA
7               USA
8               USA
9               USA
10              USA
11              USA
12              USA
13              USA
14              USA
Name: Country, dtype: object

In [25]:
df.loc[:,'Country']=df['Country'].replace({'USA':'United States'})
df['Country'].head(15)

0     United States
1     United States
2     United States
3     United States
4     United States
5     United States
6     United States
7     United States
8     United States
9     United States
10    United States
11    United States
12    United States
13    United States
14    United States
Name: Country, dtype: object

**Formating Data Types**

In [26]:
df.dtypes

Rank               int64
Name              object
Platform          object
Year               int64
Month             object
Genre             object
Publisher         object
Country           object
City              object
State             object
Region            object
NA_Sales          object
Global_Sales     float64
NA_Profit        float64
Global_Profit    float64
dtype: object

In [29]:
df.loc[:,'NA_Sales']=pd.to_numeric(df['NA_Sales'],errors='coerce')

In [50]:
df.loc[:,'Month']=pd.to_datetime(df['Month'],format='%b',errors='coerce')

In [51]:
df.dtypes

Rank                      int64
Name                     object
Platform                 object
Year             datetime64[ns]
Month            datetime64[ns]
Genre                    object
Publisher                object
Country                  object
City                     object
State                    object
Region                   object
NA_Sales                float64
Global_Sales            float64
NA_Profit               float64
Global_Profit           float64
Date             datetime64[ns]
dtype: object