In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the video game sales dataset ('utf' is an alias of Python's UTF-8 codec)
vg_df = pd.read_csv('vgsales.csv', encoding='utf')

In [3]:
# Size of the dataset as (rows, columns)
vg_df.shape

(16598, 11)

In [4]:
# Per-column non-null counts and dtypes; useful for spotting missing values
vg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16598 non-null  int64  
 1   Name          16598 non-null  object 
 2   Platform      16598 non-null  object 
 3   Year          16327 non-null  float64
 4   Genre         16598 non-null  object 
 5   Publisher     16540 non-null  object 
 6   NA_Sales      16598 non-null  float64
 7   EU_Sales      16598 non-null  float64
 8   JP_Sales      16598 non-null  float64
 9   Other_Sales   16598 non-null  float64
 10  Global_Sales  16598 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB


In [5]:
# Preview the descriptive columns for the rows at positions 6-19
vg_df.iloc[6:20][['Name', 'Platform', 'Year', 'Genre', 'Publisher']]

Unnamed: 0,Name,Platform,Year,Genre,Publisher
6,New Super Mario Bros.,DS,2006.0,Platform,Nintendo
7,Wii Play,Wii,2006.0,Misc,Nintendo
8,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo
9,Duck Hunt,NES,1984.0,Shooter,Nintendo
10,Nintendogs,DS,2005.0,Simulation,Nintendo
11,Mario Kart DS,DS,2005.0,Racing,Nintendo
12,Pokemon Gold/Pokemon Silver,GB,1999.0,Role-Playing,Nintendo
13,Wii Fit,Wii,2007.0,Sports,Nintendo
14,Wii Fit Plus,Wii,2009.0,Sports,Nintendo
15,Kinect Adventures!,X360,2010.0,Misc,Microsoft Game Studios


<center> Dataset for video game sales </center>

#### The Genre attribute above is a nominal categorical attribute. So let's get the list of unique video game genres.

In [6]:
# Sorted array of the distinct genre names
genres = np.unique(vg_df['Genre'])
genres

array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',
       'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',
       'Strategy'], dtype=object)

#### This shows that there are 12 distinct video game genres. We can now generate a label encoding scheme that maps each category to a numeric value by leveraging scikit-learn. With the help of `LabelEncoder`, a mapping scheme is generated in which each genre value is assigned a number.

In [7]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the genre column and get one integer label per row
gle = LabelEncoder()
genre_labels = gle.fit_transform(vg_df['Genre'])

# Lookup from integer label to genre name, taken from the fitted encoder's classes
genre_mappings = dict(enumerate(gle.classes_))
genre_mappings

{0: 'Action',
 1: 'Adventure',
 2: 'Fighting',
 3: 'Misc',
 4: 'Platform',
 5: 'Puzzle',
 6: 'Racing',
 7: 'Role-Playing',
 8: 'Shooter',
 9: 'Simulation',
 10: 'Sports',
 11: 'Strategy'}

In [8]:
# Store the encoded genre labels on the dataframe as a new column
vg_df['GenreLabel'] = genre_labels

# Show each genre next to its numeric label for the rows at positions 6-19
vg_df.iloc[6:20][['Name', 'Platform', 'Year', 'Genre', 'GenreLabel', 'Publisher']]

Unnamed: 0,Name,Platform,Year,Genre,GenreLabel,Publisher
6,New Super Mario Bros.,DS,2006.0,Platform,4,Nintendo
7,Wii Play,Wii,2006.0,Misc,3,Nintendo
8,New Super Mario Bros. Wii,Wii,2009.0,Platform,4,Nintendo
9,Duck Hunt,NES,1984.0,Shooter,8,Nintendo
10,Nintendogs,DS,2005.0,Simulation,9,Nintendo
11,Mario Kart DS,DS,2005.0,Racing,6,Nintendo
12,Pokemon Gold/Pokemon Silver,GB,1999.0,Role-Playing,7,Nintendo
13,Wii Fit,Wii,2007.0,Sports,10,Nintendo
14,Wii Fit Plus,Wii,2009.0,Sports,10,Nintendo
15,Kinect Adventures!,X360,2010.0,Misc,3,Microsoft Game Studios


<center> Video game genres with their encoded labels </center>

In [9]:
# Load the Pokemon dataset and display it
poke_df = pd.read_csv('Pokemon.csv', encoding='utf-8')
poke_df

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True
798,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True


In [12]:
# Shuffle every row with a fixed seed and renumber the index from 0.
# NOTE: poke_df is reassigned from itself, so re-running this cell shuffles the
# already-shuffled frame; re-run from the load cell to reproduce the row order.
poke_df = (
    poke_df
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

In [13]:
# Sorted array of the distinct generation numbers
generation = np.unique(poke_df['Generation'])
generation

array([1, 2, 3, 4, 5, 6])

In [15]:
# Readable label for each generation number
gen_ord_map = {
    1: 'Gen 1',
    2: 'Gen 2',
    3: 'Gen 3',
    4: 'Gen 4',
    5: 'Gen 5',
    6: 'Gen 6',
}

# Translate the numeric Generation column into its label via the mapping
poke_df['GenerationLabel'] = poke_df['Generation'].map(gen_ord_map)

poke_df.iloc[5:10][['Name', 'Generation', 'GenerationLabel']]

Unnamed: 0,Name,Generation,GenerationLabel
5,Helioptile,6,Gen 6
6,Dialga,4,Gen 4
7,DeoxysDefense Forme,3,Gen 3
8,Rapidash,1,Gen 1
9,Swanna,5,Gen 5


In [16]:
# Let's look at a subset of the Pokemon dataset: each Pokemon's name alongside
# the two attributes of interest, GenerationLabel and Legendary
poke_df.iloc[4:10][['Name', 'GenerationLabel', 'Legendary']]

Unnamed: 0,Name,GenerationLabel,Legendary
4,Octillery,Gen 2,False
5,Helioptile,Gen 6,False
6,Dialga,Gen 4,True
7,DeoxysDefense Forme,Gen 3,True
8,Rapidash,Gen 1,False
9,Swanna,Gen 5,False


In [18]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Encode the generation label strings ('Gen 1' ... 'Gen 6') as integer codes
gen_le = LabelEncoder()
gen_labels = gen_le.fit_transform(poke_df['GenerationLabel'])
poke_df['Gen_Label'] = gen_labels

# Keep a six-row sample showing each label next to its integer code
poke_df_sub = poke_df[
    ['Name', 'GenerationLabel', 'Gen_Label', 'Legendary']
].iloc[4:10]
poke_df_sub

Unnamed: 0,Name,GenerationLabel,Gen_Label,Legendary
4,Octillery,Gen 2,1,False
5,Helioptile,Gen 6,5,False
6,Dialga,Gen 4,3,True
7,DeoxysDefense Forme,Gen 3,2,True
8,Rapidash,Gen 1,0,False
9,Swanna,Gen 5,4,False
