## Tutorial from https://www.youtube.com/watch?v=2zGl9FGpmrA&ab_channel=NodCodingBootcamp

In [1]:
# Import the libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Get dataframe: load the video-game sales CSV from GitHub (requires network access)
df = pd.read_csv('https://raw.githubusercontent.com/cajjster/data_files/main/vgsales.csv')
# Preview the first five rows
df.head()

Unnamed: 0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global Sales
0,Wii Sports,Wii,2006,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,Pokemon Red/Pokemon Blue,GB,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


## DATAFRAME VS SERIES

In [3]:
# The whole table is a DataFrame; a single column pulled out of it is a Series.
dataframe, series = type(df), type(df["Name"])
print(dataframe, series)

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>


## INSPECTING DATA IN PANDAS 

In [4]:
# Summarise the frame: index range, per-column non-null counts, dtypes and memory usage.
# The `object` dtype can hold arbitrary Python objects; in this dataset those columns contain strings.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16291 entries, 0 to 16290
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          16291 non-null  object 
 1   Platform      16291 non-null  object 
 2   Year          16291 non-null  int64  
 3   Genre         16291 non-null  object 
 4   Publisher     16291 non-null  object 
 5   NA_Sales      16291 non-null  float64
 6   EU_Sales      16291 non-null  float64
 7   JP_Sales      16291 non-null  float64
 8   Other_Sales   16291 non-null  float64
 9   Global Sales  16291 non-null  float64
dtypes: float64(5), int64(1), object(4)
memory usage: 1.2+ MB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global Sales
count,16291.0,16291.0,16291.0,16291.0,16291.0,16291.0
mean,2006.405561,0.265647,0.147731,0.078833,0.048426,0.54091
std,5.832412,0.822432,0.509303,0.311879,0.190083,1.567345
min,1980.0,0.0,0.0,0.0,0.0,0.01
25%,2003.0,0.0,0.0,0.0,0.0,0.06
50%,2007.0,0.08,0.02,0.0,0.01,0.17
75%,2010.0,0.24,0.11,0.04,0.04,0.48
max,2020.0,41.49,29.02,10.22,10.57,82.74


In [18]:
# include="O" restricts the summary to the object-dtype columns and reports
# count, unique, top (most frequent value) and freq (its frequency) for each
df.describe(include="O")

Unnamed: 0,Name,Platform,Genre,Publisher
count,16291,16291,16291,16291
unique,11325,31,12,576
top,Need for Speed: Most Wanted,DS,Action,Electronic Arts
freq,12,2131,3251,1339


## SELECTING COLUMNS IN PANDAS 

In [7]:
# Attribute holding the labels of all the columns in the dataframe
df.columns

Index(['Name', 'Platform', 'Year', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global Sales'],
      dtype='object')

In [8]:
# Selecting a single column: one pair of brackets with the column name returns a Series
df["Genre"]

0              Sports
1            Platform
2              Racing
3              Sports
4        Role-Playing
             ...     
16286        Platform
16287         Shooter
16288          Racing
16289          Puzzle
16290        Platform
Name: Genre, Length: 16291, dtype: object

In [9]:
# Selecting multiple columns: pass a list of column names inside the
# indexing brackets, i.e. df[[ ..., ..., ... ]]; the result is a DataFrame
df[["Name", "Platform", "Year"]]

Unnamed: 0,Name,Platform,Year
0,Wii Sports,Wii,2006
1,Super Mario Bros.,NES,1985
2,Mario Kart Wii,Wii,2008
3,Wii Sports Resort,Wii,2009
4,Pokemon Red/Pokemon Blue,GB,1996
...,...,...,...
16286,Woody Woodpecker in Crazy Castle 5,GBA,2002
16287,Men in Black II: Alien Escape,GC,2003
16288,SCORE International Baja 1000: The Official Game,PS2,2008
16289,Know How 2,DS,2010


In [10]:
# Get numerical data in dataframe.
# Uses the public select_dtypes API instead of the private
# DataFrame._get_numeric_data() helper (leading underscore: not part of the
# documented interface). For this frame, whose dtypes are only int64,
# float64 and object, the selected columns are the same.
df.select_dtypes(include="number")

Unnamed: 0,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global Sales
0,2006,41.49,29.02,3.77,8.46,82.74
1,1985,29.08,3.58,6.81,0.77,40.24
2,2008,15.85,12.88,3.79,3.31,35.82
3,2009,15.75,11.01,3.28,2.96,33.00
4,1996,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...
16286,2002,0.01,0.00,0.00,0.00,0.01
16287,2003,0.01,0.00,0.00,0.00,0.01
16288,2008,0.00,0.00,0.00,0.00,0.01
16289,2010,0.00,0.01,0.00,0.00,0.01


In [19]:
# Keep only the object-dtype columns.
# Other dtype selectors can be passed the same way, e.g. "float", "int" or "number".
df.select_dtypes(include="object")

Unnamed: 0,Name,Platform,Genre,Publisher
0,Wii Sports,Wii,Sports,Nintendo
1,Super Mario Bros.,NES,Platform,Nintendo
2,Mario Kart Wii,Wii,Racing,Nintendo
3,Wii Sports Resort,Wii,Sports,Nintendo
4,Pokemon Red/Pokemon Blue,GB,Role-Playing,Nintendo
...,...,...,...,...
16286,Woody Woodpecker in Crazy Castle 5,GBA,Platform,Kemco
16287,Men in Black II: Alien Escape,GC,Shooter,Infogrames
16288,SCORE International Baja 1000: The Official Game,PS2,Racing,Activision
16289,Know How 2,DS,Puzzle,7G//AMES


In [20]:
# A list of dtypes selects every column matching any of them (object or integer here)
df.select_dtypes(include=["object", "int"])

Unnamed: 0,Name,Platform,Year,Genre,Publisher
0,Wii Sports,Wii,2006,Sports,Nintendo
1,Super Mario Bros.,NES,1985,Platform,Nintendo
2,Mario Kart Wii,Wii,2008,Racing,Nintendo
3,Wii Sports Resort,Wii,2009,Sports,Nintendo
4,Pokemon Red/Pokemon Blue,GB,1996,Role-Playing,Nintendo
...,...,...,...,...,...
16286,Woody Woodpecker in Crazy Castle 5,GBA,2002,Platform,Kemco
16287,Men in Black II: Alien Escape,GC,2003,Shooter,Infogrames
16288,SCORE International Baja 1000: The Official Game,PS2,2008,Racing,Activision
16289,Know How 2,DS,2010,Puzzle,7G//AMES


In [22]:
# Conditional column selection with a list comprehension:
# keep only the columns that have fewer than 50 distinct values
df[[
    name
    for name in df.columns
    if df[name].nunique() < 50
]]

Unnamed: 0,Platform,Year,Genre
0,Wii,2006,Sports
1,NES,1985,Platform
2,Wii,2008,Racing
3,Wii,2009,Sports
4,GB,1996,Role-Playing
...,...,...,...
16286,GBA,2002,Platform
16287,GC,2003,Shooter
16288,PS2,2008,Racing
16289,DS,2010,Puzzle


## METHODS VS ATTRIBUTES
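### An attribute (e.g. `df.shape`) is a stored property of the dataframe, accessed without parentheses
### A method (e.g. `df.drop(...)`) is a function called with parentheses; passing different arguments changes the result it returns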

In [23]:
# shape is an attribute (no parentheses) holding a (rows, columns) tuple
# there are 16291 rows and 10 columns in the dataframe
df.shape

(16291, 10)

In [24]:
# drop() is a method: it returns a new DataFrame without the "Name" column.
# The result is not assigned back, so df itself still has all 10 columns.
df.drop(columns="Name")

Unnamed: 0,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global Sales
0,Wii,2006,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,Wii,2008,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,Wii,2009,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,GB,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...
16286,GBA,2002,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16287,GC,2003,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16288,PS2,2008,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16289,DS,2010,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


In [25]:
# Count how many rows fall in each year, most frequent year first
df["Year"].value_counts()

Year
2009    1431
2008    1428
2010    1257
2007    1201
2011    1136
2006    1008
2005     936
2002     829
2003     775
2004     744
2012     655
2015     614
2014     580
2013     546
2001     482
1998     379
2000     349
2016     342
1999     338
1997     289
1996     263
1995     219
1994     121
1993      60
1981      46
1992      43
1991      41
1982      36
1986      21
1989      17
1983      17
1990      16
1987      16
1988      15
1985      14
1984      14
1980       9
2017       3
2020       1
Name: count, dtype: int64

# Intuition or Common Sense

In [26]:
# Pull the Year column out into its own variable; a single column is a Series
year = df["Year"]
type(year)

pandas.core.series.Series

In [28]:
# "Name" is still listed: the earlier drop() call returned a new frame and did not modify df
df.columns

Index(['Name', 'Platform', 'Year', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global Sales'],
      dtype='object')

In [29]:
# DataFrame: value_counts() on the whole frame counts each distinct
# combination of values across all columns (i.e. each distinct row)
df.value_counts()

Name                                          Platform  Year  Genre      Publisher               NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global Sales
'98 Koshien                                   PS        1998  Sports     Magical Company         0.15      0.10      0.12      0.03         0.41            1
Rat Attack!                                   N64       2000  Puzzle     Mindscape               0.02      0.00      0.00      0.00         0.02            1
Rapala Pro Bass Fishing 2010                  X360      2010  Sports     Activision              0.32      0.05      0.00      0.03         0.40            1
Rapala Pro Fishing                            GBA       2004  Sports     Zoo Digital Publishing  0.04      0.02      0.00      0.00         0.06            1
                                              PS2       2004  Sports     Zoo Digital Publishing  0.18      0.14      0.00      0.05         0.36            1
                                                         

In [30]:
# Series: value_counts() on a single column counts each distinct value in it
df["Genre"].value_counts()

Genre
Action          3251
Sports          2304
Misc            1686
Role-Playing    1470
Shooter         1282
Adventure       1274
Racing          1225
Platform         875
Simulation       848
Fighting         836
Strategy         670
Puzzle           570
Name: count, dtype: int64

In [31]:
# Series sorting: a Series holds a single set of values, so no column has to
# be named; the result here is in ascending order
year.sort_values()

257      1980
6211     1980
1948     1980
5282     1980
1746     1980
         ... 
5205     2016
14136    2017
16135    2017
15944    2017
5860     2020
Name: Year, Length: 16291, dtype: int64

In [33]:
# Dataframe sorting
# Calling df.sort_values() with no arguments raises a TypeError because the
# required `by` argument is missing: a DataFrame must be told which
# column (Series) to sort by
df.sort_values(by="Genre")

Unnamed: 0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global Sales
8145,Ringling Bros. and Barnum & Bailey: Circus Fri...,DS,2009,Action,Take-Two Interactive,0.16,0.00,0.00,0.01,0.17
4030,Madagascar: Operation Penguin,GBA,2005,Action,Activision,0.35,0.13,0.00,0.01,0.48
11414,Dangerous Ji-San to 1000-nin no Otomodachi Yok...,3DS,2012,Action,Namco Bandai Games,0.00,0.00,0.08,0.00,0.08
11412,Terraria,3DS,2016,Action,505 Games,0.00,0.03,0.04,0.00,0.08
11410,Auto Destruct,PS,1998,Action,Electronic Arts,0.04,0.03,0.00,0.01,0.08
...,...,...,...,...,...,...,...,...,...,...
15584,Elven Legacy,PC,2009,Strategy,Paradox Interactive,0.01,0.00,0.00,0.00,0.02
11780,Hearts of Iron III,PC,2009,Strategy,Paradox Interactive,0.01,0.04,0.00,0.01,0.07
5191,Battalion Wars 2,Wii,2007,Strategy,Nintendo,0.22,0.03,0.08,0.02,0.35
11830,Real Time Conflict: Shogun Empires,DS,2005,Strategy,Namco Bandai Games,0.06,0.00,0.00,0.00,0.07


# iloc & loc

In [36]:
# iloc selects by integer position: a single integer returns that row
# (position 4, i.e. the fifth row) as a Series
df.iloc[4]

Name            Pokemon Red/Pokemon Blue
Platform                              GB
Year                                1996
Genre                       Role-Playing
Publisher                       Nintendo
NA_Sales                           11.27
EU_Sales                            8.89
JP_Sales                           10.22
Other_Sales                          1.0
Global Sales                       31.37
Name: 4, dtype: object

In [35]:
# Using slicing to select rows of the dataframe by position
# [start:stop:step] - stop is exclusive, so :5 gives positions 0 through 4
df.iloc[:5]

Unnamed: 0,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global Sales
0,Wii Sports,Wii,2006,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,Pokemon Red/Pokemon Blue,GB,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [37]:
# With two slices, iloc takes [rows, columns], each written as start:stop:step
# ":" keeps every row; 2:5 keeps the columns at positions 2, 3 and 4
df.iloc[:, 2:5]

Unnamed: 0,Year,Genre,Publisher
0,2006,Sports,Nintendo
1,1985,Platform,Nintendo
2,2008,Racing,Nintendo
3,2009,Sports,Nintendo
4,1996,Role-Playing,Nintendo
...,...,...,...
16286,2002,Platform,Kemco
16287,2003,Shooter,Infogrames
16288,2008,Racing,Activision
16289,2010,Puzzle,7G//AMES
