In [15]:
!python -V

Python 3.10.6


In [16]:
%pip install numpy pandas matplotlib





[notice] A new release of pip available: 22.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [17]:
# To start using pandas, first import the library to your chosen environment

import pandas as pd
import numpy as np

## Numpy

In [18]:
# The Numpy array is fixed size, unlike python lists

ListA = [0, 2, 3, 6, 1, 8, 7]
arr = np.array(ListA)
arr

array([0, 2, 3, 6, 1, 8, 7])

In [19]:
print("Dimension", arr.ndim)
print("Shape", arr.shape)
print("Element Type", arr.dtype)

Dimension 1
Shape (7,)
Element Type int64


In [20]:
# create a 2d numpy array from multiple lists

ListA = [0, 1, 2, 3]
ListB = [4, 5, 6, 7]
ListC = [8, 9, 10, 11]

array_2d = np.array([ListA, ListB, ListC])
array_2d

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [21]:
print("Dimension", array_2d.ndim)
print("Shape", array_2d.shape)
print("Element Type", array_2d.dtype)

Dimension 2
Shape (3, 4)
Element Type int64


## NumPy basic indexing and selecting

In [22]:
# NumPy indexes 2-d arrays as [row, col] -- the same order pandas uses with .iloc / .loc.
# A 1-d array is sliced with the same start:stop syntax as a Python list.

ListA = [0, 2, 3, 6, 1, 8, 7]
arr = np.array(ListA)

# grab elements before index 2 (slicing the NumPy array itself, not the original list)
arr[:2]


array([0, 2])

In [23]:
# grab elements from index 4 up to (but not including) index 8;
# the array only has 7 elements, so the slice simply stops at the end
arr[4:8]

array([1, 8, 7])

In [24]:
# Since NumPy arrays are numerical, there is no option for label selecting
# You do not need iloc for row/col selection

ListA = [0, 1, 2, 3]
ListB = [4, 5, 6, 7]
ListC = [8, 9, 10, 11]

array_2d = np.array([ListA, ListB, ListC])

# Grab all rows from columns index 0 and 2
array_2d[:, [0, 2]]

array([[ 0,  2],
       [ 4,  6],
       [ 8, 10]])

In [25]:
# Grab all rows from columns index 0, 2, and 3
# (a list of indices picks exactly those columns; 0:2:3 would be a start:stop:step
# slice and return only column 0)
array_2d[:, [0, 2, 3]]

array([[ 0,  2,  3],
       [ 4,  6,  7],
       [ 8, 10, 11]])

In [26]:
# You can assign a specific value to a specific array location

# assign row 0, column 2 to value 5
array_2d[0, 2] = 5
array_2d

array([[ 0,  1,  5,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [27]:
# assign all of row 0 to 5
array_2d[0, :] = 5
array_2d

array([[ 5,  5,  5,  5],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [28]:
# TODO: decide whether to cover 3-d NumPy arrays here

## Numpy functions

In [29]:
# you can create an array full of zeros with np.zeros

arr = np.zeros((2, 5))
print(arr)

[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]


In [30]:
# You can create an array full of ones with np.ones

arr = np.ones((2, 5))
print(arr)

[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]]


In [157]:
# You can fill a numpy array with a specific value using np.full

arr = np.full((2, 5), 100)
print(arr)

[[100 100 100 100 100]
 [100 100 100 100 100]]


In [160]:
# You can fill a numpy array with random integers using np.random.randint

# Create an array of size (2, 5) with random integers from 0 to 9 (the upper bound, 10, is excluded)
arr = np.random.randint(10, size=(2, 5))
print(arr)

[[1 9 1 4 2]
 [1 8 6 7 0]]


In [164]:
# Create an array of size (2, 5) with random numbers between 0 and 1
arr = np.random.rand(2, 5)
print(arr)

[[0.19153662 0.42027364 0.27335521 0.38815598 0.91180124]
 [0.93976811 0.00173762 0.81648257 0.87982431 0.14886937]]


In [165]:
# use np.copy() to make an independent copy of a numpy array

arr2 = np.copy(arr)
arr2

array([[0.19153662, 0.42027364, 0.27335521, 0.38815598, 0.91180124],
       [0.93976811, 0.00173762, 0.81648257, 0.87982431, 0.14886937]])

In [None]:
# Mini Challenge! :)
# Using what you've learned create the NumPy array shown on the board

# (NUMPY CHALLENGE IN WORKSHOP DAY 1 PPT)

# Pandas

In [None]:
# To start working with pandas, you typically import your data from a csv file into a DataFrame
# (replace "filepath" with the path to your own csv file)

df = pd.read_csv("filepath")

In [31]:
# Converting a dataframe into a NumPy array


pokemon_df = pd.read_csv('pokemon.csv')
data = pokemon_df.to_numpy()
print(f'{data} \n {type(data)}')

[[1 'Bulbasaur' 'Grass' ... 45 1 False]
 [2 'Ivysaur' 'Grass' ... 60 1 False]
 [3 'Venusaur' 'Grass' ... 80 1 False]
 ...
 [720 'HoopaHoopa Confined' 'Psychic' ... 70 6 True]
 [720 'HoopaHoopa Unbound' 'Psychic' ... 80 6 True]
 [721 'Volcanion' 'Fire' ... 70 6 True]] 
 <class 'numpy.ndarray'>


## Data Structures

##### The two main data structures used in pandas are Series and DataFrames

In [9]:
# A Series is a 1-dimensional labelled array that can hold any data type

# Values to store
Data = [1, 3, 4, 5, 6, 2, 9]

# Custom labels, one per value
Index = ['a', 'b', 'c', 'd', 'e', 'f', 'g']

# Without an index argument, pandas labels the values 0, 1, 2, ...
s = pd.Series(data=Data)

# Passing index= attaches the predefined labels instead
si = pd.Series(data=Data, index=Index)

In [None]:
print(s)

0    1
1    3
2    4
3    5
4    6
5    2
6    9
dtype: int64


In [None]:
print(si)

a    1
b    3
c    4
d    5
e    6
f    2
g    9
dtype: int64


In [10]:
# A DataFrame is a 2D representation of a set of data

data = {
    'Name': ['Sara', 'Andrew', 'Jack', 'Vanamala'],
    'Age': [21, 19, 23, 28]
}

# Create DataFrame
df = pd.DataFrame(data)

print(df)

       Name  Age
0      Sara   21
1    Andrew   19
2      Jack   23
3  Vanamala   28


In [None]:
## Mini Challenge ! :)

# Create a DataFrame with your own predefined columns, index, and data
# Use at least 3 different columns

## Indexing and Selecting Data

In [32]:
# Example cat owners dataset
# Each column is one owner; each row label names one attribute of that owner's cat.

cat_owners_index = ['Cat Name', 'Weight', 'Age', 'Lives']

cat_owners_data = {
    'John': ['Garfield', 30, 7, 5],
    'Jack': ['Nermal', 5, 1, 9],
    'Andrew': ['Socks', 9, 3, 8],
    'Vanamala': ['Destroyer', 8, 6, 3],
}

# The second argument of pd.DataFrame is the row index, passed here by keyword
cat_owners_df = pd.DataFrame(data=cat_owners_data, index=cat_owners_index)

cat_owners_df

Unnamed: 0,John,Jack,Andrew,Vanamala
Cat Name,Garfield,Nermal,Socks,Destroyer
Weight,30,5,9,8
Age,7,1,3,6
Lives,5,9,8,3


In [None]:
# the indexers for selecting data from DataFrames are .loc[] and .iloc[] (used with square brackets)
# .iloc[] selects elements by integer position

# grabs the row at integer position 1 (the second row)
cat_owners_df.iloc[1]

John        30
Jack         5
Andrew       9
Vanamala     8
Name: Weight, dtype: object

In [None]:
# grabs elements from multiple rows
cat_owners_df.iloc[[0, 2, 3]]

Unnamed: 0,John,Jack,Andrew,Vanamala
Cat Name,Garfield,Nermal,Socks,Destroyer
Age,7,1,3,6
Lives,5,9,8,3


In [None]:
# grab elements from the middle two rows, Weight and Age
cat_owners_df.iloc[1:3]

Unnamed: 0,John,Jack,Andrew,Vanamala
Weight,30,5,9,8
Age,7,1,3,6


In [None]:
# grab elements from the last two rows, and the first two columns
cat_owners_df.iloc[[2, 3], [0, 1]]

Unnamed: 0,John,Jack
Age,7,1
Lives,5,9


In [None]:
# .loc grabs elements using labels
# like NumPy arrays, pandas 2-d indexing goes [rows, cols], e.g. df.loc[rows, cols]

# grab data only from the row 'Cat Name'
cat_owners_df.loc['Cat Name']

John         Garfield
Jack           Nermal
Andrew          Socks
Vanamala    Destroyer
Name: Cat Name, dtype: object

In [None]:
# grab data from multiple rows
cat_owners_df.loc[['Cat Name', 'Lives']]

Unnamed: 0,John,Jack,Andrew,Vanamala
Cat Name,Garfield,Nermal,Socks,Destroyer
Lives,5,9,8,3


In [None]:
# grabs all rows from column Vanamala

cat_owners_df.loc[:, ['Vanamala']]

Unnamed: 0,Vanamala
Cat Name,Destroyer
Weight,8
Age,6
Lives,3


In [None]:
# grab rows Age and Lives from columns Jack and Vanamala

cat_owners_df.loc[['Age', 'Lives'], ['Jack', 'Vanamala']]

Unnamed: 0,Jack,Vanamala
Age,1,6
Lives,9,3


In [None]:
## Mini Challenge ! :)

# Write code using .iloc[] and .loc[] to grab the Cat Name and Age rows from owner John

## Viewing Data

In [None]:
# .head() displays the first x rows of your data

cat_owners_df.head()

Unnamed: 0,John,Jack,Andrew,Vanamala
Cat Name,Garfield,Nermal,Socks,Destroyer
Weight,30,5,9,8
Age,7,1,3,6
Lives,5,9,8,3


In [None]:
# using a number parameter specifies how many rows you want to display
cat_owners_df.head(2)

Unnamed: 0,John,Jack,Andrew,Vanamala
Cat Name,Garfield,Nermal,Socks,Destroyer
Weight,30,5,9,8


In [None]:
# .tail() displays the last x rows of your data

cat_owners_df.tail()

Unnamed: 0,John,Jack,Andrew,Vanamala
Cat Name,Garfield,Nermal,Socks,Destroyer
Weight,30,5,9,8
Age,7,1,3,6
Lives,5,9,8,3


In [None]:
cat_owners_df.tail(3)

Unnamed: 0,John,Jack,Andrew,Vanamala
Weight,30,5,9,8
Age,7,1,3,6
Lives,5,9,8,3


In [None]:
# use .describe() to see statistical descriptions of your data quickly
cat_owners_df.describe()

Unnamed: 0,John,Jack,Andrew,Vanamala
count,4,4,4,4
unique,4,4,4,4
top,Garfield,Nermal,Socks,Destroyer
freq,1,1,1,1


In [None]:
# use .columns to display your column labels
cat_owners_df.columns

Index(['John', 'Jack', 'Andrew', 'Vanamala'], dtype='object')

In [None]:
# use .index to display your index labels
cat_owners_df.index

Index(['Cat Name', 'Weight', 'Age', 'Lives'], dtype='object')

In [None]:
## Mini Challenge ! :)

# grab the last two rows of the df using .tail()

## Operations

In [None]:
# Example Ice cream dataset
# Each column ('Flavor', 'Age') holds one value per customer; the customer names
# are supplied separately as the row index.

icecream_order_data = {
    'Flavor': ['Chocolate', 'Mint Chip', 'Chocolate', 'Vanilla', 'Mint Chip', 'Chocolate', 'Superman'],
    'Age': [20, 13, 18, 10, 29, 23, 5]
}

icecream_order_index = ['Jonathan', 'Evan', 'Audrey', 'Olivia', 'Ethan', 'Jasmine', 'Zack']

icecream_order_df = pd.DataFrame(icecream_order_data, index=icecream_order_index)

icecream_order_df

Unnamed: 0,Flavor,Age
Jonathan,Chocolate,20
Evan,Mint Chip,13
Audrey,Chocolate,18
Olivia,Vanilla,10
Ethan,Mint Chip,29
Jasmine,Chocolate,23
Zack,Superman,5


In [None]:
# .mean() displays the mean value of your selected data
mean = icecream_order_df['Age'].mean()
print(mean)

16.857142857142858


In [None]:
# .sum() displays the sum of your selected data
# (stored as total_age so Python's built-in sum() function is not overwritten)
total_age = icecream_order_df['Age'].sum()
print(total_age)

118


In [None]:
# .value_counts counts the number of times a unique value appears
icecream_order_df['Flavor'].value_counts()

Flavor
Chocolate    3
Mint Chip    2
Vanilla      1
Superman     1
Name: count, dtype: int64

In [None]:
# .apply() will take a function and apply it to your data.
# This can be a NumPy function

icecream_order_df.loc[:, ['Age']].apply(np.sum)

Age    118
dtype: int64

In [None]:
## Mini Challenge ! :)

# use the .mean() function with your previously created DataFrame

## Manipulating Data

In [None]:
# For this section, we will be using the cat owners Dataset

cat_owners_df

Unnamed: 0,John,Jack,Andrew,Vanamala
Cat Name,Garfield,Nermal,Socks,Destroyer
Weight,30,5,9,8
Age,7,1,3,6
Lives,5,9,8,3


In [None]:
# Manually add a new column to the DataFrame

cat_info = ['Mittens', 7, 5, 9]
cat_owners_df['Dr.Fine'] = cat_info

cat_owners_df

Unnamed: 0,John,Jack,Andrew,Vanamala,Dr.Fine
Cat Name,Garfield,Nermal,Socks,Destroyer,Mittens
Weight,30,5,9,8,7
Age,7,1,3,6,5
Lives,5,9,8,3,9


In [None]:
# To insert a list as a new column in a dataframe you can use the .insert() function.
# .insert(loc, column, value)
# loc is the integer position of the new column; len(columns) places it at the end

# Weight, Age and Lives are given as numbers (not strings) so they match the other columns
cat_info = ['Salem', 5, 10, 1]

cat_owners_df.insert(len(cat_owners_df.columns), 'Sabrina', cat_info)

cat_owners_df

Unnamed: 0,John,Jack,Andrew,Vanamala,Dr.Fine,Sabrina
Cat Name,Garfield,Nermal,Socks,Destroyer,Mittens,Salem
Weight,30,5,9,8,7,5
Age,7,1,3,6,5,10
Lives,5,9,8,3,9,1


In [None]:
# Insert a new column at position 3 (positions start at 0, so it becomes the 4th column)

# Weight, Age and Lives are given as numbers (not strings) so they match the other columns
cat_info = ['Pickles', 2, 6, 9]

cat_owners_df.insert(3, 'Sara', cat_info)

cat_owners_df

Unnamed: 0,John,Jack,Andrew,Sara,Vanamala,Dr.Fine,Sabrina
Cat Name,Garfield,Nermal,Socks,Pickles,Destroyer,Mittens,Salem
Weight,30,5,9,2,8,7,5
Age,7,1,3,6,6,5,10
Lives,5,9,8,9,3,9,1


In [None]:
# To delete a row/column, use the .drop() function
# .drop(label, axis, inplace)
# axis specifies 1=columns, 0=rows
# inplace determines if the original dataframe is altered

cat_owners_df.drop('Sabrina', axis=1, inplace=True)

cat_owners_df

Unnamed: 0,John,Jack,Andrew,Sara,Vanamala,Dr.Fine
Cat Name,Garfield,Nermal,Socks,Pickles,Destroyer,Mittens
Weight,30,5,9,2,8,7
Age,7,1,3,6,6,5
Lives,5,9,8,9,3,9


In [None]:
# Create a new dataFrame where the row "Lives" is dropped

cat_owners_df_2 = cat_owners_df.drop('Lives', axis=0, inplace=False)

cat_owners_df_2

Unnamed: 0,John,Jack,Andrew,Sara,Vanamala,Dr.Fine
Cat Name,Garfield,Nermal,Socks,Pickles,Destroyer,Mittens
Weight,30,5,9,2,8,7
Age,7,1,3,6,6,5


In [None]:
## Mini Challenge ! :)

# Using your created DataFrame, use .insert() to add a new column, and .drop() to remove a row

## Merging Data

In [None]:
# Example employee data: two tables with the same columns but different people

# First table, with row labels 0-3
data1 = {
    'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
    'Age': [27, 24, 22, 32],
    'Address': ['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'],
    'Qualification': ['Msc', 'MA', 'MCA', 'Phd'],
}
df1 = pd.DataFrame(data=data1, index=[0, 1, 2, 3])

# Second table, with row labels 4-7 (continuing on from the first table)
data2 = {
    'Name': ['Abhi', 'Ayushi', 'Dhiraj', 'Hitesh'],
    'Age': [17, 14, 12, 52],
    'Address': ['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'],
    'Qualification': ['Btech', 'B.A', 'Bcom', 'B.hons'],
}
df2 = pd.DataFrame(data=data2, index=[4, 5, 6, 7])

df1

Unnamed: 0,Name,Age,Address,Qualification
0,Jai,27,Nagpur,Msc
1,Princi,24,Kanpur,MA
2,Gaurav,22,Allahabad,MCA
3,Anuj,32,Kannuaj,Phd


In [None]:
# combine DataFrames by stacking them with the pd.concat() function

frames = [df1, df2]

df3 = pd.concat(frames)

df3

Unnamed: 0,Name,Age,Address,Qualification
0,Jai,27,Nagpur,Msc
1,Princi,24,Kanpur,MA
2,Gaurav,22,Allahabad,MCA
3,Anuj,32,Kannuaj,Phd
4,Abhi,17,Nagpur,Btech
5,Ayushi,14,Kanpur,B.A
6,Dhiraj,12,Allahabad,Bcom
7,Hitesh,52,Kannuaj,B.hons


In [None]:
# concat two dataframes and group

df3 = pd.concat(frames, keys=['DF1', 'DF2'])

df3

Unnamed: 0,Unnamed: 1,Name,Age,Address,Qualification
DF1,0,Jai,27,Nagpur,Msc
DF1,1,Princi,24,Kanpur,MA
DF1,2,Gaurav,22,Allahabad,MCA
DF1,3,Anuj,32,Kannuaj,Phd
DF2,4,Abhi,17,Nagpur,Btech
DF2,5,Ayushi,14,Kanpur,B.A
DF2,6,Dhiraj,12,Allahabad,Bcom
DF2,7,Hitesh,52,Kannuaj,B.hons


In [None]:
# Example dataframes

data1 = {
    'key': ['K0', 'K1', 'K2', 'K3'],
    'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'], 
    'Age':[27, 24, 22, 32],} 
   
data2 = {
    'key': ['K0', 'K1', 'K2', 'K3'],
    'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'], 
    'Qualification':['Btech', 'B.A', 'Bcom', 'B.hons']
} 

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

In [None]:
# use the .merge function to merge two dataframes based on a key

df3 = pd.merge(df2, df2, on='key')

df3

Unnamed: 0,key,Address_x,Qualification_x,Address_y,Qualification_y
0,K0,Nagpur,Btech,Nagpur,Btech
1,K1,Kanpur,B.A,Kanpur,B.A
2,K2,Allahabad,Bcom,Allahabad,Bcom
3,K3,Kannuaj,B.hons,Kannuaj,B.hons


## Intermediate pandas with Pokemon Dataset

In [33]:
# importing csv data

pokemon_df = pd.read_csv('pokemon.csv')

pokemon_df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [None]:
# you can sort data using .sort_values() function

# Sort the DataFrame by Pokemon Name in alphabetical order
pokemon_df.sort_values(by='Name')

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
510,460,Abomasnow,Grass,Ice,494,90,92,75,92,85,60,4,False
511,460,AbomasnowMega Abomasnow,Grass,Ice,594,90,132,105,132,105,30,4,False
68,63,Abra,Psychic,,310,25,20,15,105,55,90,1,False
392,359,Absol,Dark,,465,65,130,60,75,60,75,3,False
393,359,AbsolMega Absol,Dark,,565,65,150,60,115,60,115,3,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
632,571,Zoroark,Dark,,510,60,105,60,120,60,105,5,False
631,570,Zorua,Dark,,330,40,65,40,80,40,65,5,False
46,41,Zubat,Poison,Flying,245,40,45,35,30,40,55,1,False
695,634,Zweilous,Dark,Dragon,420,72,85,70,65,70,58,5,False


In [None]:
# Sort the DataFrame by Pokemon Name in reverse alphabetical order
# using the ascending = False parameter
pokemon_df.sort_values(by='Name', ascending=False)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
794,718,Zygarde50% Forme,Dragon,Ground,600,108,100,121,81,95,95,6,True
695,634,Zweilous,Dark,Dragon,420,72,85,70,65,70,58,5,False
46,41,Zubat,Poison,Flying,245,40,45,35,30,40,55,1,False
631,570,Zorua,Dark,,330,40,65,40,80,40,65,5,False
632,571,Zoroark,Dark,,510,60,105,60,120,60,105,5,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,359,AbsolMega Absol,Dark,,565,65,150,60,115,60,115,3,False
392,359,Absol,Dark,,465,65,130,60,75,60,75,3,False
68,63,Abra,Psychic,,310,25,20,15,105,55,90,1,False
511,460,AbomasnowMega Abomasnow,Grass,Ice,594,90,132,105,132,105,30,4,False


In [None]:
# Sort the DataFrame numerically using Pokemon HP
pokemon_df.sort_values(by='HP', ascending=False)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
261,242,Blissey,Normal,,540,255,10,10,75,135,55,2,False
121,113,Chansey,Normal,,450,250,5,5,35,105,50,1,False
217,202,Wobbuffet,Psychic,,405,190,33,58,33,58,33,2,False
351,321,Wailord,Water,,500,170,90,45,90,45,60,3,False
655,594,Alomomola,Water,,470,165,75,80,40,45,65,5,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,129,Magikarp,Water,,200,20,10,55,15,20,80,1,False
381,349,Feebas,Water,,200,20,15,20,10,55,80,3,False
388,355,Duskull,Ghost,,295,20,40,90,30,90,25,3,False
55,50,Diglett,Ground,,265,10,55,25,35,45,95,1,False


In [None]:
# You can also sort using multiple values
pokemon_df.sort_values(by=['Type 1', 'HP'], ascending=False)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
351,321,Wailord,Water,,500,170,90,45,90,45,60,3,False
655,594,Alomomola,Water,,470,165,75,80,40,45,65,5,False
142,131,Lapras,Water,Ice,535,130,85,80,85,95,60,1,False
145,134,Vaporeon,Water,,525,130,65,60,110,95,65,1,False
350,320,Wailmer,Water,,400,130,70,35,70,35,60,3,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
314,290,Nincada,Bug,Ground,266,31,45,90,30,30,40,3,False
462,415,Combee,Bug,Flying,244,30,30,42,30,42,70,4,False
603,543,Venipede,Bug,Poison,260,30,45,59,30,39,57,5,False
230,213,Shuckle,Bug,Rock,505,20,10,230,10,230,5,2,False


In [None]:
# Create a column totaling the Attack and Defense stats
pokemon_df['Total_Atk_Dfn'] = pokemon_df['Attack'] + pokemon_df['Defense']
pokemon_df

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total_Atk_Dfn
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False,98
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False,125
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False,165
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False,223
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False,95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True,250
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True,270
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True,170
798,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True,220


In [None]:
# Create the same Attack + Defense total using iloc (columns at positions 6 and 7)
pokemon_df['Total_Atk_Dfn'] = pokemon_df.iloc[:, 6:8].sum(axis=1)
pokemon_df

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total_Atk_Dfn
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False,98
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False,125
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False,165
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False,223
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False,95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True,250
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True,270
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True,170
798,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True,220


In [None]:
# Select based on one condition using loc

# Select all pokemon with type 1 = Grass
pokemon_df.loc[pokemon_df['Type 1'] == 'Grass']

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total_Atk_Dfn
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False,98
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False,125
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False,165
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False,223
48,43,Oddish,Grass,Poison,320,45,50,55,75,65,30,1,False,105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
718,650,Chespin,Grass,,313,56,61,65,48,45,38,6,False,126
719,651,Quilladin,Grass,,405,61,78,95,56,58,57,6,False,173
720,652,Chesnaught,Grass,Fighting,530,88,107,122,74,75,64,6,False,229
740,672,Skiddo,Grass,,350,66,65,48,62,57,52,6,False,113


In [None]:
# Select based on Multiple conditions using loc

# Select all pokemon with type 1 = Grass and Type 2 = Poison
pokemon_df.loc[(pokemon_df['Type 1'] == 'Grass') & (pokemon_df['Type 2'] == 'Poison')]

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total_Atk_Dfn
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False,98
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False,125
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False,165
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False,223
48,43,Oddish,Grass,Poison,320,45,50,55,75,65,30,1,False,105
49,44,Gloom,Grass,Poison,395,60,65,70,85,75,40,1,False,135
50,45,Vileplume,Grass,Poison,490,75,80,85,110,90,50,1,False,165
75,69,Bellsprout,Grass,Poison,300,50,75,35,70,30,40,1,False,110
76,70,Weepinbell,Grass,Poison,390,65,90,50,85,45,55,1,False,140
77,71,Victreebel,Grass,Poison,490,80,105,65,100,70,70,1,False,170


In [None]:
# Select all pokemon with type 1 = Grass and Type 2 = Poison and HP > 60
pokemon_df.loc[(pokemon_df['Type 1'] == 'Grass') & (pokemon_df['Type 2'] == 'Poison') & (pokemon_df['HP'] > 60)]

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total_Atk_Dfn
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False,165
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False,223
50,45,Vileplume,Grass,Poison,490,75,80,85,110,90,50,1,False,165
76,70,Weepinbell,Grass,Poison,390,65,90,50,85,45,55,1,False,140
77,71,Victreebel,Grass,Poison,490,80,105,65,100,70,70,1,False,170
651,590,Foongus,Grass,Poison,294,69,55,45,55,55,15,5,False,100
652,591,Amoonguss,Grass,Poison,464,114,85,70,85,80,30,5,False,155


In [None]:
# the .str.contains() function can be used to select data with specific strings

# Select all Mega pokemon
# Mega forms are named like 'VenusaurMega Venusaur', so match 'Mega ' (with the trailing
# space) as a literal string; a bare 'Mega' would also match ordinary names that merely
# contain those letters (e.g. 'Meganium', if it is in the dataset)
pokemon_df.loc[pokemon_df['Name'].str.contains('Mega ', regex=False)].head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total_Atk_Dfn
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False,223
7,6,CharizardMega Charizard X,Fire,Dragon,634,78,130,111,130,85,100,1,False,241
8,6,CharizardMega Charizard Y,Fire,Flying,634,78,104,78,159,115,100,1,False,182
12,9,BlastoiseMega Blastoise,Water,,630,79,103,120,135,115,78,1,False,223
19,15,BeedrillMega Beedrill,Bug,Poison,495,65,150,40,15,80,145,1,False,190


In [None]:
# the ~ operator inverts a boolean mask, so it can be used to select the opposite rows

# select all non-Mega pokemon
# ('Mega ' with the trailing space, matched literally, so ordinary names that merely
# contain the letters "Mega" are not filtered out by mistake)
pokemon_df.loc[~pokemon_df['Name'].str.contains('Mega ', regex=False)].head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total_Atk_Dfn
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False,98
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False,125
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False,165
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False,95
5,5,Charmeleon,Fire,,405,58,64,58,80,65,80,1,False,122


## Visualizing Data

In [None]:
# TODO: decide whether this section is needed right now; it would cover simple matplotlib plots