# Section 5: Intro to Pandas



In [1]:
import pandas as pd
import numpy as np

## Pandas Series

Analysis of the Group 7 -> political group
- ordered sequence of elements, indexed
- looks a lot like list, but there are a ton of differences
    - have associated data type (float64)
    - underlying numpy array
- more similar to numpy array
    - can select elements as would in array
- can define index with strings
    - so actually looks more like a dictionary, but it is ordered


In [3]:
#this is a series in millions, storing population of the 7 countries
g7_pop = pd.Series([35.467, 63.951, 80.940, 60.665, 127.061, 64.511, 318.523])
g7_pop

0     35.467
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    318.523
dtype: float64

In [4]:
#can give it a name
g7_pop.name = 'G7 Population in millions'
g7_pop

0     35.467
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    318.523
Name: G7 Population in millions, dtype: float64

In [6]:
#float64
g7_pop.dtype

dtype('float64')

In [7]:
#gives you array of values, is a numpy array
g7_pop.values

array([ 35.467,  63.951,  80.94 ,  60.665, 127.061,  64.511, 318.523])

In [8]:
#selecting individual element
g7_pop[0]

35.467

In [9]:
#finding out index structure
g7_pop.index

RangeIndex(start=0, stop=7, step=1)

In [10]:
#restating index names
g7_pop.index = [
    'Canada',
    'France',
    'Germany',
    'Italy',
    'Japan',
    'United Kingdom',
    'United States',
]
g7_pop

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

In [11]:
#can do all of this at once
pd.Series({
    'Canada': 35.467,
    'France': 63.951,
    'Germany': 80.94,
    'Italy': 60.665,
    'Japan': 127.061,
    'United Kingdom': 64.511,
    'United States': 318.523
}, name='G7 Population in millions')

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

In [12]:
#create series out of other series
pd.Series(g7_pop, index=['France', 'Germany', 'Italy', 'Spain'])

France     63.951
Germany    80.940
Italy      60.665
Spain         NaN
Name: G7 Population in millions, dtype: float64

### Indexing and Slicing

In [13]:
#same syntax as python dictionary to acess specific value
g7_pop['Canada']

35.467

In [14]:
#multiple elements at once
g7_pop[['Italy', 'France']]

Italy     60.665
France    63.951
Name: G7 Population in millions, dtype: float64

In [15]:
#can still look according to number, like the first element
g7_pop.iloc[0]

35.467

In [16]:
#or last element, can also have multiple indexes with numbers
g7_pop.iloc[-1]

318.523

In [18]:
#pandas slicing includes upper limit
g7_pop['Canada': 'Italy']

Canada     35.467
France     63.951
Germany    80.940
Italy      60.665
Name: G7 Population in millions, dtype: float64

### Conditional selection (boolean series)

In [23]:
#returns a boolean serie
g7_pop > 70

Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: G7 Population in millions, dtype: bool

In [20]:
#restricting the series with boolean operators
g7_pop[g7_pop > 70]

Germany           80.940
Japan            127.061
United States    318.523
Name: G7 Population in millions, dtype: float64

In [22]:
#to get answer in terms of people and not millions of people
g7_pop * 1000000

Canada             35467000.0
France             63951000.0
Germany            80940000.0
Italy              60665000.0
Japan             127061000.0
United Kingdom     64511000.0
United States     318523000.0
Name: G7 Population in millions, dtype: float64

In [24]:
g7_pop.mean()

107.30257142857144

In [25]:
g7_pop[g7_pop > g7_pop.mean()]

Japan            127.061
United States    318.523
Name: G7 Population in millions, dtype: float64

In [26]:
g7_pop.std()

97.24996987121581

~ not, | or, & and.

In [27]:
g7_pop[(g7_pop > g7_pop.mean() - g7_pop.std() / 2) | (g7_pop > g7_pop.mean() + g7_pop.std() / 2)]

France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

### Operations and methods

In [29]:
#other than mean, std, etc, can also use regular numpy math functions
np.log(g7_pop)

Canada            3.568603
France            4.158117
Germany           4.393708
Italy             4.105367
Japan             4.844667
United Kingdom    4.166836
United States     5.763695
Name: G7 Population in millions, dtype: float64

### Modifying series

In [30]:
g7_pop['Canada'] = 40.5

In [31]:
g7_pop

Canada             40.500
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

In [32]:
#modify based on boolean selections
g7_pop[g7_pop < 70] = 99.99

In [33]:
g7_pop

Canada             99.990
France             99.990
Germany            80.940
Italy              99.990
Japan             127.061
United Kingdom     99.990
United States     318.523
Name: G7 Population in millions, dtype: float64

### Exercises

#### create empty pandas Series

In [35]:
pd.Series()

  """Entry point for launching an IPython kernel.


Series([], dtype: float64)

#### given X python list, convert it to an Y pandas series

In [37]:
X = ['a','b','c']
Y = pd.Series(X)
print(Y, type(Y))

0    a
1    b
2    c
dtype: object <class 'pandas.core.series.Series'>


#### Given Y pandas series, name it 'My letters'

In [38]:
Y.name = 'My letters'
Y

0    a
1    b
2    c
Name: My letters, dtype: object

#### Show its values

In [39]:
Y.values

array(['a', 'b', 'c'], dtype=object)

#### assign it index names

In [41]:
Y.index = ['first letter','second letter','third letter']

In [42]:
Y

first letter     a
second letter    b
third letter     c
Name: My letters, dtype: object

#### show first element

In [44]:
Y.iloc[0]

'a'

#### show last element

In [45]:
Y.iloc[-1]

'c'

#### show all middle elements

In [51]:
Y.iloc[1:-1]

second letter    b
Name: My letters, dtype: object

#### show elements in reverse position

In [53]:
Y.iloc[::-1]

third letter     c
second letter    b
first letter     a
Name: My letters, dtype: object

#### show first and last elements only

In [54]:
Y.iloc[[0,-1]]

first letter    a
third letter    c
Name: My letters, dtype: object

#### convert integer pandas series to float

In [55]:
X = pd.Series([1,2,3,4,5], index=['first','second','third','forth','fifth'])
pd.Series(X, dtype=np.float)

first     1.0
second    2.0
third     3.0
forth     4.0
fifth     5.0
dtype: float64

#### reverse this pandas series

In [56]:
X.iloc[::-1]

fifth     5
forth     4
third     3
second    2
first     1
dtype: int64

#### order (sort) the series

In [60]:
X = X.sort_values()
X

first     1
second    2
third     3
forth     4
fifth     5
dtype: int64

#### set fifth element = 10

In [62]:
X['fifth'] = 10
X

first      1
second     2
third      3
forth      4
fifth     10
dtype: int64

#### change all middle elements to 0

In [64]:
X[1:-1]=0
X

first      1
second     0
third      0
forth      0
fifth     10
dtype: int64

#### add 5 to every element

In [65]:
X+5

first      6
second     5
third      5
forth      5
fifth     15
dtype: int64

#### make a mask showing negative elements

In [66]:
X = pd.Series([-1,2,0,-4,5,6,0,0,-9,10])

In [67]:
negative = X<0
negative

0     True
1    False
2    False
3     True
4    False
5    False
6    False
7    False
8     True
9    False
dtype: bool

#### get the negative elements

In [68]:
X[negative]

0   -1
3   -4
8   -9
dtype: int64

#### get numbers higher than 5

In [69]:
X[X>5]

5     6
9    10
dtype: int64

#### get elements higher than the elements mean

In [70]:
X[X>X.mean()]

1     2
4     5
5     6
9    10
dtype: int64

#### get numbers equal to 10 or 2

In [73]:
X[(X==10)|(X==2)]

1     2
9    10
dtype: int64

#### return True if none of its elements is zero

In [75]:
X = pd.Series([-1,2,0,-4,5,6,0,0,-9,10])
X.all()

False

#### return True if any of its elements is zero

In [76]:
X.any()

True

#### show the sum of its elements

In [77]:
X = pd.Series([3,4,6,7,2,3,4,9,4])
X.sum()

42

#### show mean value of its elements

In [78]:
X.mean()

4.666666666666667

#### show the max

In [80]:
X.max()

9

## DataFrames

- similar to tables
- created usually out of excel or cvs files
    - like a combination of multiple series, one per column

In [81]:
import numpy as np
import pandas as pd

In [82]:
df = pd.DataFrame({
    'Population': [35.467, 63.951, 80.94 , 60.665, 127.061, 64.511, 318.523],
    'GDP': [
        1785387,
        2833687,
        3874437,
        2167744,
        4602367,
        2950039,
        17348075
    ],
    'Surface Area': [
        9984670,
        640679,
        357114,
        301336,
        377930,
        242495,
        9525067
    ],
    'HDI': [
        0.913,
        0.888,
        0.916,
        0.873,
        0.891,
        0.907,
        0.915
    ],
    'Continent': [
        'America',
        'Europe',
        'Europe',
        'Europe',
        'Asia',
        'Europe',
        'America'
    ]
}, columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'])

In [83]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [85]:
#can also rename indexes
df.index = [
    'Canada',
    'France',
    'Germany',
    'Italy',
    'Japan',
    'United Kingdom',
    'United States',
]
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


###  Methods

In [86]:
#find out which columns you have
df.columns

Index(['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'], dtype='object')

In [87]:
#find out indexes
df.index

Index(['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom',
       'United States'],
      dtype='object')

In [89]:
#quick info about structure of data frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to United States
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 336.0+ bytes


if have a null value, that means you have an empty cell

In [90]:
df.size

35

In [92]:
#(rows, columns)
df.shape

(7, 5)

In [95]:
#summary of statistics of data frame
#only for columns with numerical data types
df.describe()

Unnamed: 0,Population,GDP,Surface Area,HDI
count,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429
std,97.24997,5494020.0,4576187.0,0.016592
min,35.467,1785387.0,242495.0,0.873
25%,62.308,2500716.0,329225.0,0.8895
50%,64.511,2950039.0,377930.0,0.907
75%,104.0005,4238402.0,5082873.0,0.914
max,318.523,17348080.0,9984670.0,0.916


In [96]:
df.dtypes

Population      float64
GDP               int64
Surface Area      int64
HDI             float64
Continent        object
dtype: object

### Indexing, Selection and Slicing

In [101]:
#to select by index (rows)
df.loc['Canada']

Population       35.467
GDP             1785387
Surface Area    9984670
HDI               0.913
Continent       America
Name: Canada, dtype: object

In [102]:
#to select by index in sequential position(not using new names)
df.iloc[-1]

Population       318.523
GDP             17348075
Surface Area     9525067
HDI                0.915
Continent        America
Name: United States, dtype: object

In [103]:
#by column for all rows
df['Population']

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: Population, dtype: float64

In [105]:
#if want to format it nicely
df['Population'].to_frame()

Unnamed: 0,Population
Canada,35.467
France,63.951
Germany,80.94
Italy,60.665
Japan,127.061
United Kingdom,64.511
United States,318.523


In [106]:
#select multiple columns at once
df[['Population', 'GDP']]

Unnamed: 0,Population,GDP
Canada,35.467,1785387
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744
Japan,127.061,4602367
United Kingdom,64.511,2950039
United States,318.523,17348075


In [108]:
#slicing works on a row level, slices the indexes
df[1:3]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe


In [111]:
#actually better to use loc, includes upper limit
df.loc['France':'Italy']

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe


In [113]:
#pass second argument to get only a columns for these rows
df.loc['France':'Italy',['Population', 'GDP']]

Unnamed: 0,Population,GDP
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744


In [114]:
#for iloc, slicing doesnt include upper limit
df.iloc[1:3]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe


In [115]:
#second argument is number of column, starting at 0
df.iloc[1:3,3]

France     0.888
Germany    0.916
Name: HDI, dtype: float64

In [116]:
#multiple indexes
df.iloc[[0,1,3]]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Italy,60.665,2167744,301336,0.873,Europe


### Conditional selection (boolean arrays)

In [119]:
#this is like the mask
df['Population'] > 70

Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: Population, dtype: bool

In [120]:
#restricting using the mask
df.loc[df['Population']>70]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United States,318.523,17348075,9525067,0.915,America


In [122]:
#also decide which columns to get
df.loc[df['Population'] > 70, ['Population', 'GDP']]

Unnamed: 0,Population,GDP
Germany,80.94,3874437
Japan,127.061,4602367
United States,318.523,17348075


### Dropping Stuff
- opposite of selecting, choose which ones you don't want

In [125]:
#drop by index
df.drop(['Canada', 'Japan'])

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [126]:
#drop columns
df.drop(columns=['Population', 'HDI'])

Unnamed: 0,GDP,Surface Area,Continent
Canada,1785387,9984670,America
France,2833687,640679,Europe
Germany,3874437,357114,Europe
Italy,2167744,301336,Europe
Japan,4602367,377930,Asia
United Kingdom,2950039,242495,Europe
United States,17348075,9525067,America


### Operations

In [128]:
#still normal vectorized operations
df[['Population', 'GDP']] / 100

Unnamed: 0,Population,GDP
Canada,0.35467,17853.87
France,0.63951,28336.87
Germany,0.8094,38744.37
Italy,0.60665,21677.44
Japan,1.27061,46023.67
United Kingdom,0.64511,29500.39
United States,3.18523,173480.75


In [130]:
#suppose these are the changes that happen to country during crisis
crisis = pd.Series([-1000000, -0.3], index=['GDP','HDI'])
crisis

GDP   -1000000.0
HDI         -0.3
dtype: float64

In [131]:
#if a country is in crisis we get
df[['GDP','HDI']]+crisis

Unnamed: 0,GDP,HDI
Canada,785387.0,0.613
France,1833687.0,0.588
Germany,2874437.0,0.616
Italy,1167744.0,0.573
Japan,3602367.0,0.591
United Kingdom,1950039.0,0.607
United States,16348075.0,0.615


### Modifying DataFrames
- dropping doesn't change dataframe, it creates new one with the resulting changes

In [143]:
#create new pandas series
languages = pd.Series(
    ['French','German','Italian'],
    index = ['France','Germany','Italy'],
    name = 'Language'
)
languages

France      French
Germany     German
Italy      Italian
Name: Language, dtype: object

In [144]:
#create new column and assign it the series
df['Language'] = languages
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Languages,Language
Canada,35.467,1785387,9984670,0.913,America,,
France,63.951,2833687,640679,0.888,Europe,French,French
Germany,80.94,3874437,357114,0.916,Europe,German,German
Italy,60.665,2167744,301336,0.873,Europe,Italian,Italian
Japan,127.061,4602367,377930,0.891,Asia,,
United Kingdom,64.511,2950039,242495,0.907,Europe,,
United States,318.523,17348075,9525067,0.915,America,,


In [145]:
#replacing values per column
df['Language']='English'
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Languages,Language
Canada,35.467,1785387,9984670,0.913,America,,English
France,63.951,2833687,640679,0.888,Europe,French,English
Germany,80.94,3874437,357114,0.916,Europe,German,English
Italy,60.665,2167744,301336,0.873,Europe,Italian,English
Japan,127.061,4602367,377930,0.891,Asia,,English
United Kingdom,64.511,2950039,242495,0.907,Europe,,English
United States,318.523,17348075,9525067,0.915,America,,English


### Renaming columns and indexes

In [148]:
#if doesn't already exist, there is no problem
df.rename(
    columns={
        'HDI': 'Human Development Index',
        'Anual Popcorn Consumption': 'APC'
    }, index={
        'United States': 'USA',
        'United Kingdom': 'UK',
        'Argentina': 'AR'
    })

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,Languages,Language
Canada,35.467,1785387,9984670,0.913,America,,English
France,63.951,2833687,640679,0.888,Europe,French,English
Germany,80.94,3874437,357114,0.916,Europe,German,English
Italy,60.665,2167744,301336,0.873,Europe,Italian,English
Japan,127.061,4602367,377930,0.891,Asia,,English
UK,64.511,2950039,242495,0.907,Europe,,English
USA,318.523,17348075,9525067,0.915,America,,English


In [152]:
df.rename(index=str.upper)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Languages,Language
CANADA,35.467,1785387,9984670,0.913,America,,English
FRANCE,63.951,2833687,640679,0.888,Europe,French,English
GERMANY,80.94,3874437,357114,0.916,Europe,German,English
ITALY,60.665,2167744,301336,0.873,Europe,Italian,English
JAPAN,127.061,4602367,377930,0.891,Asia,,English
UNITED KINGDOM,64.511,2950039,242495,0.907,Europe,,English
UNITED STATES,318.523,17348075,9525067,0.915,America,,English


In [153]:
df.rename(index=lambda x: x.lower())

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Languages,Language
canada,35.467,1785387,9984670,0.913,America,,English
france,63.951,2833687,640679,0.888,Europe,French,English
germany,80.94,3874437,357114,0.916,Europe,German,English
italy,60.665,2167744,301336,0.873,Europe,Italian,English
japan,127.061,4602367,377930,0.891,Asia,,English
united kingdom,64.511,2950039,242495,0.907,Europe,,English
united states,318.523,17348075,9525067,0.915,America,,English


In [160]:
#deletes column for good
df.drop(columns='Language', inplace=True)

KeyError: "['Language'] not found in axis"

In [163]:
#add values (rows), returns new dataFrame
df.append(pd.Series({
    'Population': 3,
    'GDP':5
}, name = 'China'))

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387.0,9984670.0,0.913,America
France,63.951,2833687.0,640679.0,0.888,Europe
Germany,80.94,3874437.0,357114.0,0.916,Europe
Italy,60.665,2167744.0,301336.0,0.873,Europe
Japan,127.061,4602367.0,377930.0,0.891,Asia
United Kingdom,64.511,2950039.0,242495.0,0.907,Europe
United States,318.523,17348075.0,9525067.0,0.915,America
China,3.0,5.0,,,


In [166]:
#sets row's new values to the original dataFrame
df.loc['China'] = pd.Series({'Population': 1_400_000_000, 'Continent': 'Asia'})

In [165]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387.0,9984670.0,0.913,America
France,63.951,2833687.0,640679.0,0.888,Europe
Germany,80.94,3874437.0,357114.0,0.916,Europe
Italy,60.665,2167744.0,301336.0,0.873,Europe
Japan,127.061,4602367.0,377930.0,0.891,Asia
United Kingdom,64.511,2950039.0,242495.0,0.907,Europe
United States,318.523,17348075.0,9525067.0,0.915,America
China,1400000000.0,,,,Asia


In [167]:
#deleting China permanently
df.drop('China', inplace=True)
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387.0,9984670.0,0.913,America
France,63.951,2833687.0,640679.0,0.888,Europe
Germany,80.94,3874437.0,357114.0,0.916,Europe
Italy,60.665,2167744.0,301336.0,0.873,Europe
Japan,127.061,4602367.0,377930.0,0.891,Asia
United Kingdom,64.511,2950039.0,242495.0,0.907,Europe
United States,318.523,17348075.0,9525067.0,0.915,America


In [172]:
#adds new column for created index and returns new dataFrame with ordered number indexing
df.reset_index()

Unnamed: 0,index,Population,GDP,Surface Area,HDI,Continent
0,Canada,35.467,1785387.0,9984670.0,0.913,America
1,France,63.951,2833687.0,640679.0,0.888,Europe
2,Germany,80.94,3874437.0,357114.0,0.916,Europe
3,Italy,60.665,2167744.0,301336.0,0.873,Europe
4,Japan,127.061,4602367.0,377930.0,0.891,Asia
5,United Kingdom,64.511,2950039.0,242495.0,0.907,Europe
6,United States,318.523,17348075.0,9525067.0,0.915,America


In [171]:
#new indexing based on population data
df.set_index('Population')

Unnamed: 0_level_0,GDP,Surface Area,HDI,Continent
Population,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
35.467,1785387.0,9984670.0,0.913,America
63.951,2833687.0,640679.0,0.888,Europe
80.94,3874437.0,357114.0,0.916,Europe
60.665,2167744.0,301336.0,0.873,Europe
127.061,4602367.0,377930.0,0.891,Asia
64.511,2950039.0,242495.0,0.907,Europe
318.523,17348075.0,9525067.0,0.915,America


### Creating new column from given columns

In [173]:
#GDP Per Capita
df['GDP Per Capita'] = df['GDP']/df['Population']
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,GDP Per Capita
Canada,35.467,1785387.0,9984670.0,0.913,America,50339.385908
France,63.951,2833687.0,640679.0,0.888,Europe,44310.284437
Germany,80.94,3874437.0,357114.0,0.916,Europe,47868.013343
Italy,60.665,2167744.0,301336.0,0.873,Europe,35733.025633
Japan,127.061,4602367.0,377930.0,0.891,Asia,36221.712406
United Kingdom,64.511,2950039.0,242495.0,0.907,Europe,45729.239975
United States,318.523,17348075.0,9525067.0,0.915,America,54464.12033


### Statistical Info

In [175]:
#gives first few rows of data
df.head()

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,GDP Per Capita
Canada,35.467,1785387.0,9984670.0,0.913,America,50339.385908
France,63.951,2833687.0,640679.0,0.888,Europe,44310.284437
Germany,80.94,3874437.0,357114.0,0.916,Europe,47868.013343
Italy,60.665,2167744.0,301336.0,0.873,Europe,35733.025633
Japan,127.061,4602367.0,377930.0,0.891,Asia,36221.712406


In [176]:
df.describe()

Unnamed: 0,Population,GDP,Surface Area,HDI,GDP Per Capita
count,7.0,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429,44952.254576
std,97.24997,5494020.0,4576187.0,0.016592,6954.983875
min,35.467,1785387.0,242495.0,0.873,35733.025633
25%,62.308,2500716.0,329225.0,0.8895,40265.998421
50%,64.511,2950039.0,377930.0,0.907,45729.239975
75%,104.0005,4238402.0,5082873.0,0.914,49103.699626
max,318.523,17348080.0,9984670.0,0.916,54464.12033


In [177]:
population = df['Population']

In [178]:
population.min(), population.max()

(35.467, 318.523)

In [180]:
#because there is data from 7 countries
len(population)

7

In [181]:
population.sum()

751.118

In [182]:
population.mean()

107.30257142857144

In [183]:
population.median()

64.511

In [184]:
population.quantile(0.25)

62.308

In [185]:
population.quantile([.2,.4,.6,.8,1])

0.2     61.3222
0.4     64.1750
0.6     74.3684
0.8    117.8368
1.0    318.5230
Name: Population, dtype: float64

### Exercises

In [186]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

#### create empty DataFrame

In [190]:
df = pd.DataFrame()

#### create marvel dataframe

In [191]:
marvel_data = [
    ['Spider-Man', 'male', 1962],
    ['Captain America', 'male', 1941],
    ['Wolverine', 'male', 1974],
    ['Iron Man', 'male', 1963],
    ['Thor', 'male', 1963],
    ['Thing', 'male', 1961],
    ['Mister Fantastic', 'male', 1961],
    ['Hulk', 'male', 1962],
    ['Beast', 'male', 1963],
    ['Invisible Woman', 'female', 1961],
    ['Storm', 'female', 1975],
    ['Namor', 'male', 1939],
    ['Hawkeye', 'male', 1964],
    ['Daredevil', 'male', 1964],
    ['Doctor Strange', 'male', 1963],
    ['Hank Pym', 'male', 1962],
    ['Scarlet Witch', 'female', 1964],
    ['Wasp', 'female', 1963],
    ['Black Widow', 'female', 1964],
    ['Vision', 'male', 1968]
]

In [193]:
marvel_df = pd.DataFrame(data=marvel_data)
marvel_df

Unnamed: 0,0,1,2
0,Spider-Man,male,1962
1,Captain America,male,1941
2,Wolverine,male,1974
3,Iron Man,male,1963
4,Thor,male,1963
5,Thing,male,1961
6,Mister Fantastic,male,1961
7,Hulk,male,1962
8,Beast,male,1963
9,Invisible Woman,female,1961


#### add column names

In [196]:
marvel_df.columns =['name','sex','year']
marvel_df

Unnamed: 0,name,sex,year
0,Spider-Man,male,1962
1,Captain America,male,1941
2,Wolverine,male,1974
3,Iron Man,male,1963
4,Thor,male,1963
5,Thing,male,1961
6,Mister Fantastic,male,1961
7,Hulk,male,1962
8,Beast,male,1963
9,Invisible Woman,female,1961


#### add index names (character name)

In [197]:
marvel_df.index = marvel_df['name']
marvel_df

Unnamed: 0_level_0,name,sex,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Spider-Man,Spider-Man,male,1962
Captain America,Captain America,male,1941
Wolverine,Wolverine,male,1974
Iron Man,Iron Man,male,1963
Thor,Thor,male,1963
Thing,Thing,male,1961
Mister Fantastic,Mister Fantastic,male,1961
Hulk,Hulk,male,1962
Beast,Beast,male,1963
Invisible Woman,Invisible Woman,female,1961


#### drop name column, as it is now the index

In [199]:
marvel_df.drop(columns='name', inplace=True)
marvel_df

Unnamed: 0_level_0,sex,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Spider-Man,male,1962
Captain America,male,1941
Wolverine,male,1974
Iron Man,male,1963
Thor,male,1963
Thing,male,1961
Mister Fantastic,male,1961
Hulk,male,1962
Beast,male,1963
Invisible Woman,female,1961


In [204]:
#accidentally clicked on it twice, but correct
marvel_df.drop(['Namor','Hank Pym'], inplace=True)

KeyError: "['Namor' 'Hank Pym'] not found in axis"

In [205]:
marvel_df

Unnamed: 0_level_0,sex,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Spider-Man,male,1962
Captain America,male,1941
Wolverine,male,1974
Iron Man,male,1963
Thor,male,1963
Thing,male,1961
Mister Fantastic,male,1961
Hulk,male,1962
Beast,male,1963
Invisible Woman,female,1961


#### show first 5 elements

In [209]:
marvel_df.iloc[0:5]

Unnamed: 0_level_0,sex,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Spider-Man,male,1962
Captain America,male,1941
Wolverine,male,1974
Iron Man,male,1963
Thor,male,1963


#### show last 5 elements

In [214]:
marvel_df.iloc[-5:]

Unnamed: 0_level_0,sex,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Doctor Strange,male,1963
Scarlet Witch,female,1964
Wasp,female,1963
Black Widow,female,1964
Vision,male,1968


#### show just sex of first 5 elements

In [215]:
marvel_df.iloc[:5].sex.to_frame()

Unnamed: 0_level_0,sex
name,Unnamed: 1_level_1
Spider-Man,male
Captain America,male
Wolverine,male
Iron Man,male
Thor,male


#### show year of all middle elements

In [216]:
marvel_df.iloc[1:-1].year.to_frame()

Unnamed: 0_level_0,year
name,Unnamed: 1_level_1
Captain America,1941
Wolverine,1974
Iron Man,1963
Thor,1963
Thing,1961
Mister Fantastic,1961
Hulk,1962
Beast,1963
Invisible Woman,1961
Storm,1975


#### show first and last elements

In [218]:
marvel_df.iloc[[0,-1]]

Unnamed: 0_level_0,sex,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Spider-Man,male,1962
Vision,male,1968


#### modify year of 'Vision' to 1964

In [219]:
marvel_df.loc['Vision', 'year'] = 1964
marvel_df

Unnamed: 0_level_0,sex,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Spider-Man,male,1962
Captain America,male,1941
Wolverine,male,1974
Iron Man,male,1963
Thor,male,1963
Thing,male,1961
Mister Fantastic,male,1961
Hulk,male,1962
Beast,male,1963
Invisible Woman,female,1961


#### add new column called 'years_since' with years since first appearance

In [220]:
marvel_df['years_since']=2020-marvel_df['year']
marvel_df

Unnamed: 0_level_0,sex,year,years_since
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Spider-Man,male,1962,58
Captain America,male,1941,79
Wolverine,male,1974,46
Iron Man,male,1963,57
Thor,male,1963,57
Thing,male,1961,59
Mister Fantastic,male,1961,59
Hulk,male,1962,58
Beast,male,1963,57
Invisible Woman,female,1961,59


#### make mask showing female characters

In [222]:
female = marvel_df['sex'] == 'female'

#### get male characters

In [229]:
marvel_df[marvel_df['sex']=='male']

Unnamed: 0_level_0,sex,year,years_since
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Spider-Man,male,1962,58
Captain America,male,1941,79
Wolverine,male,1974,46
Iron Man,male,1963,57
Thor,male,1963,57
Thing,male,1961,59
Mister Fantastic,male,1961,59
Hulk,male,1962,58
Beast,male,1963,57
Hawkeye,male,1964,56


#### get characters with year after 1970

In [230]:
marvel_df.loc[marvel_df['year']>1970]

Unnamed: 0_level_0,sex,year,years_since
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Wolverine,male,1974,46
Storm,female,1975,45


#### get females with year after 1970

In [231]:
marvel_df.loc[(marvel_df['year']>1970)&(marvel_df['sex']=='female')]

Unnamed: 0_level_0,sex,year,years_since
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Storm,female,1975,45


#### show basic statistics

In [232]:
marvel_df.describe()

Unnamed: 0,year,years_since
count,18.0,18.0
mean,1962.888889,57.111111
std,6.720372,6.720372
min,1941.0,45.0
25%,1962.0,56.0
50%,1963.0,57.0
75%,1964.0,58.0
max,1975.0,79.0


#### show mean. min of year

In [233]:
marvel_df['year'].mean()

1962.888888888889

In [234]:
marvel_df['year'].min()

1941

#### get characters with this min value of year

In [235]:
marvel_df.loc[marvel_df['year']==marvel_df['year'].min()]

Unnamed: 0_level_0,sex,year,years_since
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Captain America,male,1941,79


#### reset index and plot year

In [None]:
marvel_df = marvel