# Creating, manipulating, and accessing data in pandas DataFrames and Series

In this exercise, we will explore the different ways to create, manipulate, and access data in pandas DataFrames and Series.

World population data adapted from the World Bank:<br />
https://data.worldbank.org/indicator/SP.POP.TOTL

## DataFrames and Series

In [2]:
import numpy as np
import pandas as pd

### From a dictionary

In [9]:
# Population figures keyed by column label: "Country" holds the row labels and
# each decade key holds one value per country, in the same order as "Country".
# The underscores are only visual digit grouping; the float values are unchanged.
pop_dict = {
    "Country": [
        "Sub-Saharan Africa",
        "Finland",
        "Gabon",
        "Burkina Faso",
        "Pakistan",
    ],
    "1960": [227_948_869.0, 4_429_634.0, 513_340.0, 4_783_259.0, 45_954_226.0],
    "1970": [293_900_563.0, 4_606_307.0, 597_192.0, 5_611_666.0, 59_290_872.0],
    "1980": [388_793_467.0, 4_779_535.0, 749_078.0, 6_932_967.0, 80_624_057.0],
    "1990": [516_629_688.0, 4_986_431.0, 983_028.0, 9_131_361.0, 115_414_069.0],
    "2000": [671_212_486.0, 5_176_209.0, 1_272_935.0, 11_882_888.0, 154_369_924.0],
    "2010": [879_797_419.0, 5_363_352.0, 1_711_105.0, 16_116_845.0, 194_454_498.0],
    "2020": [1_151_302_081.0, 5_529_543.0, 2_292_573.0, 21_522_626.0, 227_196_741.0],
}

In [10]:
# Create df from dictionary: each key becomes a column label and each list
# becomes that column's values; rows get a default integer index (0, 1, 2, ...)
df = pd.DataFrame(pop_dict)
df.head()

Unnamed: 0,Country,1960,1970,1980,1990,2000,2010,2020
0,Sub-Saharan Africa,227948869.0,293900563.0,388793467.0,516629688.0,671212486.0,879797419.0,1151302000.0
1,Finland,4429634.0,4606307.0,4779535.0,4986431.0,5176209.0,5363352.0,5529543.0
2,Gabon,513340.0,597192.0,749078.0,983028.0,1272935.0,1711105.0,2292573.0
3,Burkina Faso,4783259.0,5611666.0,6932967.0,9131361.0,11882888.0,16116845.0,21522630.0
4,Pakistan,45954226.0,59290872.0,80624057.0,115414069.0,154369924.0,194454498.0,227196700.0


In [5]:
# Use the name of a column in brackets to return that single column as a Series
df["Country"]

0    Sub-Saharan Africa
1               Finland
2                 Gabon
3          Burkina Faso
4              Pakistan
Name: Country, dtype: object

In [5]:
# Can also use dot notation (attribute access) to get the same Series,
# as long as the column name is a valid Python identifier
df.Country

0    Sub-Saharan Africa
1               Finland
2                 Gabon
3          Burkina Faso
4              Pakistan
Name: Country, dtype: object

In [6]:
# Can't use dot notation with numbers (or spaces): `df.1960` is not valid
# Python, so a cell containing it is rejected by the parser before any line runs
# and halts "Run All". Compile the expression from a string instead, so the
# SyntaxError can be caught and shown while the notebook keeps running.
# Use bracket notation (df["1960"]) for such column labels.
try:
    compile("df.1960", "<dot-notation demo>", "eval")
except SyntaxError as err:
    print(f"SyntaxError: {err.msg}")

SyntaxError: invalid syntax (95630150.py, line 2)

In [6]:
# Show the row labels (index) and the column labels, one per line
print(df.index, df.columns, sep="\n")

RangeIndex(start=0, stop=5, step=1)
Index(['Country', '1960', '1970', '1980', '1990', '2000', '2010', '2020'], dtype='object')


In [7]:
# Series also have indices: a column pulled from the df carries the df's row index
df["Country"].index

RangeIndex(start=0, stop=5, step=1)

In [8]:
# The column labels (an Index object) can be returned as a plain Python list
df.columns.to_list()

['Country', '1960', '1970', '1980', '1990', '2000', '2010', '2020']

In [10]:
# Or convert a single column (Series) to a numpy array of its values
df["1960"].to_numpy()

array([2.27948869e+08, 4.42963400e+06, 5.13340000e+05, 4.78325900e+06,
       4.59542260e+07])

In [11]:
# Setting a new index: replace the default 0..4 row labels with the letters a-e.
# The result is only displayed here, not stored.
df.set_index(pd.Index(list("abcde")))

Unnamed: 0,Country,1960,1970,1980,1990,2000,2010,2020
a,Sub-Saharan Africa,227948869.0,293900563.0,388793467.0,516629688.0,671212486.0,879797419.0,1151302000.0
b,Finland,4429634.0,4606307.0,4779535.0,4986431.0,5176209.0,5363352.0,5529543.0
c,Gabon,513340.0,597192.0,749078.0,983028.0,1272935.0,1711105.0,2292573.0
d,Burkina Faso,4783259.0,5611666.0,6932967.0,9131361.0,11882888.0,16116845.0,21522630.0
e,Pakistan,45954226.0,59290872.0,80624057.0,115414069.0,154369924.0,194454498.0,227196700.0


In [12]:
# Setting the index is not in place, so the df still looks the same:
# set_index returned a new DataFrame that the previous cell displayed but never stored
df.head()

Unnamed: 0,Country,1960,1970,1980,1990,2000,2010,2020
0,Sub-Saharan Africa,227948869.0,293900563.0,388793467.0,516629688.0,671212486.0,879797419.0,1151302000.0
1,Finland,4429634.0,4606307.0,4779535.0,4986431.0,5176209.0,5363352.0,5529543.0
2,Gabon,513340.0,597192.0,749078.0,983028.0,1272935.0,1711105.0,2292573.0
3,Burkina Faso,4783259.0,5611666.0,6932967.0,9131361.0,11882888.0,16116845.0,21522630.0
4,Pakistan,45954226.0,59290872.0,80624057.0,115414069.0,154369924.0,194454498.0,227196700.0


In [13]:
# If we store the return as a variable, it persists.
# Passing a column name moves that column out of the columns and into the index.
# NOTE: this cell is not idempotent - after one run "Country" is the index rather
# than a column, so reset the index (next cell) before running this one again.
df = df.set_index("Country")
df.head()

Unnamed: 0_level_0,1960,1970,1980,1990,2000,2010,2020
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Sub-Saharan Africa,227948869.0,293900563.0,388793467.0,516629688.0,671212486.0,879797419.0,1151302000.0
Finland,4429634.0,4606307.0,4779535.0,4986431.0,5176209.0,5363352.0,5529543.0
Gabon,513340.0,597192.0,749078.0,983028.0,1272935.0,1711105.0,2292573.0
Burkina Faso,4783259.0,5611666.0,6932967.0,9131361.0,11882888.0,16116845.0,21522630.0
Pakistan,45954226.0,59290872.0,80624057.0,115414069.0,154369924.0,194454498.0,227196700.0


In [14]:
# Reset the index back to the default 0..n-1 integers.
# Note the old index ("Country") is now a regular column again.
df = df.reset_index()
df.head()

Unnamed: 0,Country,1960,1970,1980,1990,2000,2010,2020
0,Sub-Saharan Africa,227948869.0,293900563.0,388793467.0,516629688.0,671212486.0,879797419.0,1151302000.0
1,Finland,4429634.0,4606307.0,4779535.0,4986431.0,5176209.0,5363352.0,5529543.0
2,Gabon,513340.0,597192.0,749078.0,983028.0,1272935.0,1711105.0,2292573.0
3,Burkina Faso,4783259.0,5611666.0,6932967.0,9131361.0,11882888.0,16116845.0,21522630.0
4,Pakistan,45954226.0,59290872.0,80624057.0,115414069.0,154369924.0,194454498.0,227196700.0


## Other ways to create a DataFrame

In [16]:
# Same data as a list of rows: each inner list is one country's label followed
# by its seven population values. The underscores are only visual digit
# grouping; the float values are unchanged.
pop_lists = [
    [
        "Sub-Saharan Africa",
        227_948_869.0, 293_900_563.0, 388_793_467.0, 516_629_688.0,
        671_212_486.0, 879_797_419.0, 1_151_302_081.0,
    ],
    [
        "Finland",
        4_429_634.0, 4_606_307.0, 4_779_535.0, 4_986_431.0,
        5_176_209.0, 5_363_352.0, 5_529_543.0,
    ],
    [
        "Gabon",
        513_340.0, 597_192.0, 749_078.0, 983_028.0,
        1_272_935.0, 1_711_105.0, 2_292_573.0,
    ],
    [
        "Burkina Faso",
        4_783_259.0, 5_611_666.0, 6_932_967.0, 9_131_361.0,
        11_882_888.0, 16_116_845.0, 21_522_626.0,
    ],
    [
        "Pakistan",
        45_954_226.0, 59_290_872.0, 80_624_057.0, 115_414_069.0,
        154_369_924.0, 194_454_498.0, 227_196_741.0,
    ],
]

In [17]:
# From list of lists: every inner list is one row, so the column labels
# have to be supplied separately
df = pd.DataFrame(
    data=pop_lists,
    columns=["Country", "1960", "1970", "1980", "1990", "2000", "2010", "2020"],
)
df.head()

Unnamed: 0,Country,1960,1970,1980,1990,2000,2010,2020
0,Sub-Saharan Africa,227948869.0,293900563.0,388793467.0,516629688.0,671212486.0,879797419.0,1151302000.0
1,Finland,4429634.0,4606307.0,4779535.0,4986431.0,5176209.0,5363352.0,5529543.0
2,Gabon,513340.0,597192.0,749078.0,983028.0,1272935.0,1711105.0,2292573.0
3,Burkina Faso,4783259.0,5611666.0,6932967.0,9131361.0,11882888.0,16116845.0,21522630.0
4,Pakistan,45954226.0,59290872.0,80624057.0,115414069.0,154369924.0,194454498.0,227196700.0


In [39]:
# Numeric values only, as a 2-D numpy array: one row per country, one column
# per decade (1960-2020). The underscores are only visual digit grouping.
pop_array = np.array(
    [
        [227_948_869.0, 293_900_563.0, 388_793_467.0, 516_629_688.0, 671_212_486.0, 879_797_419.0, 1_151_302_081.0],
        [4_429_634.0, 4_606_307.0, 4_779_535.0, 4_986_431.0, 5_176_209.0, 5_363_352.0, 5_529_543.0],
        [513_340.0, 597_192.0, 749_078.0, 983_028.0, 1_272_935.0, 1_711_105.0, 2_292_573.0],
        [4_783_259.0, 5_611_666.0, 6_932_967.0, 9_131_361.0, 11_882_888.0, 16_116_845.0, 21_522_626.0],
        [45_954_226.0, 59_290_872.0, 80_624_057.0, 115_414_069.0, 154_369_924.0, 194_454_498.0, 227_196_741.0],
    ]
)

In [40]:
# The array above holds only numbers, so keep the country names in a
# separate list (same order as the array's rows)
countries = [
    "Sub-Saharan Africa",
    "Finland",
    "Gabon",
    "Burkina Faso",
    "Pakistan",
]

In [41]:
# From numpy array: the array holds only the numbers, so the column labels are passed in.
# Optional alternative: also pass index=countries to label the rows by country
# instead of the default 0..4.
df = pd.DataFrame(pop_array, columns=['1960', '1970', '1980', '1990', '2000', '2010', '2020'])
df.head()

Unnamed: 0,1960,1970,1980,1990,2000,2010,2020
0,227948869.0,293900563.0,388793467.0,516629688.0,671212486.0,879797419.0,1151302000.0
1,4429634.0,4606307.0,4779535.0,4986431.0,5176209.0,5363352.0,5529543.0
2,513340.0,597192.0,749078.0,983028.0,1272935.0,1711105.0,2292573.0
3,4783259.0,5611666.0,6932967.0,9131361.0,11882888.0,16116845.0,21522630.0
4,45954226.0,59290872.0,80624057.0,115414069.0,154369924.0,194454498.0,227196700.0


In [42]:
# Create new column from a list: assigning to a new label adds the column
# at the end (right-hand side) of the df, modifying df in place
df["Country"] = countries
df.head()

Unnamed: 0,1960,1970,1980,1990,2000,2010,2020,Country
0,227948869.0,293900563.0,388793467.0,516629688.0,671212486.0,879797419.0,1151302000.0,Sub-Saharan Africa
1,4429634.0,4606307.0,4779535.0,4986431.0,5176209.0,5363352.0,5529543.0,Finland
2,513340.0,597192.0,749078.0,983028.0,1272935.0,1711105.0,2292573.0,Gabon
3,4783259.0,5611666.0,6932967.0,9131361.0,11882888.0,16116845.0,21522630.0,Burkina Faso
4,45954226.0,59290872.0,80624057.0,115414069.0,154369924.0,194454498.0,227196700.0,Pakistan


## Moving columns

In [43]:
# Want to move country column to start
columns = df.columns.to_list()
# pop() removes and returns the last label ("Country", which was added last);
# insert(0, ...) puts it back at the front of the list
columns.insert(0, columns.pop())
print(columns)

['Country', '1960', '1970', '1980', '1990', '2000', '2010', '2020']


In [44]:
# Passing a list of column labels to the df in brackets returns a df with
# just those columns, in the order they were listed
df = df[columns]
df.head()

Unnamed: 0,Country,1960,1970,1980,1990,2000,2010,2020
0,Sub-Saharan Africa,227948869.0,293900563.0,388793467.0,516629688.0,671212486.0,879797419.0,1151302000.0
1,Finland,4429634.0,4606307.0,4779535.0,4986431.0,5176209.0,5363352.0,5529543.0
2,Gabon,513340.0,597192.0,749078.0,983028.0,1272935.0,1711105.0,2292573.0
3,Burkina Faso,4783259.0,5611666.0,6932967.0,9131361.0,11882888.0,16116845.0,21522630.0
4,Pakistan,45954226.0,59290872.0,80624057.0,115414069.0,154369924.0,194454498.0,227196700.0


## Loading data

In [57]:
# Load csv file into a DataFrame (the path is relative, so "world_pop.csv"
# must be in the notebook's working directory)
data = pd.read_csv("world_pop.csv")
data.head()

Unnamed: 0,Country,1960,1970,1980,1990,2000,2010,2020
0,Aruba,54608.0,59106.0,62267.0,65712.0,89101.0,100341.0,106585.0
1,Africa Eastern and Southern,130692579.0,172475766.0,230967858.0,309890664.0,401600588.0,523459657.0,685112979.0
2,Afghanistan,8622466.0,10752971.0,12486631.0,10694796.0,19542982.0,28189672.0,38972230.0
3,Africa Western and Central,97256290.0,121424797.0,157825609.0,206739024.0,269611898.0,356337762.0,466189102.0
4,Angola,5357195.0,6029700.0,8330047.0,11828638.0,16394062.0,23364185.0,33428486.0


# Additional material (not covered in the session)

## Sampling data

In [79]:
# Random sample of 50 rows; fixing random_state makes the same rows come
# back on every run. The sampled rows keep their original index labels.
df2 = data.sample(n=50, random_state=12345)
df2.head()

Unnamed: 0,Country,1960,1970,1980,1990,2000,2010,2020
229,Chad,3028688.0,3667394.0,4408230.0,5827069.0,8259137.0,11894727.0,16644701.0
175,Nicaragua,1789684.0,2444767.0,3303309.0,4227820.0,5123222.0,5855734.0,6755895.0
199,French Polynesia,84851.0,117891.0,163591.0,211089.0,250927.0,283788.0,301920.0
220,Suriname,268396.0,379918.0,375112.0,412756.0,478998.0,546080.0,607065.0
50,Cuba,7267395.0,8869636.0,9809107.0,10626680.0,11105791.0,11290417.0,11300698.0


In [1]:
# Print a summary of the sampled df.
# NOTE(review): the saved output for this cell is a NameError with execution
# count 1 - apparently it was last run before the sampling cell that defines df2.
# Re-run the notebook top to bottom to refresh it.
df2.info()

NameError: name 'df2' is not defined

In [86]:
# Rename column (passed as a dictionary of {old name: new name}).
# rename() ignores labels that are not present by default (errors="ignore"),
# so no try/except is needed here.
df2 = df2.rename(columns={"Country": "Country Name"})
df2.head()

Unnamed: 0,Country Name,1960,1970,1980,1990,2000,2010,2020
229,Chad,3028688.0,3667394.0,4408230.0,5827069.0,8259137.0,11894727.0,16644701.0
175,Nicaragua,1789684.0,2444767.0,3303309.0,4227820.0,5123222.0,5855734.0,6755895.0
199,French Polynesia,84851.0,117891.0,163591.0,211089.0,250927.0,283788.0,301920.0
220,Suriname,268396.0,379918.0,375112.0,412756.0,478998.0,546080.0,607065.0
50,Cuba,7267395.0,8869636.0,9809107.0,10626680.0,11105791.0,11290417.0,11300698.0


In [81]:
# Remove column. drop() raises KeyError when a requested label does not exist,
# so catch exactly that: a bare `except:` would also hide unrelated problems
# (a typo in a variable name, a keyboard interrupt, ...).
try:
    df2 = df2.drop(columns=["Country Name 1"])
except KeyError:
    print("No column: Country Name 1")

# Show the result either way. As the last expression of the cell it is actually
# displayed; inside the try block it was silently discarded.
df2.head()

No column: Country Name 1


In [85]:
# Resetting index makes old index a column (named "index");
# the result is only displayed here, df2 itself is not changed
df2.reset_index()

Unnamed: 0,index,Country Name,1960,1970,1980,1990,2000,2010,2020
0,229,Chad,3028688.0,3667394.0,4408230.0,5827069.0,8259137.0,11894730.0,16644700.0
1,175,Nicaragua,1789684.0,2444767.0,3303309.0,4227820.0,5123222.0,5855734.0,6755895.0
2,199,French Polynesia,84851.0,117891.0,163591.0,211089.0,250927.0,283788.0,301920.0
3,220,Suriname,268396.0,379918.0,375112.0,412756.0,478998.0,546080.0,607065.0
4,50,Cuba,7267395.0,8869636.0,9809107.0,10626680.0,11105790.0,11290420.0,11300700.0
5,140,Low & middle income,2243179000.0,2805573000.0,3469529000.0,4246144000.0,5017754000.0,5760865000.0,6550747000.0
6,134,Latin America & Caribbean,219142600.0,285779000.0,361451200.0,441597500.0,521281100.0,588873900.0,650535000.0
7,190,Poland,29637450.0,32664300.0,35574150.0,38110780.0,38258630.0,38042790.0,37899070.0
8,85,Guinea,3516814.0,4222374.0,4972609.0,6354145.0,8336967.0,10270730.0,13205150.0
9,219,Sao Tome and Principe,68038.0,77583.0,97210.0,120343.0,143714.0,182138.0,218641.0


In [88]:
# Use drop argument to prevent that: the old index is discarded instead of
# being kept as a column, and the result is stored back in df2
df2 = df2.reset_index(drop=True)
df2.head()

Unnamed: 0,Country Name,1960,1970,1980,1990,2000,2010,2020
0,Chad,3028688.0,3667394.0,4408230.0,5827069.0,8259137.0,11894727.0,16644701.0
1,Nicaragua,1789684.0,2444767.0,3303309.0,4227820.0,5123222.0,5855734.0,6755895.0
2,French Polynesia,84851.0,117891.0,163591.0,211089.0,250927.0,283788.0,301920.0
3,Suriname,268396.0,379918.0,375112.0,412756.0,478998.0,546080.0,607065.0
4,Cuba,7267395.0,8869636.0,9809107.0,10626680.0,11105791.0,11290417.0,11300698.0


In [91]:
# Concatenate two dfs. Note the index from each carries over
pd.concat([df, df2], axis=0)

Unnamed: 0,Country,1960,1970,1980,1990,2000,2010,2020,Country Name
0,Sub-Saharan Africa,227948900.0,293900600.0,388793500.0,516629700.0,671212500.0,879797400.0,1151302000.0,
1,Finland,4429634.0,4606307.0,4779535.0,4986431.0,5176209.0,5363352.0,5529543.0,
2,Gabon,513340.0,597192.0,749078.0,983028.0,1272935.0,1711105.0,2292573.0,
3,Burkina Faso,4783259.0,5611666.0,6932967.0,9131361.0,11882890.0,16116840.0,21522630.0,
4,Pakistan,45954230.0,59290870.0,80624060.0,115414100.0,154369900.0,194454500.0,227196700.0,
0,,3028688.0,3667394.0,4408230.0,5827069.0,8259137.0,11894730.0,16644700.0,Chad
1,,1789684.0,2444767.0,3303309.0,4227820.0,5123222.0,5855734.0,6755895.0,Nicaragua
2,,84851.0,117891.0,163591.0,211089.0,250927.0,283788.0,301920.0,French Polynesia
3,,268396.0,379918.0,375112.0,412756.0,478998.0,546080.0,607065.0,Suriname
4,,7267395.0,8869636.0,9809107.0,10626680.0,11105790.0,11290420.0,11300700.0,Cuba


In [90]:
# Ignore index to make unique indices
pd.concat([df, df2], axis=0, ignore_index=True)

Unnamed: 0,Country,1960,1970,1980,1990,2000,2010,2020,Country Name
0,Sub-Saharan Africa,227948900.0,293900600.0,388793500.0,516629700.0,671212500.0,879797400.0,1151302000.0,
1,Finland,4429634.0,4606307.0,4779535.0,4986431.0,5176209.0,5363352.0,5529543.0,
2,Gabon,513340.0,597192.0,749078.0,983028.0,1272935.0,1711105.0,2292573.0,
3,Burkina Faso,4783259.0,5611666.0,6932967.0,9131361.0,11882890.0,16116840.0,21522630.0,
4,Pakistan,45954230.0,59290870.0,80624060.0,115414100.0,154369900.0,194454500.0,227196700.0,
5,,3028688.0,3667394.0,4408230.0,5827069.0,8259137.0,11894730.0,16644700.0,Chad
6,,1789684.0,2444767.0,3303309.0,4227820.0,5123222.0,5855734.0,6755895.0,Nicaragua
7,,84851.0,117891.0,163591.0,211089.0,250927.0,283788.0,301920.0,French Polynesia
8,,268396.0,379918.0,375112.0,412756.0,478998.0,546080.0,607065.0,Suriname
9,,7267395.0,8869636.0,9809107.0,10626680.0,11105790.0,11290420.0,11300700.0,Cuba


In [93]:
# What happens if you concatenate dfs with different columns?
df3 = df.drop(columns=["1980"])
df4 = df2.drop(columns=["1990"])
pd.concat([df3, df4], ignore_index=True)

Unnamed: 0,Country,1960,1970,1990,2000,2010,2020,Country Name,1980
0,Sub-Saharan Africa,227948900.0,293900600.0,516629688.0,671212500.0,879797400.0,1151302000.0,,
1,Finland,4429634.0,4606307.0,4986431.0,5176209.0,5363352.0,5529543.0,,
2,Gabon,513340.0,597192.0,983028.0,1272935.0,1711105.0,2292573.0,,
3,Burkina Faso,4783259.0,5611666.0,9131361.0,11882890.0,16116840.0,21522630.0,,
4,Pakistan,45954230.0,59290870.0,115414069.0,154369900.0,194454500.0,227196700.0,,
5,,3028688.0,3667394.0,,8259137.0,11894730.0,16644700.0,Chad,4408230.0
6,,1789684.0,2444767.0,,5123222.0,5855734.0,6755895.0,Nicaragua,3303309.0
7,,84851.0,117891.0,,250927.0,283788.0,301920.0,French Polynesia,163591.0
8,,268396.0,379918.0,,478998.0,546080.0,607065.0,Suriname,375112.0
9,,7267395.0,8869636.0,,11105790.0,11290420.0,11300700.0,Cuba,9809107.0
