# Code-along: concatenating, merging and joining DataFrames

In [3]:
import pandas as pd 
import numpy as np 

# the integers 0..15 laid out row by row in a 4x4 array
np.arange(16).reshape(4, 4)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [5]:
# 3 rows x 4 columns filled with float zeros
np.zeros((3,4))

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [6]:
# 4 rows x 3 columns filled with float ones
np.ones((4,3))

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [10]:
# 4x4 frame holding the integers 0..15, with columns labelled A-D
df1 = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    columns=["A", "B", "C", "D"],
)
df1

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [12]:
# 3x4 frame of float zeros, using the same column labels A-D as df1
df2 = pd.DataFrame(
    data=np.zeros(shape=(3, 4)),
    columns=["A", "B", "C", "D"],
)
df2

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0


In [13]:
# 4x3 frame of float ones: default integer column labels, custom row labels
df3 = pd.DataFrame(
    data=np.ones(shape=(4, 3)),
    index=[1, 5, 10, 15],
)
df3

Unnamed: 0,0,1,2
1,1.0,1.0,1.0
5,1.0,1.0,1.0
10,1.0,1.0,1.0
15,1.0,1.0,1.0


---
## Concat

- concatenates pandas objects along an axis (rows, columns)
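- set logic decides how the labels on the other axis are combined: `join="outer"` (the default) keeps the union, `join="inner"` keeps the intersection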

In [14]:
# show both inputs together before concatenating them
df1, df2

(    A   B   C   D
 0   0   1   2   3
 1   4   5   6   7
 2   8   9  10  11
 3  12  13  14  15,
      A    B    C    D
 0  0.0  0.0  0.0  0.0
 1  0.0  0.0  0.0  0.0
 2  0.0  0.0  0.0  0.0)

In [None]:
# axis=0 stacks the rows of df2 below the rows of df1 (both frames have columns A-D)
# NOTE: this cell has no recorded output (In [None]) - re-run it before sharing the notebook
pd.concat([df1, df2], axis = 0)

In [18]:
# places all columns of df1 first, then all columns of df2, aligning rows on the index
# index labels missing from one frame (here row 3, which df2 lacks) are filled with NaN
pd.concat([df1, df2], axis = "columns")

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,0,1,2,3,0.0,0.0,0.0,0.0
1,4,5,6,7,0.0,0.0,0.0,0.0
2,8,9,10,11,0.0,0.0,0.0,0.0
3,12,13,14,15,,,,


In [19]:
# show both inputs: df1 has index 0-3, df3 has index 1, 5, 10, 15
df1, df3

(    A   B   C   D
 0   0   1   2   3
 1   4   5   6   7
 2   8   9  10  11
 3  12  13  14  15,
       0    1    2
 1   1.0  1.0  1.0
 5   1.0  1.0  1.0
 10  1.0  1.0  1.0
 15  1.0  1.0  1.0)

In [25]:
# concatenates along the row axis: df1 rows first, then df3 rows
# the two frames share no column labels, so every missing cell is filled with NaN
pd.concat([df1, df3],axis = "rows")

Unnamed: 0,A,B,C,D,0,1,2
0,0.0,1.0,2.0,3.0,,,
1,4.0,5.0,6.0,7.0,,,
2,8.0,9.0,10.0,11.0,,,
3,12.0,13.0,14.0,15.0,,,
1,,,,,1.0,1.0,1.0
5,,,,,1.0,1.0,1.0
10,,,,,1.0,1.0,1.0
15,,,,,1.0,1.0,1.0


In [23]:
# the default is join="outer": the result keeps the union of both indices
# only index label 1 exists in both frames, so every other row has NaN on one side
pd.concat([df1, df3], axis = "columns")

Unnamed: 0,A,B,C,D,0,1,2
0,0.0,1.0,2.0,3.0,,,
1,4.0,5.0,6.0,7.0,1.0,1.0,1.0
2,8.0,9.0,10.0,11.0,,,
3,12.0,13.0,14.0,15.0,,,
5,,,,,1.0,1.0,1.0
10,,,,,1.0,1.0,1.0
15,,,,,1.0,1.0,1.0


In [24]:
# join="inner" keeps only the intersection of the df1 and df3 indices (here just label 1)
pd.concat([df1, df3], axis = "columns", join = "inner")

Unnamed: 0,A,B,C,D,0,1,2
1,4,5,6,7,1.0,1.0,1.0


---
## Merge

In [26]:
# Two example frames that share a "key" column.
# "K0" occurs twice in left; "K3" occurs only in right.
left = pd.DataFrame(
    data=dict(
        key=["K0", "K0", "K1", "K2"],
        A=["A0", "A1", "A2", "A3"],
        B=["B0", "B1", "B2", "B3"],
    )
)

right = pd.DataFrame(
    data=dict(
        key=["K0", "K1", "K2", "K3"],
        C=["C0", "C1", "C2", "C3"],
        D=["D0", "D1", "D2", "D3"],
    )
)

left, right

(  key   A   B
 0  K0  A0  B0
 1  K0  A1  B1
 2  K1  A2  B2
 3  K2  A3  B3,
   key   C   D
 0  K0  C0  D0
 1  K1  C1  D1
 2  K2  C2  D2
 3  K3  C3  D3)

In [30]:
# function syntax: pd.merge(left, right, ...)
# the default is an inner join on "key", so K3 (present only in right) is not included
# indicator=True adds the _merge column telling which frame(s) each row came from
pd.merge(left, right, on = "key", indicator=True)

Unnamed: 0,key,A,B,C,D,_merge
0,K0,A0,B0,C0,D0,both
1,K0,A1,B1,C0,D0,both
2,K1,A2,B2,C1,D1,both
3,K2,A3,B3,C2,D2,both


In [31]:
# method syntax: the same inner join on "key", called on the left frame
left.merge(right, on = "key", indicator=True)

Unnamed: 0,key,A,B,C,D,_merge
0,K0,A0,B0,C0,D0,both
1,K0,A1,B1,C0,D0,both
2,K1,A2,B2,C1,D1,both
3,K2,A3,B3,C2,D2,both


In [33]:
# how="outer" -> union of the keys found in left and right
# left has no K3, so row 4 comes from right only: A and B are NaN and _merge is right_only
left.merge(right, on = "key", how = "outer", indicator=True)

Unnamed: 0,key,A,B,C,D,_merge
0,K0,A0,B0,C0,D0,both
1,K0,A1,B1,C0,D0,both
2,K1,A2,B2,C1,D1,both
3,K2,A3,B3,C2,D2,both
4,K3,,,C3,D3,right_only


In [34]:
# display right on its own for reference
right

Unnamed: 0,key,C,D
0,K0,C0,D0
1,K1,C1,D1
2,K2,C2,D2
3,K3,C3,D3


In [35]:
# Rebuild left/right with a two-column key (key1, key2).
# Both frames also carry a non-key column named "A", so merging them needs suffixes.
left = pd.DataFrame(
    data=dict(
        key1=["K0", "K0", "K1", "K2"],
        key2=["K0", "K1", "K0", "K1"],
        A=["A0", "A1", "A2", "A3"],
        B=["B0", "B1", "B2", "B3"],
    )
)

right = pd.DataFrame(
    data=dict(
        key1=["K0", "K1", "K1", "K2"],
        key2=["K0", "K0", "K0", "K0"],
        A=["A0", "C1", "C2", "C3"],
        D=["D0", "D1", "D2", "D3"],
    )
)

left, right

(  key1 key2   A   B
 0   K0   K0  A0  B0
 1   K0   K1  A1  B1
 2   K1   K0  A2  B2
 3   K2   K1  A3  B3,
   key1 key2   A   D
 0   K0   K0  A0  D0
 1   K1   K0  C1  D1
 2   K1   K0  C2  D2
 3   K2   K0  C3  D3)

In [39]:
# merge on the column pair (key1, key2); the default inner join keeps only pairs present in both frames
# (K1, K0) occurs twice in right, so the matching left row appears twice in the result
# the overlapping non-key column A gets the default suffixes: A_x (from left) and A_y (from right)
left.merge(right, on = ["key1", "key2"], indicator=True)

Unnamed: 0,key1,key2,A_x,B,A_y,D,_merge
0,K0,K0,A0,B0,A0,D0,both
1,K1,K0,A2,B2,C1,D1,both
2,K1,K0,A2,B2,C2,D2,both


In [41]:
# show both two-key frames again for reference
left, right

(  key1 key2   A   B
 0   K0   K0  A0  B0
 1   K0   K1  A1  B1
 2   K1   K0  A2  B2
 3   K2   K1  A3  B3,
   key1 key2   A   D
 0   K0   K0  A0  D0
 1   K1   K0  C1  D1
 2   K1   K0  C2  D2
 3   K2   K0  C3  D3)

In [42]:
# how="left" keeps every row of left; key pairs with no match in right get NaN and _merge == left_only
# A_x is column A from left, A_y is column A from right
left.merge(right, on = ["key1", "key2"], indicator=True, how = "left")

Unnamed: 0,key1,key2,A_x,B,A_y,D,_merge
0,K0,K0,A0,B0,A0,D0,both
1,K0,K1,A1,B1,,,left_only
2,K1,K0,A2,B2,C1,D1,both
3,K1,K0,A2,B2,C2,D2,both
4,K2,K1,A3,B3,,,left_only


In [44]:
# how="right" keeps every row of right; (K2, K0) has no match in left, so it is right_only
# suffixes replaces the default _x/_y on the overlapping column A with _left/_right
left.merge(right, on = ["key1", "key2"], indicator=True, how = "right", suffixes=["_left", "_right"])

Unnamed: 0,key1,key2,A_left,B,A_right,D,_merge
0,K0,K0,A0,B0,A0,D0,both
1,K1,K0,A2,B2,C1,D1,both
2,K1,K0,A2,B2,C2,D2,both
3,K2,K0,,,C3,D3,right_only


---
## Join

- uses merge internally
- combines the columns of two DataFrames by aligning them on their index (a left join by default)

In [45]:
# Frames keyed by their index instead of a column.
# K1 exists only in left, K3 only in right.
left = pd.DataFrame(
    data=dict(A=["A0", "A1", "A2"], B=["B0", "B1", "B2"]),
    index=["K0", "K1", "K2"],
)

right = pd.DataFrame(
    data=dict(C=["C0", "C2", "C3"], D=["D0", "D2", "D3"]),
    index=["K0", "K2", "K3"],
)

left, right

(     A   B
 K0  A0  B0
 K1  A1  B1
 K2  A2  B2,
      C   D
 K0  C0  D0
 K2  C2  D2
 K3  C3  D3)

In [48]:
# join aligns the two frames on their index; with no `how` given the result is a left join
# every label of left is kept, so K1 (absent from right) gets NaN in C and D, and K3 is dropped
left.join(right)

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K1,A1,B1,,
K2,A2,B2,C2,D2


In [49]:
# how="right" keeps the index labels of right: K3 (absent from left) gets NaN in A and B, and K1 is dropped
left.join(right, how = "right")

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K2,A2,B2,C2,D2
K3,,,C3,D3


---
## Webscraping HTML tables

In [54]:
# read_html returns a list with one DataFrame per HTML table parsed from the page
# NOTE: this fetches the live Wikipedia page, so the number and content of the tables can change between runs
fifa_tables = pd.read_html("https://en.wikipedia.org/wiki/FIFA")
len(fifa_tables), type(fifa_tables)

(31, list)

In [56]:
# first five rows of the first parsed table
fifa_tables[0].head()

Unnamed: 0,0,1
0,Fédération internationale de Football Associat...,Fédération internationale de Football Associat...
1,Logo,Logo
2,Map of the members of FIFA according to their ...,Map of the members of FIFA according to their ...
3,Abbreviation,FIFA[1]
4,Founded,21 May 1904; 118 years ago


In [58]:
# the second parsed table, shown in full (it is only a few rows long)
fifa_tables[1]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1
0,,
1,Use,Sport
2,Proportion,3:5
3,Adopted,2018
4,Design,Blue field with a FIFA logo


In [60]:
# first five rows of the third parsed table (columns No., Name, Country, Took office, ...)
fifa_tables[2].head()

Unnamed: 0,No.,Name,Country,Took office,Left office,Note
0,1,Robert Guérin,France,23 May 1904,4 June 1906,
1,2,Daniel Burley Woolfall,United Kingdom,4 June 1906,24 October 1918,Died in office
2,—,Cornelis August Wilhelm Hirschman,Netherlands,24 October 1918,1920,Acting
3,3,Jules Rimet,France,1 March 1921,21 June 1954,
4,4,Rodolphe Seeldrayers,Belgium,21 June 1954,7 October 1955,Died in office


In [62]:
# match="Runners-up" restricts the result to tables whose text contains that string
# the return value is still a list; here exactly one table matches
title_holders = pd.read_html("https://en.wikipedia.org/wiki/FIFA", match = "Runners-up")

len(title_holders)

1

In [65]:
# header=0 uses the first table row as the column labels; [0] picks the single matching table
# NOTE: title_holders was a list in the previous cell; here it is rebound to a DataFrame,
# and the page is downloaded again
title_holders = pd.read_html("https://en.wikipedia.org/wiki/FIFA", match = "Runners-up", header = 0)[0]

title_holders.head()

Unnamed: 0,Competition,Unnamed: 1,Year,Champions,Title,Runners-up,Unnamed: 6,Next edition[55]
0,National teams,National teams,National teams,National teams,National teams,National teams,National teams,National teams
1,FIFA World Cup,,2018 (Final),France,2nd,Croatia,,2022 (Final)
2,Men's Olympic Football Tournament (U-23),,2020 (Final),Brazil,2nd,Spain,,2024 (Final)
3,FIFA U-20 World Cup,,2019 (Final),Ukraine,1st,South Korea,,2023 (Final)
4,FIFA U-17 World Cup,,2019 (Final),Brazil,4th,Mexico,,2023 (Final)


In [67]:
# select the Competition column as a Series
title_holders["Competition"]

0                                        National teams
1                                        FIFA World Cup
2              Men's Olympic Football Tournament (U-23)
3                                   FIFA U-20 World Cup
4                                   FIFA U-17 World Cup
5                                 FIFA Futsal World Cup
6          Men's Youth Olympic Futsal Tournament (U-20)
7            FIFA Beach Soccer World Cup (see the BSWW)
8     FIFA Arab Cup (senior teams of the UAFA (Arab ...
9                                National teams (women)
10                               FIFA Women's World Cup
11                  Women's Olympic Football Tournament
12                          FIFA U-20 Women's World Cup
13                          FIFA U-17 Women's World Cup
14       Women's Youth Olympic Futsal Tournament (U-20)
15                                            Club team
16                                  FIFA Club World Cup
17                            Blue Stars/FIFA Yo