# <h1 align='center'>Extracting Data from Excel using Python</h1>
**<center><font color='grey'>Speaker: Samuel Oranyeli</font></center>**

### **Import libraries**

In [1]:
import pandas as pd
import numpy as np
import janitor
from openpyxl import load_workbook
from collections import defaultdict

### Reading Excel data into pandas is easy:

- Read a single file: 

In [2]:
# Load one named sheet from the workbook; a single sheet name gives back one DataFrame.
pd.read_excel(io = "names.xlsx", sheet_name = "naija")

Unnamed: 0,Name,Age,Height
0,Tolu,24,2.0
1,Chukwuka,50,1.8
2,Ogor,15,1.5


- Read in multiple sheets from a single file: 

In [3]:
# Passing a list of sheet names returns a dict of DataFrames keyed by sheet name.
pd.read_excel(io = "names.xlsx", sheet_name = ["naija", "vikings"])

{'naija':        Name  Age  Height
 0      Tolu   24     2.0
 1  Chukwuka   50     1.8
 2      Ogor   15     1.5,
 'vikings':      Name  Age  Height
 0  ragnar   32     1.6
 1   bjorn   49     2.2
 2    loki  400     1.8}

- Read in all the sheets from a single file:

In [4]:
# sheet_name=None loads every sheet in the workbook into a dict keyed by sheet name.
pd.read_excel(io = "names.xlsx", sheet_name= None)

{'naija':        Name  Age  Height
 0      Tolu   24     2.0
 1  Chukwuka   50     1.8
 2      Ogor   15     1.5,
 'vikings':      Name  Age  Height
 0  ragnar   32     1.6
 1   bjorn   49     2.2
 2    loki  400     1.8,
 'olympus':        Name  Age  Height
 0      zeus   65     1.9
 1  poseidon   26     2.2
 2    athena   30     1.4,
 'defined_tables':        Name  Age  Height  Unnamed: 3  Name.1  Age.1  Height.1  Unnamed: 7  \
 0      Tolu   24     2.0         NaN  ragnar     32       1.6         NaN   
 1  Chukwuka   50     1.8         NaN   bjorn     49       2.2         NaN   
 2      Ogor   15     1.5         NaN    loki    400       1.8         NaN   
 
      Name.2  Age.2  Height.2  
 0      zeus     65       1.9  
 1  poseidon     26       2.2  
 2    athena     30       1.4  ,
 'position':     Unnamed: 0  Unnamed: 1 Unnamed: 2 Unnamed: 3 Unnamed: 4  Unnamed: 5  \
 0          NaN         NaN       Name        Age     Height         NaN   
 1          NaN         NaN       Tol

When several sheets are requested, a dictionary is returned, with each sheet name as the key and the corresponding dataframe as the value.

- If the sheets have the same columns, you can combine them into one dataframe:

In [5]:
# Read three sheets with identical columns into a dict of DataFrames ...
excel_dfs = pd.read_excel(io = "names.xlsx", sheet_name= ["naija", "vikings", "olympus"])
# ... and stack them; the dict keys (sheet names) become the outer index level.
pd.concat(excel_dfs)

Unnamed: 0,Unnamed: 1,Name,Age,Height
naija,0,Tolu,24,2.0
naija,1,Chukwuka,50,1.8
naija,2,Ogor,15,1.5
vikings,0,ragnar,32,1.6
vikings,1,bjorn,49,2.2
vikings,2,loki,400,1.8
olympus,0,zeus,65,1.9
olympus,1,poseidon,26,2.2
olympus,2,athena,30,1.4


- The sheet names become the outer level of the new dataframe's index; the original row numbers of each sheet form the inner level.

In [6]:
# Cleaner output: drop the innermost index level (the per-sheet row numbers),
# name the remaining level "Sheet_name", and turn it into an ordinary column.
pd.concat(excel_dfs).droplevel(-1).rename_axis(index="Sheet_name").reset_index()

Unnamed: 0,Sheet_name,Name,Age,Height
0,naija,Tolu,24,2.0
1,naija,Chukwuka,50,1.8
2,naija,Ogor,15,1.5
3,vikings,ragnar,32,1.6
4,vikings,bjorn,49,2.2
5,vikings,loki,400,1.8
6,olympus,zeus,65,1.9
7,olympus,poseidon,26,2.2
8,olympus,athena,30,1.4


In [7]:
# If the data does not start at column "A" or at row 1:
# usecols="C:E" keeps only Excel columns C through E, header=1 takes the second
# row (0-indexed) as the column names, and nrows=3 reads three data rows.
pd.read_excel("names.xlsx", sheet_name = "position", usecols = "C:E", header = 1, nrows=3)

Unnamed: 0,Name,Age,Height
0,Tolu,24,2.0
1,Chukwuka,50,1.8
2,Ogor,15,1.5


In [8]:
# Read every file found directly inside a folder into a dictionary:
# key = file name without its extension, value = the DataFrame read from it.
from pathlib import Path

folder = Path("excel_files")

excel_data = {}
for filename in folder.iterdir():
    excel_data[filename.stem] = pd.read_excel(filename)

excel_data

{'naija':        Name  Age  Height
 0      Tolu   24     2.0
 1  Chukwuka   50     1.8
 2      Ogor   15     1.5,
 'vikings':      Name  Age  Height
 0  ragnar   32     1.6
 1   bjorn   49     2.2
 2    loki  400     1.8,
 'olympus':        Name  Age  Height
 0      zeus   65     1.9
 1  poseidon   26     2.2
 2    athena   30     1.4}

In [9]:
# Read only the Excel files in a folder (sub-folders are searched as well).
# The previous glob pattern "*.[xX]*" matched every extension that starts with
# "x" (for example .xml), and it also picked up the "~$..." lock files Excel
# creates while a workbook is open. Filtering on an explicit set of suffixes
# keeps non-Excel files away from pd.read_excel.
excel_suffixes = {".xls", ".xlsx", ".xlsm", ".xlsb"}

folder = Path("mixed_files")
excel_data = {filename.stem : pd.read_excel(filename)
              for filename in folder.rglob("*")
              if filename.is_file()
              and filename.suffix.lower() in excel_suffixes   # case-insensitive match
              and not filename.name.startswith("~$")}         # skip Excel lock files
excel_data

{'naija':        Name  Age  Height
 0      Tolu   24     2.0
 1  Chukwuka   50     1.8
 2      Ogor   15     1.5,
 'vikings':      Name  Age  Height
 0  ragnar   32     1.6
 1   bjorn   49     2.2
 2    loki  400     1.8,
 'olympus':        Name  Age  Height
 0      zeus   65     1.9
 1  poseidon   26     2.2
 2    athena   30     1.4}

## Pivot Table with multiple headers

In [10]:
# Default read: only the first row is used as the header, so the second header
# row and the row labels end up as ordinary data / "Unnamed" columns.
pd.read_excel('titanic.xlsx')

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Age,Child,Unnamed: 4,Adult,Unnamed: 6
0,,,Survived,No,Yes,No,Yes
1,Class,Sex,,,,,
2,1st,Male,,0,5,118,57
3,,Female,,0,1,4,140
4,2nd,Male,,0,11,154,14
5,,Female,,0,13,13,80
6,3rd,Male,,35,13,387,75
7,,Female,,17,14,89,76
8,Crew,Male,,0,0,670,192
9,,Female,,0,0,3,20


In [11]:
# header=[0,1] uses the first two rows as a two-level column header;
# index_col=[0,1] uses the first two columns as a two-level row index.
pd.read_excel('titanic.xlsx', header = [0,1], index_col=[0,1])

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Child,Child,Adult,Adult
Unnamed: 0_level_1,Unnamed: 1_level_1,Survived,No,Yes,No,Yes
Class,Sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1st,Male,,0,5,118,57
1st,Female,,0,1,4,140
2nd,Male,,0,11,154,14
2nd,Female,,0,13,13,80
3rd,Male,,35,13,387,75
3rd,Female,,17,14,89,76
Crew,Male,,0,0,670,192
Crew,Female,,0,0,3,20


In [12]:
# Reshape the pivot-style sheet into one row per (Class, Sex, Adult_or_Child, Survived).
(pd.read_excel('titanic.xlsx', header = [0,1], index_col=[0,1])
 # give the two column levels meaningful names
 .rename_axis(columns=["Adult_or_Child","Survived"])
 # discard columns that contain no values at all
 .dropna(how='all',axis=1)
 # move both column levels into the row index, producing a Series
 .stack(["Adult_or_Child","Survived"])
 # back to a flat DataFrame; the stacked values go into a column named "Numbers"
 .reset_index(name="Numbers")
)

Unnamed: 0,Class,Sex,Adult_or_Child,Survived,Numbers
0,1st,Male,Adult,No,118
1,1st,Male,Adult,Yes,57
2,1st,Male,Child,No,0
3,1st,Male,Child,Yes,5
4,1st,Female,Adult,No,4
5,1st,Female,Adult,Yes,140
6,1st,Female,Child,No,0
7,1st,Female,Child,Yes,1
8,2nd,Male,Adult,No,154
9,2nd,Male,Adult,Yes,14


## Excel Tables: 

In [13]:
# open the workbook and select the sheet that holds the defined tables
wb = load_workbook(filename="names.xlsx")
ws = wb["defined_tables"]

# one DataFrame per table on the sheet, keyed by the table's name
named_tables = {}
for table_name, value in ws.tables.items():
    # `value` is used as a key into the worksheet to fetch the table's cells;
    # the first row of cell values is the header, the remaining rows are the body
    header, *body = ([cell.value for cell in row] for row in ws[value])
    dataframe = pd.DataFrame(body, columns=header)
    named_tables[table_name] = dataframe

In [14]:
# Inspect the extracted DataFrames, keyed by table name.
named_tables

{'naija':        Name  Age  Height
 0      Tolu   24     2.0
 1  Chukwuka   50     1.8
 2      Ogor   15     1.5,
 'olympus':        Name  Age  Height
 0      zeus   65     1.9
 1  poseidon   26     2.2
 2    athena   30     1.4,
 'vikings':      Name  Age  Height
 0  ragnar   32     1.6
 1   bjorn   49     2.2
 2    loki  400     1.8}

In [15]:
# Stack the tables, drop the innermost (per-table row number) index level, and
# expose the table name as an ordinary "Table_name" column.
pd.concat(named_tables).droplevel(-1).rename_axis(index="Table_name").reset_index()

Unnamed: 0,Table_name,Name,Age,Height
0,naija,Tolu,24,2.0
1,naija,Chukwuka,50,1.8
2,naija,Ogor,15,1.5
3,olympus,zeus,65,1.9
4,olympus,poseidon,26,2.2
5,olympus,athena,30,1.4
6,vikings,ragnar,32,1.6
7,vikings,bjorn,49,2.2
8,vikings,loki,400,1.8


In [16]:
#complete code :

# open the workbook and select the sheet that holds the defined tables
wb = load_workbook(filename="names.xlsx")
ws = wb["defined_tables"]

# build one DataFrame per table: first row of cell values is the header
named_tables = {}
for table_name, value in ws.tables.items():
    header, *body = ([cell.value for cell in row] for row in ws[value])
    dataframe = pd.DataFrame(body, columns=header)
    named_tables[table_name] = dataframe

# Stack all tables into one frame and turn the dict keys into a column.
# NOTE(review): the keys are table names, yet the column is labelled
# "Sheet_name" here (the step-by-step cell uses "Table_name") - confirm
# which label is intended.
(
    pd.concat(named_tables)
    .droplevel(-1)
    .rename_axis(index="Sheet_name")
    .reset_index()
)

Unnamed: 0,Sheet_name,Name,Age,Height
0,naija,Tolu,24,2.0
1,naija,Chukwuka,50,1.8
2,naija,Ogor,15,1.5
3,olympus,zeus,65,1.9
4,olympus,poseidon,26,2.2
5,olympus,athena,30,1.4
6,vikings,ragnar,32,1.6
7,vikings,bjorn,49,2.2
8,vikings,loki,400,1.8


## Conditional Formatting

In [17]:
# open the workbook and select the sheet to inspect
wb = load_workbook(filename="names.xlsx")
ws = wb["olympus"]

# Collect the cells column by column under keys "C<column number>".
# Cells whose data_type is "n" are stored as "<value>,<fill colour rgb>";
# every other cell is stored as its plain value.
data = defaultdict(list)
for column in ws.columns:
    for cell in column:
        column_key = f"C{cell.column}"
        if cell.data_type != "n":
            data[column_key].append(cell.value)
            continue
        value_and_colour = str(cell.value) + "," + cell.fill.fgColor.rgb
        data[column_key].append(value_and_colour)

data

defaultdict(list,
            {'C1': ['Name', 'zeus', 'poseidon', 'athena'],
             'C2': ['Age', '65,00000000', '26,FFFFFF00', '30,00000000'],
             'C3': ['Height', '1.9,00000000', '2.2,00000000', '1.4,FF00A933']})

In [18]:
# Turn the collected "value,colour" strings into a tidy table with one row per
# (Name, Variable) pair and separate Value / Colour columns.
(pd.DataFrame(data)
 # promote the first row (Name, Age, Height) to column names and drop it from the data
 .row_to_names(row_number = 0,remove_row = True) #pyjanitor
 .set_index("Name")
 # one row per (Name, column) pair
 .stack()
 # split "value,colour" into two columns
 .str.split(",",expand=True)
 .set_axis(["Value","Colour"], axis="columns")
 .rename_axis(index=["Name","Variable"])
 .reset_index()
)

Unnamed: 0,Name,Variable,Value,Colour
0,zeus,Age,65.0,00000000
1,zeus,Height,1.9,00000000
2,poseidon,Age,26.0,FFFFFF00
3,poseidon,Height,2.2,00000000
4,athena,Age,30.0,00000000
5,athena,Height,1.4,FF00A933


## Sheets with comments

In [19]:
# open the workbook and select the sheet that carries cell comments
wb = load_workbook(filename="names.xlsx")
ws = wb["vikings"]

# data: cell values grouped by worksheet row number
# comments: comment text per row number (a later commented cell in the same
# row replaces an earlier one)
data, comments = defaultdict(list), {}
for column in ws.columns:
    for cell in column:
        data[cell.row].append(cell.value)
        if cell.comment:
            comments[cell.row] = cell.comment.text

data

defaultdict(list,
            {1: ['Name', 'Age', 'Height'],
             2: ['ragnar', 32, 1.6],
             3: ['bjorn', 49, 2.2],
             4: ['loki', 400, 1.8]})

In [20]:
# Comment text found on the sheet, keyed by worksheet row number.
comments

{2: 'Medieval warlord', 4: 'Not sure about the age'}

In [21]:
# Attach each row's comment (None when the row has no comment) as an extra
# trailing value. Fresh lists are built instead of appending to the lists held
# in `data`, so re-running this cell does not add the comment a second time and
# `new_data` does not share its lists with `data`.
new_data = {key: [*value, comments.get(key)] for key, value in data.items()}

new_data

{1: ['Name', 'Age', 'Height', None],
 2: ['ragnar', 32, 1.6, 'Medieval warlord'],
 3: ['bjorn', 49, 2.2, None],
 4: ['loki', 400, 1.8, 'Not sure about the age']}

In [22]:
# Build the frame from the row lists. The header row has no entry for the
# comment column, so fillna("comments", limit=1) supplies that label before
# row_to_names (pyjanitor) promotes row 0 to the column names and removes it.
# NOTE(review): limit=1 appears to apply per column, so a missing value in any
# other column would also be replaced by "comments" - confirm for other data.
pd.DataFrame(new_data.values()).fillna("comments", limit=1).row_to_names(0,True)

Unnamed: 0,Name,Age,Height,comments
1,ragnar,32,1.6,Medieval warlord
2,bjorn,49,2.2,
3,loki,400,1.8,Not sure about the age


In [23]:
#complete code :

wb = load_workbook(filename="names.xlsx")
ws = wb["vikings"]

# cell values grouped by worksheet row number, plus one comment text per row
data, comments = defaultdict(list), {}
for column in ws.columns:
    for cell in column:
        data[cell.row].append(cell.value)
        if cell.comment:
            comments[cell.row] = cell.comment.text

# add the row's comment (None when absent) as a trailing value on each row
new_data = {}
for row_number, row_values in data.items():
    row_values.append(comments.get(row_number))
    new_data[row_number] = row_values

# the header row's empty comment slot is filled with the label "comments"
# before the first row is promoted to column names (pyjanitor)
(
    pd.DataFrame(new_data.values())
    .fillna("comments", limit=1)
    .row_to_names(0, True)
)

Unnamed: 0,Name,Age,Height,comments
1,ragnar,32,1.6,Medieval warlord
2,bjorn,49,2.2,
3,loki,400,1.8,Not sure about the age


## Small Multiples

In [24]:
# open the workbook and select the report sheet
wb = load_workbook(filename="enron.xlsx")
ws = wb["Report"]

# Walk every cell once: keep a (value, row, column) triple for each cell and
# note the row / column numbers of every cell whose value is "BID" or "OFFER".
rows, columns, data = set(), set(), []
for sheet_row in ws.rows:
    for cell in sheet_row:
        data.append((cell.value, cell.row, cell.column))
        if cell.value in ("BID", "OFFER"):
            rows.add(cell.row)
            columns.add(cell.column)

In [25]:
# Row numbers in which a "BID"/"OFFER" label was found.
rows

{16, 27, 38, 49}

In [26]:
# Expand each label row into the row span of its block: from two rows above
# the label row up to (and including) eight rows below it.
# Note: `rows` changes from a set of ints to a list of ranges here, so this
# cell cannot simply be run a second time without re-running the scan above.
rows = [range(label_row - 2, label_row + 9) for label_row in sorted(rows)]
rows

[range(14, 25), range(25, 36), range(36, 47), range(47, 58)]

In [27]:
# Column numbers in which a "BID"/"OFFER" label was found.
columns

{7, 8, 9, 10, 12, 13, 14, 15, 17, 18, 19, 20}

In [28]:
# Order the label columns and cut them into consecutive groups of four
# (one group per block of the report).
columns = sorted(columns)
columns = [columns[start:start + 4] for start in range(0, len(columns), 4)]

columns

[[7, 8, 9, 10], [12, 13, 14, 15], [17, 18, 19, 20]]

In [29]:
from itertools import product

# Group the cell values by the block they fall into; a block is identified by
# its (row range, column group) pair.
data_boundary = defaultdict(list)

for (value, row, column), row_range, column_list in product(data, rows, columns):
    if row not in row_range or column not in column_list:
        continue
    data_boundary[(row_range, tuple(column_list))].append(value)

In [30]:
# Reshape each block's flat list of values into rows of four and stack the
# blocks on top of each other.
numpy_data = np.vstack(
    [np.reshape(block_values, (-1, 4)) for block_values in data_boundary.values()]
)

# pyjanitor's remove_empty is applied before prefixing the column labels with "A_"
df = (
    pd.DataFrame(numpy_data)
    .remove_empty()
    .add_prefix("A_")
)

In [31]:
# Preview the stacked blocks: title row, two header rows, then the price rows.
df.head(10)

Unnamed: 0,A_0,A_1,A_2,A_3
0,IF NWPL Rocky Mountains,,,
1,Fixed Price,,Basis,
2,BID,OFFER,BID,OFFER
3,1.89,1.91,,
4,2.06,2.08,,
5,2.395,2.415,-0.565,-0.545
6,2.594,2.614,-0.49375,-0.47375
7,2.58129,2.60129,-0.585,-0.565
8,3.356,3.376,-0.295,-0.275
9,2.63408,2.65408,-0.530417,-0.510417


In [32]:
# Tidy the stacked blocks: carry each block's title down as a company_name
# column, give the four value columns descriptive names and drop the
# title / header rows.
(df
 # a row with exactly three missing values is treated as a title row; its
 # first cell becomes the company name, all other rows get None for now
 .assign(company_name = np.where(df.isna().sum(1)==3, 
                                 df.iloc[:,0], 
                                 None))
 # pyjanitor
 # fill the company name downwards onto the rows that follow each title row
 .fill_direction({"company_name":"down"})
 # positional rename: the four A_* columns followed by company_name
 .set_axis(["Fixed_Price_Bid", "Fixed_Price_Offer", 
            "Basis_Bid", "Basis_Offer", "company_name"], 
           axis='columns')
  # drop the title rows and the 'Fixed Price' / 'BID' header rows
  .query("Fixed_Price_Bid != company_name and Fixed_Price_Bid != ['Fixed Price', 'BID']")
 # pyjanitor
 # move company_name to the front
 .reorder_columns(["company_name"])
 .reset_index(drop=True)
)

Unnamed: 0,company_name,Fixed_Price_Bid,Fixed_Price_Offer,Basis_Bid,Basis_Offer
0,IF NWPL Rocky Mountains,1.89,1.91,,
1,IF NWPL Rocky Mountains,2.06,2.08,,
2,IF NWPL Rocky Mountains,2.395,2.415,-0.565,-0.545
3,IF NWPL Rocky Mountains,2.594,2.614,-0.49375,-0.47375
4,IF NWPL Rocky Mountains,2.58129,2.60129,-0.585,-0.565
...,...,...,...,...,...
65,PG&E City Gate,2.88,2.9,-0.08,-0.06
66,PG&E City Gate,3.01525,3.03525,-0.0725,-0.0525
67,PG&E City Gate,3.26129,3.28129,0.095,0.115
68,PG&E City Gate,3.959,3.979,0.308,0.328


In [33]:
# complete code : 
from itertools import product

wb = load_workbook(filename="enron.xlsx")
ws = wb["Report"]

# Walk every cell once: keep a (value, row, column) triple for each cell and
# note the row / column numbers of every "BID" / "OFFER" label.
rows, columns, data = set(), set(), []
for sheet_row in ws.rows:
    for cell in sheet_row:
        data.append((cell.value, cell.row, cell.column))
        if cell.value in ("BID", "OFFER"):
            rows.add(cell.row)
            columns.add(cell.column)

# row span of each block: two rows above the label row to eight rows below it
rows = [range(label_row - 2, label_row + 9) for label_row in sorted(rows)]

# label columns, ordered and cut into consecutive groups of four
columns = sorted(columns)
columns = [columns[start:start + 4] for start in range(0, len(columns), 4)]

# group the cell values by the (row range, column group) block they fall into
data_boundary = defaultdict(list)
for (value, row, column), row_range, column_list in product(data, rows, columns):
    if row not in row_range or column not in column_list:
        continue
    data_boundary[(row_range, tuple(column_list))].append(value)

# each block becomes rows of four values; blocks are stacked vertically
numpy_data = np.vstack(
    [np.reshape(block_values, (-1, 4)) for block_values in data_boundary.values()]
)

df = (
    pd.DataFrame(numpy_data)
    .remove_empty()                                   # pyjanitor
    .add_prefix("A_")
    # rows with exactly three missing values are treated as title rows;
    # their first cell becomes the company name
    .assign(company_name=lambda frame: np.where(frame.isna().sum(1) == 3,
                                                frame.iloc[:, 0],
                                                None))
    .fill_direction({"company_name": "down"})         # pyjanitor
    .set_axis(["Fixed_Price_Bid", "Fixed_Price_Offer",
               "Basis_Bid", "Basis_Offer", "company_name"],
              axis='columns')
    # drop the title rows and the 'Fixed Price' / 'BID' header rows
    .query("Fixed_Price_Bid != company_name and Fixed_Price_Bid != ['Fixed Price', 'BID']")
    .reorder_columns(["company_name"])                # pyjanitor
    .reset_index(drop=True)
)

In [34]:
# Preview the tidy result; 21 rows is enough to show more than one company.
df.head(21)

Unnamed: 0,company_name,Fixed_Price_Bid,Fixed_Price_Offer,Basis_Bid,Basis_Offer
0,IF NWPL Rocky Mountains,1.89,1.91,,
1,IF NWPL Rocky Mountains,2.06,2.08,,
2,IF NWPL Rocky Mountains,2.395,2.415,-0.565,-0.545
3,IF NWPL Rocky Mountains,2.594,2.614,-0.49375,-0.47375
4,IF NWPL Rocky Mountains,2.58129,2.60129,-0.585,-0.565
5,IF NWPL Rocky Mountains,3.356,3.376,-0.295,-0.275
6,IF NWPL Rocky Mountains,2.63408,2.65408,-0.530417,-0.510417
7,IF CIG Rocky Mountains,1.94,1.96,,
8,IF CIG Rocky Mountains,1.96,1.98,,
9,IF CIG Rocky Mountains,2.345,2.365,-0.615,-0.595
