 <h1><center> Extracting Data From Excel with Python</center></h1>
  
 <h4><center>Speaker : Samuel Oranyeli</center></h4>
 
  <h4><center>Blog : samukweku.github.io</center></h4>
  
  <h4><center>Github : github.com/samukweku</center></h4>
  
   <h4><center>Twitter : Twitter.com/samukweku</center></h4>









## Issues with Excel:

- Can be frustrating sometimes, especially when data is mixed with formulas.

- Solutions to a problem cannot be shared easily, as they live inside cells.

- Version control is not possible (not that I know of).

- Limited automation.

- Tables nested in sheets and located at different points.

- Data in pivot format, with multiple headers.

## Case : Convert Pivot Table to Long Format
#### Source : [Tich Mangono](https://github.com/tichmangono/python_and_excel)
![basetable.png](attachment:basetable.png)

## Introspection
![base_table_annotated.png](attachment:base_table_annotated.png)

In [1]:
#import libraries
import pandas as pd
import numpy as np

# Load the first sheet of the workbook.

file = 'reshaping_data.xlsx'

# skiprows=7 : ignore the seven rows above the table.
# header=None: keep both header rows as ordinary data rows for now.
# Columns that are entirely empty are dropped, and the remaining
# null values are replaced with an empty string.
df = (
    pd.read_excel(file, skiprows=7, header=None)
    .dropna(how='all', axis=1)
    .replace(np.nan, '')
)

df.head(10)

Unnamed: 0,1,2,3,4,7,8,9,10,12,13,...,19,20,22,23,24,25,27,28,29,30
0,district,province,partner,funding_source,2017,2017,2017,2017,2018,2018,...,2019,2019,2020,2020,2020,2020,2021,2021,2021,2021
1,,,,,10-14yrs,15-29yrs,30+yrs,Total,10-14yrs,15-29yrs,...,30+yrs,Total,10-14yrs,15-29yrs,30+yrs,Total,10-14yrs,15-29yrs,30+yrs,Total
2,District 1,Region 1,partner 1,Souce 2,1296,383,1571,3250,189,854,...,491,2256,1906,1925,931,5465,61,353,1091,2409
3,District 2,Region 3,partner 6,Souce 5,722,232,1848,2802,972,69,...,245,2957,810,664,452,3665,989,374,1790,4320
4,District 3,Region 1,partner 1,Souce 2,545,585,1736,2866,1048,1261,...,503,2574,1890,736,1414,5311,1215,112,1475,2824
5,District 4,Region 3,partner 6,Souce 5,631,1413,31,2075,950,409,...,1701,6296,1646,960,209,2956,1392,936,701,4903
6,District 5,Region 2,partner 2,Souce 6,1468,1490,1971,4929,1683,907,...,182,3361,251,1032,1377,3142,1241,1653,1371,4345
7,District 6,Region 6,partner 4,Souce 3,977,778,974,2729,1788,798,...,797,3056,647,780,918,2722,1227,389,914,4105
8,District 7,Region 4,partner 3,Souce 4,1629,333,1237,3199,1905,1193,...,390,3410,1847,1688,831,6265,1014,1986,1556,6028
9,District 8,Region 5,partner 6,Souce 1,1502,982,251,2735,367,1406,...,1140,3484,605,1916,632,4320,701,859,346,2759


In [2]:
# Build the column labels from the two header rows:
# every cell gets a trailing comma, the two rows are concatenated
# column by column, and the trailing comma(s) are stripped again.
# A column whose second header row is empty keeps only its first-row label.

df.columns = (
    df.iloc[:2]
    .astype(str)
    .add(',')
    .sum(axis=0)
    .str.rstrip(',')
)

df.head(2)

Unnamed: 0,district,province,partner,funding_source,"2017,10-14yrs","2017,15-29yrs","2017,30+yrs","2017,Total","2018,10-14yrs","2018,15-29yrs",...,"2019,30+yrs","2019,Total","2020,10-14yrs","2020,15-29yrs","2020,30+yrs","2020,Total","2021,10-14yrs","2021,15-29yrs","2021,30+yrs","2021,Total"
0,district,province,partner,funding_source,2017,2017,2017,2017,2018,2018,...,2019,2019,2020,2020,2020,2020,2021,2021,2021,2021
1,,,,,10-14yrs,15-29yrs,30+yrs,Total,10-14yrs,15-29yrs,...,30+yrs,Total,10-14yrs,15-29yrs,30+yrs,Total,10-14yrs,15-29yrs,30+yrs,Total


In [3]:
(df
 .melt(id_vars=['district', 'province', 'partner', 'funding_source'])
 # the two header rows are still part of the data: drop them
 .loc[lambda long: ~long['district'].isin(["", "district"])]
 # drop the rows that came from the 'Total' columns
 .loc[lambda long: ~long['variable'].str.contains('Total')]
 # 'variable' holds "<first header row>,<second header row>"
 .assign(Year=lambda long: long['variable'].str.split(',').str[0],
         Age_Range=lambda long: long['variable'].str.split(',').str[-1])
 .drop(columns='variable')
)

Unnamed: 0,district,province,partner,funding_source,value,Year,Age_Range
2,District 1,Region 1,partner 1,Souce 2,1296,2017,10-14yrs
3,District 2,Region 3,partner 6,Souce 5,722,2017,10-14yrs
4,District 3,Region 1,partner 1,Souce 2,545,2017,10-14yrs
5,District 4,Region 3,partner 6,Souce 5,631,2017,10-14yrs
6,District 5,Region 2,partner 2,Souce 6,1468,2017,10-14yrs
...,...,...,...,...,...,...,...
223,District 6,Region 6,partner 4,Souce 3,914,2021,30+yrs
224,District 7,Region 4,partner 3,Souce 4,1556,2021,30+yrs
225,District 8,Region 5,partner 6,Souce 1,346,2021,30+yrs
226,District 9,Region 2,partner 2,Souce 6,1802,2021,30+yrs


### Cleanup Complete : Final Phase is to create a function and apply it to all the sheets

In [4]:
def remove_rows(dataframe, column, check):
    '''
    Removes the rows whose value in `column` matches `check`
    and returns the remaining rows as a dataframe.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to filter. It is not modified.
    column : label
        Column whose values are tested.
    check : list, tuple, set, frozenset or str
        - list / tuple / set / frozenset : rows whose value is one of
          the entries are removed.
        - str : rows whose value contains the string are removed
          (the pattern is handed to `Series.str.contains`).

    Returns
    -------
    pandas.DataFrame
        The rows of `dataframe` that did not match.

    Raises
    ------
    TypeError
        If `check` is neither a string nor one of the supported
        collections.
    '''

    values = dataframe.loc[:, column]

    if isinstance(check, str):
        # na=False: a missing value cannot contain the pattern, so the row
        # is kept and the mask stays boolean (required by `~` below).
        condition = values.str.contains(check, na=False)

    elif isinstance(check, (list, tuple, set, frozenset)):
        condition = values.isin(check)

    else:
        raise TypeError(
            "check must be a str or a list/tuple/set of values, "
            "got {}".format(type(check).__name__)
        )

    return dataframe.loc[~condition]

In [5]:
def split_create_new_columns(dataframe,source_column,new_column_1,new_column_2):
    '''
    Splits the source_column on its first comma,
    assigns the two parts to two new columns,
    removes the source_column
    and returns a new dataframe.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding `source_column`. It is not modified.
    source_column : label
        Column of strings to split.
    new_column_1 : label
        Receives the text before the first comma.
    new_column_2 : label
        Receives the text after the first comma
        (missing when the value has no comma).

    Returns
    -------
    pandas.DataFrame
        A copy of `dataframe` with the two new columns added
        and `source_column` dropped.
    '''

    parts = (dataframe
             .loc[:, source_column]
             .str
             # n=1 guarantees at most two parts, whatever the value holds
             .split(',', n=1, expand=True)
             # guarantees exactly two columns, even if no value has a comma
             .reindex(columns=[0, 1])
            )

    # Work on a copy so the caller's frame is never written to.
    result = dataframe.copy()
    result[new_column_1] = parts[0]
    result[new_column_2] = parts[1]

    return result.drop(source_column, axis = 1)

In [6]:
def process_data(dataframe):
    '''
    Reshapes one raw sheet into long format
    and returns a dataframe.

    Steps:
    - drop the columns that are entirely empty and
      replace null values with an empty string;
    - build the column labels from the first two rows,
      joined with a comma;
    - melt, keeping district, province, partner and
      funding_source as identifiers;
    - remove the rows whose district is "" or "district";
    - remove the rows whose variable contains 'Total';
    - split variable into Target_Year and Target_Age;
    - rename value to Target_Quantity.
    '''

    id_columns = ['district', 'province', 'partner', 'funding_source']

    cleaned = dataframe.dropna(how='all', axis=1).replace(np.nan, '')

    # first two rows -> "<row 0>,<row 1>", trailing comma(s) stripped
    header_rows = cleaned.iloc[:2].astype(str)
    cleaned.columns = header_rows.add(',').sum(axis=0).str.rstrip(',')

    long_format = cleaned.melt(id_vars=id_columns)
    long_format = remove_rows(long_format, 'district', ["", "district"])
    long_format = remove_rows(long_format, 'variable', 'Total')
    long_format = split_create_new_columns(long_format, 'variable',
                                           'Target_Year', 'Target_Age')

    return long_format.rename({'value': 'Target_Quantity'}, axis=1)

### The 'hard' part is done; let's apply our process_data function to every sheet in the Excel file.

In [7]:
# sheet_name=None reads every sheet; the result is iterated below
# as (sheet name, dataframe) pairs.
all_sheets = pd.read_excel(file, sheet_name=None, header=None, skiprows=7)

# Reshape each sheet and tag its rows with the name of the sheet they came from.
processed = [
    process_data(sheet_frame).assign(main_organization=sheet_name)
    for sheet_name, sheet_frame in all_sheets.items()
]

results = pd.concat(processed, ignore_index=True)

results.head(20)

Unnamed: 0,district,province,partner,funding_source,Target_Quantity,Target_Year,Target_Age,main_organization
0,District 1,Region 1,partner 1,Souce 2,1296,2017,10-14yrs,ABC_inc
1,District 2,Region 3,partner 6,Souce 5,722,2017,10-14yrs,ABC_inc
2,District 3,Region 1,partner 1,Souce 2,545,2017,10-14yrs,ABC_inc
3,District 4,Region 3,partner 6,Souce 5,631,2017,10-14yrs,ABC_inc
4,District 5,Region 2,partner 2,Souce 6,1468,2017,10-14yrs,ABC_inc
5,District 6,Region 6,partner 4,Souce 3,977,2017,10-14yrs,ABC_inc
6,District 7,Region 4,partner 3,Souce 4,1629,2017,10-14yrs,ABC_inc
7,District 8,Region 5,partner 6,Souce 1,1502,2017,10-14yrs,ABC_inc
8,District 9,Region 2,partner 2,Souce 6,771,2017,10-14yrs,ABC_inc
9,District 10,Region 1,partner 1,Souce 2,1238,2017,10-14yrs,ABC_inc
