# Automating Excel files to databases

In [150]:
import pandas as pd
import io
import requests
import re
import numpy as np
# Workbook to load, fetched directly over HTTP by pandas.
url = "http://www.ico.org/historical/1990%20onwards/Excel/1a%20-%20Total%20production.xlsx"
# Zero-based sheet row that holds the real column headers; the rows above it are skipped.
HEADER_ROW = 3
c = pd.read_excel(url, header=HEADER_ROW)

In [151]:
# Clean headers: the first two columns get fixed names, and every header that
# starts with four digits is replaced by just those four digits. Headers that
# do not start with four digits contribute nothing to the list.
year_labels = []
for header in c.columns:
    year_labels.extend(re.findall(r'^\d{4}', header))
c.columns = ['Crop_year', 'beans'] + year_labels

In [152]:
# Eliminate empty (white-space) rows: keep a row only when its third column
# (position 2) holds a value.
third_col_present = c.iloc[:, 2].notna()
c = c.loc[third_col_present]

In [153]:
# Control total used to tie out once the cleaning is finished: the per-year sum
# of every row whose Crop_year label contains "total" (case-insensitive).
# na=False makes a missing label count as "not a total"; without it the mask
# would contain NaN and .loc refuses to index with such a mask.
is_total_row = c['Crop_year'].str.contains('total', flags=re.I, regex=True, na=False)
check_digit = c.loc[is_total_row, '1990':].sum()

In [154]:
# Eliminate the total ROWS (every row whose Crop_year label contains "total",
# case-insensitive) so they are not counted again alongside the detail rows.
# na=False keeps the mask strictly boolean, so both the ~ negation and the .loc
# lookup still work when a label is missing.
is_total_row = c['Crop_year'].str.contains('total', flags=re.I, regex=True, na=False)
c = c.loc[~is_total_row, :]

In [155]:
# Add an empty 'Harvest_Month' column right after 'Crop_year'.
# - Work on an independent copy: `c` is the result of boolean filtering, and
#   mutating such a frame is what triggers pandas' SettingWithCopyWarning.
# - Use object dtype: the column is filled with text labels afterwards, and
#   writing strings into a float64 column is a deprecated upcast in newer pandas.
# - Guard the insert: DataFrame.insert raises ValueError when the column already
#   exists, which would make this cell fail if it is run a second time.
c = c.copy()
if 'Harvest_Month' not in c.columns:
    c.insert(1, 'Harvest_Month', pd.Series(np.nan, index=c.index, dtype=object))

In [156]:
# Subheader rows are the ones with no value in 'beans'; their Crop_year text is
# the group label that applies to the rows beneath them.
is_subheader = c['beans'].isna()

# Move the subheader text into 'Harvest_Month' (other rows keep whatever they
# already hold). Casting to object first avoids writing strings into a float
# column, which newer pandas deprecates.
harvest = c['Harvest_Month'].astype(object).where(~is_subheader, c['Crop_year'])

# Forward fill the label down to the rows under each subheader.
# Series.ffill() replaces the deprecated fillna(method='ffill').
harvest = harvest.ffill()

# Remove the word 'group' by keeping only the first word (optional step).
# The .str accessor leaves missing labels as NaN instead of raising on them.
harvest = harvest.str.split(' ').str[0]

# assign() returns a new frame, so nothing is written into a filtered slice.
c = c.assign(Harvest_Month=harvest)

# Drop the subheader rows themselves (Crop_year contains "group",
# case-insensitive); na=False keeps the mask boolean if a label is missing.
c = c.loc[~c['Crop_year'].str.contains('group', flags=re.I, regex=True, na=False), :]

In [180]:
# Keep only the first word of each harvest label (e.g. 'April group' -> 'April').
# The vectorised .str accessor replaces the Python-level loop and passes missing
# labels through as NaN instead of failing on them. Labels that are already a
# single word are left unchanged, so re-running this cell is harmless.
c['Harvest_Month'] = c['Harvest_Month'].str.split(' ').str[0]

In [181]:
# Show the cleaned table as this cell's output for a visual check.
c

Unnamed: 0,Crop_year,Harvest_Month,beans,1990,1991,1992,1993,1994,1995,1996,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
2,Angola,April,(R/A),79.345,74.331,67.52,27.608,11.802,48.109,50.925,...,3.42,4.97,8.715,7.79,4.935,9.405,10.515,14.83,5.006,10.3874
3,Bolivia (Plurinational State of),April,(A),111.977,82.936,100.935,36.523,85.944,110.485,105.079,...,82.4751,69.7249,82.8354,54.7812,67.9122,46.3766,29.2191,20.9835,25.3112,22.5687
4,Brazil,April,(A/R),17862.6286,21808.4934,16752.3542,18760.9786,15958.047,13760.2022,17259.743,...,30254.812,34054.4102,32148.8289,29283.0012,32751.9664,37781.7669,37472.5876,33491.1784,30782.8635,37613.8836
5,Burundi,April,(A/R),412.393,762.491,671.646,352.87,580.127,464.07,185.636,...,172.937,350.7196,202.1328,405.9615,159.2177,245.55,274.1017,246.7933,195.1079,179.9206
6,Ecuador,April,(A/R),1627.778,1471.617,1287.687,1589.007,2155.766,1438.233,1512.914,...,1156.2849,1273.9798,1553.1144,1578.4024,1209.545,1089.0112,839.4926,889.8845,643.5744,446.0001
7,Indonesia,April,(R/A),6720.383,5584.196,5115.478,5302.288,4225.878,4349.762,6772.467,...,7989.6699,5647.5118,3644.9233,8970.0386,8700.9493,6679.2773,7985.4751,6891.0266,7761.3153,4717.5942
8,Madagascar,April,(R),726.447,679.213,719.651,521.359,474.539,631.976,667.374,...,44.9613,88.7233,134.8528,89.7094,174.4758,111.4544,55.2061,60.8227,43.7848,21.3764
9,Malawi,April,(A),103.628,122.286,129.161,75.111,87.112,81.235,57.65,...,15.922,15.9285,24.8264,21.8262,26.7944,23.7098,20.2015,18.516,13.4703,11.8493
10,Papua New Guinea,April,(A/R),1012.841,794.816,943.111,1028.481,1135.159,999.863,1082.348,...,1048.9942,880.0884,1417.6185,714.777,833.3204,795.6402,710.1818,1169.1449,731.9382,928.369
11,Paraguay,April,(A),168.627,84.653,49.291,55.046,37.689,14.843,15.563,...,0.0167,0.1965,0.0195,0.0065,0.0069,0.0725,0.0,0.0101,0.0024,0.0135


In [160]:
# Tie-out per year: column totals of the cleaned data (everything from the
# fourth column onward) minus the control totals captured in check_digit.
# A value of 0.0 means the year reconciles.
yearly_totals = c.iloc[:, 3:].sum()
yearly_totals - check_digit

1990       0.0
1991       0.0
1992       0.0
1993       0.0
1994       0.0
1995       0.0
1996       0.0
1997       0.0
1998    -163.0
1999    -151.0
2000    -197.0
2001    -163.0
2002   -1113.0
2003   -1386.0
2004   -1146.0
2005    -622.0
2006    -614.0
2007    -621.0
2008    -620.0
2009    -610.0
2010    -450.0
2011    -500.0
2012   -1250.0
2013       0.0
2014       0.0
2015       0.0
2016       0.0
2017       0.0
2018       0.0
dtype: float64

In [161]:
# Grand total obtained by manually summing the values in Excel; used to verify
# that the discrepancy above is not a result of the program.
EXCEL_MANUAL_TOTAL = 3588429.34
grand_total = c.iloc[:, 3:].sum().sum()
grand_total - EXCEL_MANUAL_TOTAL

0.0010000006295740604

# Make a function and automate the rest of the files

In [162]:
# Download locations of the five workbooks, one per statistic.
Total_production = 'http://www.ico.org/historical/1990%20onwards/Excel/1a%20-%20Total%20production.xlsx'
domestic_consumption = 'http://www.ico.org/historical/1990%20onwards/Excel/1b%20-%20Domestic%20consumption.xlsx'
Exportable_production = 'http://www.ico.org/historical/1990%20onwards/Excel/1c%20-%20Exportable%20production.xlsx'
Gross_opening_stocks = 'http://www.ico.org/historical/1990%20onwards/Excel/1d%20-%20Gross%20Opening%20stocks.xlsx'
Exports = 'http://www.ico.org/historical/1990%20onwards/Excel/1e%20-%20Exports%20-%20crop%20year.xlsx'

# Each table name paired with its URL. Both lists below are derived from this
# single table, so they always have the same length and order.
_sources = [
    ('Total_production', Total_production),
    ('domestic_consumption', domestic_consumption),
    ('Exportable_production', Exportable_production),
    ('Gross_opening_stocks', Gross_opening_stocks),
    ('Exports', Exports),
]
filelist = [link for _, link in _sources]
filenames = [label for label, _ in _sources]

In [182]:
# Folder the cleaned CSV files are written to.
# NOTE(review): this is an absolute, machine-specific path; point it at a
# folder that exists on the machine running the notebook.
path = 'C:/Users/erler/OneDrive/Documents/Random Data Sets/coffee/Finished/Python_method/'


def _tidy_ico_sheet(raw):
    """Clean one workbook sheet that was read with ``header=3``.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame as returned by ``pd.read_excel(..., header=3)``. It is not modified.

    Returns
    -------
    tidy : pandas.DataFrame
        Detail rows only, with columns 'Crop_year', 'Harvest_Month', 'beans'
        followed by one column per year. 'Harvest_Month' holds the full
        subheader text (e.g. 'April group'); the optional "first word only"
        step is not applied here.
    check_digit : pandas.Series
        Per-year sum of the rows labelled "total", for tying out.
    """
    tidy = raw.copy()

    # Clean headers: two fixed names, then the leading four digits of every
    # header that starts with four digits.
    year_labels = [match for header in tidy.columns for match in re.findall(r'^\d{4}', header)]
    tidy.columns = ['Crop_year', 'beans'] + year_labels

    # Eliminate empty rows: keep rows that have a value in the third column.
    tidy = tidy.loc[tidy.iloc[:, 2].notna()]

    # Capture the control totals, then drop the total rows. na=False keeps the
    # mask boolean even if a Crop_year label is missing.
    is_total = tidy['Crop_year'].str.contains('total', flags=re.I, regex=True, na=False)
    check_digit = tidy.loc[is_total, '1990':].sum()
    tidy = tidy.loc[~is_total].copy()  # copy: the frame is mutated by insert() below

    # Subheader rows have no 'beans' value. Take their label and forward fill it
    # onto the rows beneath. Building the column in one go avoids writing
    # strings into a float column, and Series.ffill() replaces the deprecated
    # fillna(method='ffill').
    is_subheader = tidy['beans'].isna()
    tidy.insert(1, 'Harvest_Month', tidy['Crop_year'].where(is_subheader).ffill())

    # Drop the subheader rows themselves.
    is_group = tidy['Crop_year'].str.contains('group', flags=re.I, regex=True, na=False)
    return tidy.loc[~is_group, :], check_digit


tables = {}
# Walk names and URLs together instead of indexing one list by the other's position.
for name, link in zip(filenames, filelist):
    # Read in data to pandas and eliminate rows before the true headers.
    c, check_digit = _tidy_ico_sheet(pd.read_excel(link, header=3))
    print(name+' variance:', c.iloc[:,3:].sum().sum()-check_digit.sum()) #check for variance
    c.to_csv(path+name+'.csv') #save to folder
    tables[name] = c #make dict of all tables

Total_production variance: -9606.0
domestic_consumption variance: -4809.0
Exportable_production variance: -4991.0
Gross_opening_stocks variance: 0.0
Exports variance: -113435.00639999984


In [166]:
tables['Total_production']

Unnamed: 0,Crop_year,Harvest_Month,beans,1990,1991,1992,1993,1994,1995,1996,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
2,Angola,April group,(R/A),50.345,79.331,77.52,32.608,76.802,62.109,70.925,...,13.42,34.97,28.715,32.79,34.935,39.405,40.515,44.83,35.006,40.3874
3,Bolivia (Plurinational State of),April group,(A),122.777,103.536,120.235,50.823,116.944,142.485,124.579,...,128.4751,117.2249,131.8354,105.2812,119.9122,99.8766,84.2191,77.9835,83.8112,82.5687
4,Brazil,April group,(A/R),27285.6286,27293.4934,34603.3542,28166.9786,28192.047,18060.2022,29196.743,...,43976.812,55428.4102,48591.8289,55418.0012,54688.9664,53304.7669,52870.5876,56788.1784,52739.8635,62924.8836
5,Burundi,April group,(A/R),487.393,667.199,620.238,393.354,664.143,433.98,400.969,...,111.613,352.9776,204.1328,405.9615,163.2177,247.55,274.1017,248.7933,202.1079,178.4206
6,Ecuador,April group,(A/R),1503.815,2123.824,1185.48,2069.007,2375.766,1888.233,1992.914,...,813.2849,853.9798,825.4144,828.1024,665.545,644.0112,644.4926,644.8845,623.5744,601.0001
7,Indonesia,April group,(R/A),7441.383,8493.196,5569.478,6743.288,5367.878,4573.429,8220.584,...,11379.6699,9128.5118,6888.9233,13070.0386,12900.9493,10946.2773,12585.4751,11541.0266,10852.3153,9417.5942
8,Madagascar,April group,(R),982.447,932.513,1121.684,441.859,641.372,785.009,849.008,...,456.9613,529.7233,584.8528,499.7094,584.4758,501.4544,425.2061,420.8227,408.7848,396.3764
9,Malawi,April group,(A),104.628,124.286,137.161,62.111,84.112,91.235,48.65,...,16.922,16.9285,25.8264,22.8262,27.7944,24.7098,21.2015,19.516,14.4703,12.8493
10,Papua New Guinea,April group,(A/R),962.841,746.816,900.111,1019.481,1138.659,1002.363,1089.348,...,1037.9942,866.5184,1413.8545,716.488,835.3594,797.5642,711.9818,1171.1449,733.9382,930.369
11,Paraguay,April group,(A),130.627,79.653,54.291,70.046,24.689,22.843,25.563,...,20.0167,20.1965,20.0195,20.0065,20.0069,20.0725,20.0,20.0101,20.0024,20.0135
