# La calidad de los datos suele no ser la mejor.

### Un ejemplo tomando datos de la U.S. Energy Information Administration

Fuente de los datos: [eia.gov](http://www.eia.gov/coal/data.cfm)

¿Qué hacer?

1. **Revisar los datos** Siempre asuman que los datos tienen errores.
2. **Limpiar y estandarizar** Es necesario procesar los datos para que sean homogéneos y confiables.

In [None]:
%matplotlib inline

In [None]:
import glob

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import IFrame
from xml.sax import ContentHandler, parse

In [None]:
# Embed the EIA coal data page (the data source) in the notebook output.
IFrame("http://www.eia.gov/coal/data.cfm", width=700, height=350)

In [None]:
# First attempt: read the 2013 workbook with pandas' default options and
# preview the first rows to see how the sheet is parsed.
df_test = pd.read_excel('../sources/coalpublic2013.xls')
df_test.head()

In [None]:
# Second attempt: skip the first 3 rows of the sheet before reading
# (presumably title/notes rows above the real header — confirm in the file).
df_test = pd.read_excel("../sources/coalpublic2013.xls", skiprows=3)
df_test.head()

In [None]:
# Try the same read (skipping 3 rows) on the 2002 workbook.
df_test = pd.read_excel("../sources/coalpublic2002.xls", skiprows=3)
df_test.head()

In [None]:
# Reference https://goo.gl/KaOBG3
class ExcelHandler(ContentHandler):
    """SAX handler that collects the cell text of an XML spreadsheet.

    Text reported between the start and end of a ``Cell`` element is joined
    into one string. Cells are grouped per ``Row`` element and rows per
    ``Table`` element. After parsing, ``tables`` holds one entry per
    ``Table``: a list of rows, each row being a list of cell strings.
    """

    def __init__(self):
        # Initialise the base handler as well, so the state it defines
        # (e.g. the document locator) exists before any parser is attached.
        super().__init__()
        self.chars = []   # text fragments of the cell currently being read
        self.cells = []   # finished cell strings of the current row
        self.rows = []    # finished rows of the current table
        self.tables = []  # every finished table, in document order

    def characters(self, content):
        """Buffer a fragment of character data reported by the parser."""
        self.chars.append(content)

    def startElement(self, name, atts):
        """Reset the buffer that belongs to the element being opened."""
        if name == "Cell":
            self.chars = []
        elif name == "Row":
            self.cells = []
        elif name == "Table":
            self.rows = []

    def endElement(self, name):
        """Store the finished cell, row or table in its parent container."""
        if name == "Cell":
            # The parser may deliver one cell's text in several fragments.
            self.cells.append(''.join(self.chars))
        elif name == "Row":
            self.rows.append(self.cells)
        elif name == "Table":
            self.tables.append(self.rows)


def create_file(org_file_addr, df):
    """Write ``df`` to ``sources/coal_prod_<year>.xlsx``.

    ``<year>`` is the four characters of ``org_file_addr`` that sit just
    before its last four characters (e.g. ``2013`` in
    ``coalpublic2013.xls``).
    """
    year = org_file_addr[-8:-4]
    target = ''.join(('sources/coal_prod_', year, '.xlsx'))
    df.to_excel(target)

    
def transform_xlrdfiles(file_name):
    """Parse ``file_name`` as XML and save its first table as ``.xlsx``.

    The file is read with ``ExcelHandler``. In the first table, the row at
    index 3 supplies the column names and every row from index 4 onwards is
    data. The resulting frame is written through ``create_file``.
    """
    handler = ExcelHandler()
    parse(file_name, handler)

    first_table = handler.tables[0]
    records = first_table[4:]
    header = first_table[3]

    frame = pd.DataFrame(records, columns=header)
    create_file(file_name, frame)

In [None]:
# Convert every source workbook into a 'coal_prod_<year>.xlsx' file.
lista = glob.glob('../sources/*.xls')
for file in lista:
    print('Procesando:', file)
    try:
        # Only the read is guarded: a failure here means pandas could not
        # open the file as a regular workbook.
        df = pd.read_excel(file, skiprows=3)
    except Exception:
        # Fall back to the XML parser. Catching Exception rather than using
        # a bare except lets KeyboardInterrupt / SystemExit propagate.
        transform_xlrdfiles(file)
    else:
        # Kept outside the try so a write error is reported as such and is
        # not mistaken for an unreadable input file.
        create_file(file, df)
    print(' ...Ok')

print('\n¡Proceso Terminado!')

In [None]:
# Load the converted workbook of every year from 2002 to 2015. A single
# generator replaces fourteen copy-pasted calls; the df2 ... df15 names are
# kept because later cells refer to them.
(df2, df3, df4, df5, df6, df7, df8, df9,
 df10, df11, df12, df13, df14, df15) = (
    pd.read_excel("sources/coal_prod_{}.xlsx".format(year), index_col=False)
    for year in range(2002, 2016)
)

In [None]:
# Stack the fourteen yearly tables, in year order, into one DataFrame.
df = pd.concat([
    df2, df3, df4, df5, df6, df7, df8,
    df9, df10, df11, df12, df13, df14, df15,
])

In [None]:
# Preview the first rows of the concatenated table.
df.head()

In [None]:
# Remove the 'Coal Supply Region' and 'Year' columns, modifying df in place.
df.drop(['Coal Supply Region', 'Year'], axis=1, inplace=True)

In [None]:
# Preview the table after dropping the two columns.
df.head()

In [None]:
# Rename the spreadsheet headers to snake_case column names (in place).
# NOTE(review): every header is mapped to the name of a *different* field
# (e.g. 'MSHA ID' -> 'year', 'Mine Name' -> 'MSHA_ID'). This looks like a
# deliberate correction for headers that are shifted relative to the data
# they sit above — TODO confirm against the df.head() output before relying
# on these names.
new_columns = {
                'Average Employees': 'production_short_tons',
                'Company Type': 'mine_type',
                'Labor Hours': 'average_employees',
                'MSHA ID': 'year',
                'Mine Basin': 'mine_county',
                'Mine County': 'mine_state',
                'Mine Name': 'MSHA_ID',
                'Mine State': 'mine_name',
                'Mine Status': 'mine_basin',
                'Mine Type': 'mine_status',
                'Operating Company': 'operation_type',
                'Operating Company Address': 'operating_company',
                'Operation Type': 'company_type',
                'Production (short tons)': 'union_code',
                'Union Code': 'operating_company_address',
              }
df.rename(inplace=True, columns=new_columns)

In [None]:
# Preview the table with the renamed columns.
df.head()

In [None]:
# Use the 'MSHA_ID' column as the row index, modifying df in place.
df.set_index('MSHA_ID', inplace=True)

In [None]:
# Preview the table now indexed by MSHA_ID.
df.head()

In [None]:
# Looking at the data, there seems to be a typo: list the distinct values
# of 'company_type' to spot it.
df['company_type'].unique()

In [None]:
# Fix the misspelled value: rows whose 'company_type' reads
# 'Indepedent Producer Operator' are overwritten with the correct spelling.
df.loc[df['company_type'] == 'Indepedent Producer Operator', 'company_type'] = 'Independent Producer Operator'
df.head()

# Y para terminar:

## Un producto terminado

Un conjunto de datos limpio (mayormente), ordenado y listo para ser analizado.

In [None]:
# Save the cleaned table as an Excel workbook.
df.to_excel("sources/coal_prod_cleaned.xlsx")