In [None]:
import pandas as pd
# Raise the display limits so up to 500 columns / 500 rows are rendered.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

## Table description generation

Enter the following info: 
 - Table name
 - Location
 - Separator
 - Encoding (optional)
 - Decimal mark (optional)

In [None]:
# Input settings -- edit these for each table to be described.
location = "../../data/raw"  # directory that holds the raw file
table = "PAGOS.tsv"          # file name inside `location`
sep = "\t"                   # field separator
encoding = "latin1"          # file encoding (optional)
decimal = ","                # decimal mark (optional)

### Take a first look at the dataset to identify the most interesting columns

**Run this if it's a big file**

In [None]:
# Load only the first 1,000,000 rows as a preview of a big file.
# `nrows` makes the parser stop after that many rows, so `df` is always bound
# and no half-consumed chunk reader is left open (which is what happened when
# breaking out of a `chunksize` loop after the first chunk).
df = pd.read_csv(f"{location}/{table}",
                 sep=sep,
                 encoding=encoding,
                 decimal=decimal,
                 nrows=1000000)

**Run this if it's a relatively small file**

In [None]:
# Load the whole file in one go (suitable for files that fit in memory).
# Uses the `sep` variable from the configuration cell; the previous
# `separator` name was never defined and raised a NameError.
df = pd.read_csv(f"{location}/{table}",
                 sep=sep,
                 encoding=encoding,
                 decimal=decimal)

In [5]:
# Preview the first 15 rows of the loaded frame.
df.head(15)

Unnamed: 0,COD_CIA,COD_SECC,FECHA_EQUIPO,NUM_POL1,NUM_SECU_POL,NUM_END,FECHA_VTO,FEC_PAGO,MONTO_PAGO,COD_COBRO,COD_SITUACION
0,2,90,07/12/2017,5010004804401,69739992,,2017-12,2017-12,18.18,CC,CT
1,2,11,01/12/2017,5000612586201,349709968,,2017-12,2017-12,6.1,TA,CT
2,2,11,01/12/2017,5000612717001,361729968,,2017-12,2017-12,80.0,TA,CT
3,2,11,01/12/2017,5000612721401,362109968,,2018-01,2017-12,105.0,TA,CT
4,2,11,01/12/2017,5000612729001,362729968,,2017-12,2017-12,80.0,TA,CT
5,2,90,31/01/2018,5040006209501,107929992,,2018-02,2018-02,27.27,CC,CT
6,2,90,01/12/2017,5230004535501,63779992,,2017-12,2017-12,9.09,CC,CT
7,2,22,12/01/2018,5000199230001,7809997,,2018-01,,-24.0,CC,EP
8,2,11,01/12/2017,5000612947501,377919968,,2017-12,2017-12,80.0,TA,CT
9,2,11,01/12/2017,5000613359101,398229968,,2017-12,2017-12,80.0,TA,CT


In [6]:
# Show the dtype pandas inferred for each column.
df.dtypes

COD_CIA            int64
COD_SECC           int64
FECHA_EQUIPO      object
NUM_POL1           int64
NUM_SECU_POL       int64
NUM_END          float64
FECHA_VTO         object
FEC_PAGO          object
MONTO_PAGO       float64
COD_COBRO         object
COD_SITUACION     object
dtype: object

In [7]:
# List the column names so they can be copied into `to_use` below.
df.columns

Index(['COD_CIA', 'COD_SECC', 'FECHA_EQUIPO', 'NUM_POL1', 'NUM_SECU_POL',
       'NUM_END', 'FECHA_VTO', 'FEC_PAGO', 'MONTO_PAGO', 'COD_COBRO',
       'COD_SITUACION'],
      dtype='object')

*Based on the last output, fill in this list with the most relevant columns*

In [8]:
# Columns considered relevant; only these can be flagged as "Used" in the report.
to_use = [
    "COD_CIA",
    "COD_SECC",
    "FECHA_EQUIPO",
    "NUM_POL1",
    "NUM_SECU_POL",
    "NUM_END",
    "FECHA_VTO",
    "FEC_PAGO",
    "MONTO_PAGO",
    "COD_COBRO",
    "COD_SITUACION",
]

### Now write the file

**If it is a big file, read it in full with this cell**

In [9]:
# Read the complete file in pieces of 1,000,000 rows and stitch the pieces
# back together into a single DataFrame.
source_path = f"{location}/{table}"
reader_options = {
    "sep": sep,
    "encoding": encoding,
    "decimal": decimal,
    "chunksize": 1000000,
}
chunks = pd.read_csv(source_path, **reader_options)
df = pd.concat(chunks)

In [10]:
# Write one summary row per column of `df` to a ';'-separated description file.
# The `with` block guarantees the file is flushed and closed even if a column
# raises midway; the previous open()/close() pair leaked the handle on error.
with open(f'../../docs/{table} feature description.csv', 'w') as f:
    f.write('Column;Used;Null Rate; Type; Unique values; Values\n')
    for column in df.columns:
        series = df[column]  # look the column up once instead of on every use

        # Share of missing values, expressed as a percentage (0-100).
        null_rate = round(series.isna().mean() * 100, 2)

        # Number of distinct values (nunique() skips NaN by default).
        unique_vals = series.nunique()

        # A column is marked as used when it was hand-picked in `to_use`,
        # is almost never null and is not constant.
        # NOTE(review): null_rate is a percentage, so `.5` means 0.5 %,
        # not 50 % -- confirm this is the intended threshold.
        if (column in to_use) and null_rate < .5 and unique_vals > 1:
            used = 'X'
        else:
            used = ''

        dtype = series.dtype

        if dtype == 'object':
            # Text-like column: list the ten most frequent values (NaN included).
            values = f"Top 10:\n{series.value_counts(dropna=False).head(10).to_string()}"
        else:
            # Numeric column: report the observed [min;max] range.
            values = f'[{series.min()};{series.max()}]'

        # The Values field is wrapped in double quotes because it contains
        # ';' and newlines; double any embedded quote so the field stays
        # a single well-formed CSV cell.
        values = values.replace('"', '""')

        f.write(f'{column};{used};{null_rate};{dtype};{unique_vals};"{values}"\n')
