In [1]:
import csv

import pandas as pd
# Raise pandas' display limits so wide or long frames are shown in full.
for option_name in ("display.max_columns", "display.max_rows"):
    pd.set_option(option_name, 500)

## Table description generation

Set the following parameters in the next cell:
 - Table name
 - Location
 - Separator
 - Encoding (optional)
 - Decimal mark (optional)

In [2]:
# Input file parameters: edit these for each table to be described.
table = "TB_PAGOS.tsv"       # file name, looked up inside `location`
location = "../../data/raw"  # directory that holds the file
sep = "\t"                   # field separator handed to read_csv
encoding = "latin1"          # text encoding handed to read_csv
decimal = ","                # decimal mark handed to read_csv

### Take a first look at the dataset to identify the most relevant columns

**Run this if it's a big file**

In [4]:
# Preview only: load at most the first 1,000,000 rows instead of the whole file.
# `nrows` yields the same leading rows as taking the first 1,000,000-row chunk,
# but read_csv closes the file on its own; breaking out of a chunked reader
# after the first chunk left the underlying file handle open.
df = pd.read_csv(f"{location}/{table}",
                 sep=sep,
                 encoding=encoding,
                 decimal=decimal,
                 nrows=1000000)

  interactivity=interactivity, compiler=compiler, result=result)


**Run this if it's a relatively small file**

In [3]:
# Single-pass load: the whole table is parsed into memory at once.
df = pd.read_csv(
    f"{location}/{table}", sep=sep, encoding=encoding, decimal=decimal
)

In [4]:
# Peek at the first rows to get a feel for the columns and their values.
df.head(n=15)

Unnamed: 0,COD_CIA,COD_SECC,FECHA_EQUIPO,NUM_POL1,NUM_SECU_POL,NUM_END,FECHA_VTO,FEC_PAGO,MONTO_PAGO,COD_COBRO,COD_SITUACION
0,2,90,19/03/2019,5010000625601,9609992,,2019-03,2019-03,1026.48,CC,CT
1,2,90,11/03/2019,5000000104101,1799992,,2018-06,2019-04,33.84,CC,CT
2,2,90,13/03/2019,5170826702201,11116689999,,2019-03,2019-04,22.56,CC,CT
3,2,90,08/03/2019,5130005681801,91449992,,2019-03,2019-03,33.84,CC,CT
4,2,11,01/03/2019,5000612704001,360559968,,2019-03,2019-03,130.0,TA,CT
5,2,11,01/03/2019,5000612711801,361269968,,2019-03,2019-03,100.0,TA,CT
6,2,11,01/03/2019,5000612713501,361439968,,2019-03,2019-03,100.0,TA,CT
7,2,11,01/03/2019,5000612716701,361709968,,2019-04,2019-03,100.0,TA,CT
8,2,11,01/03/2019,5000612724501,362369968,,2019-04,2019-03,23.7,TA,CT
9,2,90,09/03/2019,5280004902301,71909992,,2019-03,2019-03,507.6,CC,CT


In [5]:
# Show the dtype pandas assigned to each column.
df.dtypes

COD_CIA            int64
COD_SECC           int64
FECHA_EQUIPO      object
NUM_POL1           int64
NUM_SECU_POL       int64
NUM_END          float64
FECHA_VTO         object
FEC_PAGO          object
MONTO_PAGO       float64
COD_COBRO         object
COD_SITUACION     object
dtype: object

In [6]:
# List the column labels; the `to_use` list further down is filled from these.
df.columns

Index(['COD_CIA', 'COD_SECC', 'FECHA_EQUIPO', 'NUM_POL1', 'NUM_SECU_POL',
       'NUM_END', 'FECHA_VTO', 'FEC_PAGO', 'MONTO_PAGO', 'COD_COBRO',
       'COD_SITUACION'],
      dtype='object')

*Based on the output above, fill in this list with the most relevant columns*

In [8]:
# Columns considered relevant; only these can be flagged as used in the report.
to_use = [
    "COD_CIA",
    "COD_SECC",
    "FECHA_EQUIPO",
    "NUM_POL1",
    "NUM_SECU_POL",
    "NUM_END",
    "FECHA_VTO",
    "FEC_PAGO",
    "MONTO_PAGO",
    "COD_COBRO",
    "COD_SITUACION",
]

### Now write the file

**If it is a big file, read it completely with this cell**

In [5]:
# Stream the file in 1,000,000-row chunks and stitch them into a single frame.
chunks = pd.read_csv(
    f"{location}/{table}",
    sep=sep,
    encoding=encoding,
    decimal=decimal,
    chunksize=1_000_000,
)
df = pd.concat(chunks)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [15]:
# Plain-text rendering of the five most frequent FECHA_EQUIPO values.
(
    df["FECHA_EQUIPO"]
    .value_counts()
    .head()
    .to_string()
)

'06/04/2019    283655\n06/03/2019    239644\n06/05/2019    234558\n05/03/2019    233488\n01/04/2019    197252'

In [21]:
# Write a per-column summary of `df` as a semicolon-separated report: whether
# the column is flagged as used, its null rate (in %), its dtype, its number of
# distinct values, and either its most frequent values or its [min;max] range.
#
# The file is opened in a `with` block so it is closed even if a column fails
# midway. Rows go through csv.writer so that column names or values containing
# ';', '"' or line breaks are quoted/escaped instead of corrupting the file.
with open(f'../../docs/{table} feature description.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter=';', lineterminator='\n')
    # Header labels are kept exactly as before (including their leading spaces).
    writer.writerow(['Column', 'Used', 'Null Rate', ' Type', ' Unique values', ' Values'])

    for column in df.columns:
        series = df[column]

        # Share of missing values, expressed as a percentage with 2 decimals.
        null_rate = round(series.isna().mean() * 100, 2)
        unique_vals = series.nunique()

        # NOTE(review): null_rate is a percentage (0-100), so `< .5` only keeps
        # columns with fewer than 0.5% nulls. Confirm 50% was not intended.
        if (column in to_use) and null_rate < .5 and unique_vals > 1:
            used = 'X'
        else:
            used = ''

        dtype = series.dtype

        if dtype == 'object':
            # Object columns: list the ten most frequent values, counting NaN too.
            values = f"Top 10:\n{series.value_counts(dropna=False).head(10).to_string()}"
        else:
            # Every other dtype: report the observed range.
            values = f'[{series.min()};{series.max()}]'

        writer.writerow([column, used, null_rate, dtype, unique_vals, values])