In [1]:
import pandas as pd
# Raise the display limits so wide or long frames are shown without truncation.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 500)

## Table description generation

Enter the following info: 
 - Table name
 - Location
 - Separator
 - Encoding (optional)
 - Decimal mark (optional)

In [2]:
# Input table description: edit these values for the file being documented.
table = "TB_INTERACCIONES.tsv"  # file name of the table
location = "../../data/raw"     # directory that contains the file
sep = "\t"                      # field separator
encoding = "latin1"             # text encoding of the file
decimal = ","                   # decimal mark used in numeric fields

### Take a first look at the dataset to identify the most interesting columns

**Run this if it's a big file**

In [3]:
# Preview only: load at most the first 1,000,000 rows so that a very large
# file does not have to be read completely. `nrows` reads the same rows as
# taking the first chunk of a chunked reader, without leaving a partially
# consumed reader (and its open file handle) behind.
df = pd.read_csv(f"{location}/{table}",
                 sep=sep,
                 encoding=encoding,
                 decimal=decimal,
                 nrows=1000000)

**Run this if it's a relatively small file**

In [None]:
# Read the whole file in one go (for files that fit comfortably in memory).
# The separator comes from `sep`, the name defined in the configuration cell.
df = pd.read_csv(f"{location}/{table}",
                 sep=sep,
                 encoding=encoding,
                 decimal=decimal)

In [4]:
# Preview the first 15 rows of the loaded data.
df.head(15)

Unnamed: 0,ID,CIF_ID,IN_OUT,CANAL_COMUNICACION,FECHA,OBSERVACIONES,GESTION,DESCRIPCION,AGRUPACION
0,212931236,22118197,O,MAIL,01/01/2018,,FPSE,Emailing Falta de Pago - Enviado,
1,212931237,21794535,O,MAIL,01/01/2018,,FPSE,Emailing Falta de Pago - Enviado,
2,212931238,12872524,O,MAIL,01/01/2018,,FPSE,Emailing Falta de Pago - Enviado,
3,212931239,1089433,O,MAIL,01/01/2018,,FPNE,Emailing Falta de Pago - No enviado,
4,212931240,11176325,O,MAIL,01/01/2018,,FPNE,Emailing Falta de Pago - No enviado,
5,212931241,22264538,O,MAIL,01/01/2018,,FPNE,Emailing Falta de Pago - No enviado,
6,212958562,21056268,O,TEL,01/01/2018,4560-0029756-07 AUTO 05-01-2018 Impresión/PDF...,H038,Impresión/PDF Automotor,CONSULTAS O GESTIONES SOBRE POLIZA
7,212931226,10037395,O,MAIL,01/01/2018,,FPNE,Emailing Falta de Pago - No enviado,
8,212931227,10801030,O,MAIL,01/01/2018,,FPSE,Emailing Falta de Pago - Enviado,
9,212931228,17500849,O,MAIL,01/01/2018,,FPSE,Emailing Falta de Pago - Enviado,


In [5]:
# Show the dtype pandas assigned to each column.
df.dtypes

ID                    object
CIF_ID                object
IN_OUT                object
CANAL_COMUNICACION    object
FECHA                 object
OBSERVACIONES         object
GESTION               object
DESCRIPCION           object
AGRUPACION            object
dtype: object

In [6]:
# List the column names; they are used to fill in `to_use` afterwards.
df.columns

Index(['ID', 'CIF_ID', 'IN_OUT', 'CANAL_COMUNICACION', 'FECHA',
       'OBSERVACIONES', 'GESTION', 'DESCRIPCION', 'AGRUPACION'],
      dtype='object')

*Based on the last output, fill in this list to mark the most relevant columns*

In [3]:
# Columns considered relevant; the description file marks them as "used"
# when they also pass the null-rate and unique-value checks.
to_use = [
    'ID',
    'CIF_ID',
    'IN_OUT',
    'CANAL_COMUNICACION',
    'FECHA',
    'OBSERVACIONES',
    'GESTION',
    'DESCRIPCION',
    'AGRUPACION',
]

### Now write the file

**If it was a big file, read it completely with this cell**

In [4]:
# Read the complete file in pieces of 1,000,000 rows and stitch the pieces
# back together into a single frame.
reader_options = {
    "sep": sep,
    "encoding": encoding,
    "decimal": decimal,
    "chunksize": 1000000,
}
chunks = pd.read_csv(f"{location}/{table}", **reader_options)
df = pd.concat(chunks)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
# Write one row per column of `df` to a ';'-separated description file:
# column name, "used" flag, null rate (%), dtype, number of distinct values
# and a short summary of the values.
# The `with` block guarantees the file is closed even if a column fails.
with open(f'../../docs/{table} feature description.csv', 'w') as f:
    f.write('Column;Used;Null Rate; Type; Unique values; Values\n')
    for column in df.columns:
        print(column)  # progress indicator: one line per processed column
        series = df[column]

        # Percentage (0-100) of missing values, rounded to two decimals.
        null_rate = round(series.isna().mean()*100, 2)

        unique_vals = series.nunique()

        # A column is flagged as used when it was hand-picked in `to_use`,
        # is almost never null and is not constant.
        # NOTE(review): null_rate is a percentage, so `< .5` means "less than
        # 0.5 % nulls" -- confirm that 50 % was not the intended threshold.
        if (column in to_use) and null_rate < .5 and unique_vals > 1:
            used = 'X'
        else:
            used = ''

        dtype = series.dtype

        if dtype == 'object':
            # Text-like column: list the 10 most frequent values (NaN included).
            values = f"Top 10:\n{series.value_counts(dropna=False).head(10).to_string()}"
        else:
            # Other dtypes: report the [min;max] range.
            values = f'[{series.min()};{series.max()}]'

        # The summary is written as a double-quoted field; double any embedded
        # quote characters so free-text values cannot break the CSV quoting.
        values = values.replace('"', '""')

        f.write(f'{column};{used};{null_rate};{dtype};{unique_vals};"{values}"\n')


ID
CIF_ID
IN_OUT
CANAL_COMUNICACION
FECHA
OBSERVACIONES
GESTION
DESCRIPCION
AGRUPACION
