# Generic EDA for exploring new Datasets

Pandas-based for small datasets or samples

---

raul.arrabales@gmail.com - Jun '23

### Libs

In [8]:
# dataset manipulation
import pandas as pd 
import numpy as np

# PDF report generation
from fpdf import FPDF

### Misc Config

In [85]:
# Random seed
r_seed = 41

### Helper functions

In [152]:
def duoprint(text, pdf, marker=False):
    """ Prints the input both in the PDF and output
    """
    if (marker):
        pdf.set_font('Arial', 'B', 12)
        pdf.set_fill_color(r=248, g=230, b=145)
    else:
        pdf.set_font('Arial', '', 12)
    pdf.cell(w=0, h=10, txt=text, border=0, ln=1, fill=marker)
    print(text)

In [150]:
def blank_line(pdf):
    pdf.cell(w=0, h=10, txt="   ", border=0, ln=1)

### PDF Report formatting

In [154]:
# Document author
pdf_author = "raul.arrabales"

# Margin
pdf_margin = 10 

# Page width: Width of A4 is 210mm
pdf_width = 210 - 2*pdf_margin

# Cell height
pdf_cell_heigth = 50

# Create empty doc
pdf = FPDF()
pdf.add_page()

# Managing dates
from datetime import datetime

### Dataset loading

In [155]:
prolexitim_dataset_path = "data/Prolexitim_Jan23_En.csv"
df = pd.read_csv(prolexitim_dataset_path,header=0,delimiter=",")
# df.sample(4)

In [156]:
pdf.set_font('Arial', 'B', 14)
pdf.cell(w=0, h=14, txt=
         "EDA for dataset: " + prolexitim_dataset_path, border=1, ln=1)
pdf.set_font('Arial', '', 12)
pdf.cell(w=0, h=8, txt=
         "Generated: " + datetime.now().strftime("%d/%m/%Y %H:%M:%S"), border=0, ln=1)
pdf.cell(w=0, h=8, txt=
         "By: " + pdf_author, border=0, ln=1)

In [171]:
print(f"{}",len(df['TAS20'].unique()) / n_rows * 100)

SyntaxError: f-string: empty expression not allowed (2699179606.py, line 1)

### Basic Dataset exploration

In [162]:
blank_line(pdf)
pdf.set_font('Arial', 'B', 12)
pdf.cell(w=0, h=10, txt="Dataset Dimensions", border=1, ln=1)

n_rows = len(df)
row_info = "Number of rows: " + str(n_rows)
print(row_info)
pdf.set_font('Arial', '', 12)
pdf.cell(w=0, h=8, txt=row_info, border=0, ln=1)

n_cols = df.shape[1]
col_info = "Number of columns: " + str(n_cols)
print(col_info)
pdf.cell(w=0, h=8, txt=col_info, border=0, ln=1)


Number of rows: 1011
Number of columns: 36


In [173]:
pdf.set_font('Arial', 'B', 12)
pdf.cell(w=0, h=10, txt="   ", border=0, ln=1)
pdf.cell(w=0, h=10, txt="Dataset Contents", border=1, ln=1)

for c in df.columns: 
    blank_line(pdf)
    
    col_name = "Column: " + c
    duoprint(col_name, pdf, marker=True)
    
    type_info = "Column Type: " + str(df[c].dtype)
    duoprint(type_info, pdf)
        
    sample_info = "Sample value: " + str(df[c].sample(n=1, ignore_index=True, random_state=r_seed).iloc[-1])
    duoprint(sample_info, pdf)
    
    num_unique = len(df[c].unique())
    num_unique_info = "Number of unique values: " + str(num_unique)
    duoprint(num_unique_info, pdf)
    
    pct_unique = (num_unique/n_rows)*100
    pct_unique_info = f'Percentage of unique values: {pct_unique:.2f}%'
    duoprint(pct_unique_info, pdf)
    
    print("---")

Column: Timestamp
Column Type: object
Sample value: 2021-03-17T06:09:29.827Z
Number of unique values: 1011
Percentage of unique values: 100.00%
---
Column: TAS20
Column Type: int64
Sample value: 74
Number of unique values: 65
Percentage of unique values: 6.43%
---
Column: F1
Column Type: int64
Sample value: 29
Number of unique values: 29
Percentage of unique values: 2.87%
---
Column: F2
Column Type: int64
Sample value: 22
Number of unique values: 21
Percentage of unique values: 2.08%
---
Column: F3
Column Type: int64
Sample value: 23
Number of unique values: 31
Percentage of unique values: 3.07%
---
Column: Gender
Column Type: int64
Sample value: 2
Number of unique values: 3
Percentage of unique values: 0.30%
---
Column: Age
Column Type: object
Sample value: 18
Number of unique values: 46
Percentage of unique values: 4.55%
---
Column: Code
Column Type: object
Sample value: 051447cdd1451f552c6cec85e29564d3
Number of unique values: 947
Percentage of unique values: 93.67%
---
Column: Alex

In [160]:
pdf.output(f'./EDA_report.pdf', 'F')

''

### Data Quality check

#### Label quality check (for supervised learning)

### Correlational Analysis