# Missing values report
This notebook contains functions to create a report of missing values in each column of the dataset.

## Input and output files:
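- **Input**:
    - data/educatec_data/educatec.csv
    - data/moodle_data/course_modules_completion.csv
    - data/moodle_data/course_modules.csv
    - data/moodle_data/user_info_data.csv
    - data/moodle_data/users.csv
- **Output**: one report per input file, written to data/working_data/:
    - missing_values_report_educatec.csv
    - missing_values_report_moodle_completion.csv
    - missing_values_report_moodle_course.csv
    - missing_values_report_user_info_data.csv
    - missing_values_report_users.csv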

The format of the output CSV file is:

    Column: Name of the column
    Missing Count: Number of missing values
    Missing Percentage: Percentage of missing values, stored as text with a trailing % sign (e.g. 12.5%)

In [6]:

import os
import pandas as pd
from pathlib import Path

def load_data(file_path):
    """
    Read a CSV file into a pandas DataFrame.

    :param file_path: Path - location of the csv file, handed unchanged to pd.read_csv
    :return: pd.DataFrame - the file contents parsed with pandas' default options
    """
    frame = pd.read_csv(file_path)
    return frame

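Identify and report the number and percentage of missing values in each column. In addition to real nulls, the placeholder values 'Sin respuesta', 'Sin fecha', 0, '0' and empty strings are counted as missing.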

In [7]:
def missing_values_report(df, placeholders=None):
    """
    Count the missing values in every column of ``df``.

    Real nulls are counted together with placeholder values that stand in for
    "no data". By default those placeholders are 'Sin respuesta', 'Sin fecha',
    0, '0' and the empty string.
    NOTE(review): treating 0 / '0' as missing also affects columns where zero
    is a legitimate value - confirm this is intended for every dataset.

    :param df: pd.DataFrame - data to inspect; it is NOT modified by this function
    :param placeholders: optional iterable of values to treat as missing;
        when None the default list above is used
    :return: pd.DataFrame with one row per column of ``df`` and the columns
        'Column', 'Missing Count' and 'Missing Percentage' (text such as '12.5%')
    """
    if placeholders is None:
        markers = ['Sin respuesta', 'Sin fecha', 0, '0', '']
    else:
        markers = list(placeholders)

    # replace() without inplace returns a new frame, so the caller's data keeps
    # its original values instead of being silently overwritten with NA.
    cleaned = df.replace(markers, pd.NA) if markers else df

    # Calculate missing data
    missing_data = cleaned.isnull().sum().reset_index()
    missing_data.columns = ['Column', 'Missing Count']

    total_rows = len(cleaned)
    if total_rows:
        percentage = (missing_data['Missing Count'] / total_rows) * 100
    else:
        # A frame without rows would give 0/0 -> 'nan%'; report 0.0% instead.
        percentage = pd.Series(0.0, index=missing_data.index)

    missing_data['Missing Percentage'] = percentage.round(2).astype(str) + '%'
    return missing_data

Create the report and save it

In [8]:
def reports(input_data_path,output_data_path):
    """
    Build the missing-values report for one CSV file and save it as CSV.

    :param input_data_path: path of the csv file to analyse (passed to load_data)
    :param output_data_path: destination of the report; written with
        DataFrame.to_csv(index=False)
    :return: pd.DataFrame - the report produced by missing_values_report
    """
    # Load data
    dataset = load_data(input_data_path)

    # Generate the report of missing values
    missing_report = missing_values_report(dataset)

    # to_csv does not create missing folders, so make sure the destination
    # directory exists. Only done for path-like targets; anything else
    # (e.g. an open buffer) is handed to to_csv untouched.
    if isinstance(output_data_path, (str, os.PathLike)):
        Path(output_data_path).parent.mkdir(parents=True, exist_ok=True)

    # Save the report to a csv file
    missing_report.to_csv(output_data_path, index=False)
    return missing_report

Configure relative paths for the data directories

In [9]:
# Build the data directory locations relative to the current working
# directory: two levels up, then into "data/<folder>".
educatec_directory, moodle_directory, working_directory = (
    os.path.join(os.getcwd(), "..", "..", "data", folder_name)
    for folder_name in ("educatec_data", "moodle_data", "working_data")
)

# Turn each string into a Path object and resolve it to an absolute path
educatec_directory_path, moodle_directory_path, working_directory_path = (
    Path(raw_location).resolve()
    for raw_location in (educatec_directory, moodle_directory, working_directory)
)

# Print the resolved paths so they can be checked
print("Educatec Directory:", educatec_directory_path)
print("Moodle Directory:", moodle_directory_path)
print("Working Directory:", working_directory_path)

Educatec Directory: /Users/administrador/Downloads/Leaders4Edu/data/educatec_data
Moodle Directory: /Users/administrador/Downloads/Leaders4Edu/data/moodle_data
Working Directory: /Users/administrador/Downloads/Leaders4Edu/data/working_data


Main function

In [11]:

def main():
    """
    Generate and save the missing-values report for every configured dataset.

    Each job names the directory and file to read and the report file to
    write; all reports go to ``working_directory_path``. The directory paths
    are the module-level variables defined in the path configuration cell.
    """
    # (input directory, input file name, report file name)
    report_jobs = [
        (educatec_directory_path, "educatec.csv",
         "missing_values_report_educatec.csv"),
        (moodle_directory_path, "course_modules_completion.csv",
         "missing_values_report_moodle_completion.csv"),
        (moodle_directory_path, "course_modules.csv",
         "missing_values_report_moodle_course.csv"),
        (moodle_directory_path, "user_info_data.csv",
         "missing_values_report_user_info_data.csv"),
        (moodle_directory_path, "users.csv",
         "missing_values_report_users.csv"),
    ]

    for input_directory, input_name, report_name in report_jobs:
        input_data_path = input_directory / input_name
        output_data_path = working_directory_path / report_name
        reports(input_data_path,output_data_path)

if __name__ == "__main__":
    main()