In [19]:
# data_cleaning.py
# Contains functions to detect and handle inconsistent data and outliers
# Input:
#     - data/educatec_data/merged_educatec_moodle.csv
# Output:
#     - data/working_data/cleaned_data.csv

import os
import pandas as pd
import numpy as np
from pathlib import Path

def load_data(file_path):
    """
    Read a CSV file into a dataframe
    :param file_path: Path - location of the CSV file to read
    :return: pd.DataFrame - contents of the file, parsed with pandas' default settings
    """
    frame = pd.read_csv(file_path)
    return frame

def replace_missing_values(df, missing_markers=None):
    """
    Replace placeholder values that stand for "no data" with pd.NA

    By default the placeholders are 'Sin respuesta', 'Sin fecha', the string '0',
    the number 0 and the empty string. The input dataframe is not modified.

    NOTE(review): the default list treats every numeric 0 as missing, which also
    blanks legitimate zero values - pass missing_markers to narrow this down.

    :param df: pd.DataFrame - dataframe to process
    :param missing_markers: iterable or None - values to treat as missing;
        None keeps the default list described above
    :return: pd.DataFrame - new dataframe with the markers replaced by pd.NA
    """
    if missing_markers is None:
        markers = ['Sin respuesta', 'Sin fecha', '0', 0, '']
    elif isinstance(missing_markers, str):
        # A bare string is one marker, not a sequence of characters
        markers = [missing_markers]
    else:
        markers = list(missing_markers)
    return df.replace(markers, pd.NA)

def detect_inconsistencies(df):
    """
    Flag negative values in every numeric column

    For each numeric column <name>, a boolean column '<name>_inconsistent' is
    added to df itself, set to True where the value is below zero.
    :param df: pd.DataFrame - dataframe to analyze (modified in place)
    :return: pd.DataFrame - the same dataframe, with the flag columns added
    """
    numeric_names = df.select_dtypes(include=[np.number]).columns
    for name in numeric_names:
        flag_name = f'{name}_inconsistent'
        df[flag_name] = df[name].lt(0)
    return df

def impute_missing_values(df):
    """
    Impute missing values in the dataframe

    Numeric columns are filled with the column mean; all other columns are
    filled with their most frequent value (the first one when several tie).
    A non-numeric column with no non-missing values has no mode and is left
    untouched instead of raising.

    The input dataframe may be modified in place; use the returned dataframe.
    :param df: pd.DataFrame - dataframe to process
    :return: pd.DataFrame - dataframe with missing values imputed
    """
    # Impute numerical columns with mean
    for col in df.select_dtypes(include=[np.number]).columns:
        df[col] = df[col].fillna(df[col].mean())

    # Let pandas re-infer better dtypes for object columns before the next step
    df = df.infer_objects(copy=False)

    # Impute categorical columns with mode
    for col in df.select_dtypes(exclude=[np.number]).columns:
        modes = df[col].mode()
        if modes.empty:
            # Every value is missing: there is nothing to impute from
            continue
        df[col] = df[col].fillna(modes.iloc[0])

    # Re-infer dtypes again now that the gaps are filled
    df = df.infer_objects(copy=False)

    return df




def detect_outliers(df, iqr_multiplier=1.5):
    """
    Flag outliers in every numeric column using the interquartile-range rule

    For each numeric column <name>, a boolean column '<name>_outlier' is added
    to df itself, set to True where the value lies below
    Q1 - iqr_multiplier * IQR or above Q3 + iqr_multiplier * IQR.
    :param df: pd.DataFrame - dataframe to analyze (modified in place)
    :param iqr_multiplier: float - width of the accepted band in IQR units
        (default 1.5)
    :return: pd.DataFrame - the same dataframe, with the flag columns added
    """
    for col in df.select_dtypes(include=[np.number]).columns:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        margin = iqr_multiplier * (q3 - q1)
        lower_bound = q1 - margin
        upper_bound = q3 + margin
        df[f'{col}_outlier'] = (values < lower_bound) | (values > upper_bound)
    return df

def cleaning(input_data_path, output_data_path):
    """
    Perform data cleaning on the dataset

    Steps, in order: load the CSV, replace missing-value markers, flag
    inconsistencies, impute missing values, flag outliers, and write the
    result as CSV without the index column.
    :param input_data_path: Path - path to the input dataset
    :param output_data_path: Path - path to save the cleaned dataset
    :return: None
    """
    dataset = load_data(input_data_path)
    dataset = replace_missing_values(dataset)
    dataset = detect_inconsistencies(dataset)
    dataset = impute_missing_values(dataset)
    dataset = detect_outliers(dataset)

    # Make sure the destination folder exists so the write does not fail
    Path(output_data_path).parent.mkdir(parents=True, exist_ok=True)
    dataset.to_csv(output_data_path, index=False)


In [20]:
import os
from pathlib import Path

# Build the data directory locations relative to the current working directory
_data_root = os.path.join(os.getcwd(), "..", "..", "data")
educatec_directory = os.path.join(_data_root, "educatec_data")
moodle_directory = os.path.join(_data_root, "moodle_data")
working_directory = os.path.join(_data_root, "working_data")

# Turn them into Path objects and resolve them to absolute paths
educatec_directory_path, moodle_directory_path, working_directory_path = (
    Path(location).resolve()
    for location in (educatec_directory, moodle_directory, working_directory)
)

# Print the resolved paths so they can be checked
for label, location in (
    ("Educatec Directory:", educatec_directory_path),
    ("Moodle Directory:", moodle_directory_path),
    ("Working Directory:", working_directory_path),
):
    print(label, location)


Educatec Directory: /Users/administrador/Downloads/Leaders4Edu/data/educatec_data
Moodle Directory: /Users/administrador/Downloads/Leaders4Edu/data/moodle_data
Working Directory: /Users/administrador/Downloads/Leaders4Edu/data/working_data


In [21]:
def main():
    """
    Run the cleaning pipeline on every source dataset
    :return: None
    """

    # (source directory, input file name, output file name), processed in order;
    # every cleaned file is written to the working directory
    jobs = [
        (educatec_directory_path, "educatec.csv",
         "cleaned_data_educatec.csv"),
        (moodle_directory_path, "course_modules_completion.csv",
         "cleaned_data_course_modules_completion.csv"),
        (moodle_directory_path, "course_modules.csv",
         "cleaned_data_course_modules.csv"),
        (moodle_directory_path, "user_info_data.csv",
         "cleaned_data_user_info_data.csv"),
        (moodle_directory_path, "users.csv",
         "cleaned_data_users.csv"),
    ]

    for source_directory, input_name, output_name in jobs:
        cleaning(source_directory / input_name, working_directory_path / output_name)

# Run the pipeline only when this code is executed directly, not when imported
if __name__ == "__main__":
    main()


  df[col] = df[col].fillna(df[col].mode()[0])
  df[col] = df[col].fillna(df[col].mode()[0])
