In [1]:
# data_cleaning.py
# Contains functions to detect and handle inconsistent data and outliers
# Input:
#     - data/educatec_data/merged_educatec_moodle.csv
# Output:
#     - data/working_data/cleaned_data.csv

import os
import pandas as pd
import numpy as np
from pathlib import Path

def load_data(file_path):
    """
    Load the csv file from file_path
    :param file_path: Path - path to the csv file
    :return: pd.DataFrame - loaded dataframe
    """
    return pd.read_csv(file_path)

def replace_missing_values(df):
    """
    Replace 'sin respuesta' and 'sin fecha' with NaN
    :param df: pd.DataFrame - dataframe to process
    :return: pd.DataFrame - dataframe with missing values replaced
    """
    return df.replace(['Sin respuesta', 'Sin fecha','0',0,''], pd.NA)

def detect_inconsistencies(df):
    """
    Detect and report inconsistencies in the dataset
    :param df: pd.DataFrame - dataframe to analyze
    :return: pd.DataFrame - dataframe with inconsistencies marked
    """
    # Example inconsistency detection: Check for negative values in columns that should only have positives
    for col in df.select_dtypes(include=[np.number]).columns:
        df[f'{col}_inconsistent'] = df[col] < 0
    return df

def impute_missing_values(df):
    """
    Impute missing values in the dataframe
    :param df: pd.DataFrame - dataframe to process
    :return: pd.DataFrame - dataframe with missing values imputed
    """
    # Example: Impute numerical columns with mean and categorical with mode
    for col in df.columns:
        if df[col].dtype in [np.float64, np.int64]:
            df[col].fillna(df[col].mean(), inplace=True)
        else:
            df[col].fillna(df[col].mode()[0], inplace=True)
    return df

def detect_outliers(df):
    """
    Detect and report outliers in the dataset
    :param df: pd.DataFrame - dataframe to analyze
    :return: pd.DataFrame - dataframe with outliers marked
    """
    # Example outlier detection using IQR method
    for col in df.select_dtypes(include=[np.number]).columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df[f'{col}_outlier'] = (df[col] < lower_bound) | (df[col] > upper_bound)
    return df

def cleaning(input_data_path, output_data_path):
    """
    Perform data cleaning on the dataset
    :param input_data_path: Path - path to the input dataset
    :param output_data_path: Path - path to save the cleaned dataset
    :return: None
    """
    # Load data
    dataset = load_data(input_data_path)
    # Replace missing values
    dataset = replace_missing_values(dataset)
    # Detect inconsistencies
    dataset = detect_inconsistencies(dataset)
    # Impute missing values
    dataset = impute_missing_values(dataset)
    # Detect outliers
    dataset = detect_outliers(dataset)
    # Save the cleaned dataframe to a csv file
    dataset.to_csv(output_data_path, index=False)



In [7]:
def main():
    """
    Main function
    :return: None
    """

    print("ESTAMOS AQUI: "+os.getenv('PATH'))
    # Define the path to the MIT dataset using the first method
    dataset_directory = os.path.join(
        os.getcwd(), "..", "..", "dataset", "original-data", "MIT-1"
    )
    print(f"Dataset directory using method 1: {dataset_directory}")

    # Define the path to the MIT dataset using the second method
    project_directory = Path(os.getcwd()).resolve().parents[1]
    dataset_directory_2 = project_directory / "dataset" / "original-data" / "MIT-1"
    print(f"Dataset directory using method 2: {dataset_directory_2}")

    # Paths
    base_data_path = Path("data")
    
    input_data_path = base_data_path / "educatec_data" / "educatec.csv"
    output_data_path = base_data_path / "working_data" / "cleaned_data_educatec.csv"
    cleaning(input_data_path, output_data_path)
    
    input_data_path = base_data_path / "moodle_data" / "course_modules_completion.csv"
    output_data_path = base_data_path / "working_data" / "cleaned_data_course_modules_completion.csv"
    cleaning(input_data_path, output_data_path)

    input_data_path = base_data_path / "moodle_data" / "course_modules.csv"
    output_data_path = base_data_path / "working_data" / "cleaned_data_course_modules.csv"
    cleaning(input_data_path, output_data_path)

    input_data_path = base_data_path / "moodle_data" / "user_info_data.csv"
    output_data_path = base_data_path / "working_data" / "cleaned_data_user_info_data.csv"
    cleaning(input_data_path, output_data_path)

    input_data_path = base_data_path / "moodle_data" / "users.csv"
    output_data_path = base_data_path / "working_data" / "cleaned_data_users.csv"
    cleaning(input_data_path, output_data_path)

if __name__ == "__main__":
    main()

ESTAMOS AQUI: /Users/administrador/.pyenv/versions/3.9.18/bin:/Users/administrador/.npm-global/bin:/usr/local/bin:/Library/Frameworks/Python.framework/Versions/3.12/bin:/usr/local/bin:/System/Cryptexes/App/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/local/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/appleinternal/bin:/Library/Apple/usr/bin:/Library/TeX/texbin:/Applications/VMware Fusion.app/Contents/Public:/usr/local/share/dotnet:~/.dotnet/tools:/Library/Frameworks/Mono.framework/Versions/Current/Commands:/opt/maven/apache-maven-3.8.7/bin
Dataset directory using method 1: /Users/administrador/Downloads/Leaders4Edu/src/quality_evaluation/../../dataset/original-data/MIT-1
Dataset directory using method 2: /Users/administrador/Downloads/Leaders4Edu/dataset/original-data/MIT-1


FileNotFoundError: [Errno 2] No such file or directory: 'data/educatec_data/educatec.csv'

No encuentra el fichero no se por qu√©