# Set Up

In [None]:
# Import libraries
import os
import re
from datetime import datetime

import numpy as np
import pandas as pd

### Read in Data

In [None]:
# Input/output folders for the report CSVs. The defaults are the original
# author's local folders; set NIMH_INPUT_PATH / NIMH_OUTPUT_PATH in the
# environment to run this notebook elsewhere without editing the cell.
input_path = os.environ.get(
    "NIMH_INPUT_PATH",
    "/Users/minji.kang/Documents/NGDT/Data_export_management/Report_CSV_Preprocessing_Generic_Script/NIMH/input/",
)
output_path = os.environ.get(
    "NIMH_OUTPUT_PATH",
    "/Users/minji.kang/Documents/NGDT/Data_export_management/Report_CSV_Preprocessing_Generic_Script/NIMH/output/",
)


def read_and_bind_df(mypath):
    """Read every ``report*`` file in *mypath* and stack them into one frame.

    Files are read in sorted name order so the row order of the result is
    reproducible (``os.listdir`` returns entries in arbitrary order). Each
    file is decoded as ISO-8859-1, and the first column of the combined frame
    is renamed to ``"id"``.

    Args:
        mypath: Directory that contains the exported report CSV files.

    Returns:
        One DataFrame holding the rows of all report files, with a fresh
        0..n-1 index and ``"id"`` as the name of its first column.

    Raises:
        ValueError: If no file name in *mypath* starts with ``report``.
    """
    report_files = sorted(
        name for name in os.listdir(mypath) if name.startswith("report")
    )
    if not report_files:
        # pd.concat would fail with an opaque "No objects to concatenate".
        raise ValueError(f"No files starting with 'report' found in {mypath!r}")

    frames = [
        pd.read_csv(os.path.join(mypath, name), encoding="ISO-8859-1")
        for name in report_files
    ]
    report = pd.concat(frames, ignore_index=True)
    # NOTE(review): presumably the first header arrives mangled (e.g. a BOM
    # decoded as ISO-8859-1), hence the rename by position - confirm.
    return report.rename(columns={report.columns[0]: "id"})


# Load every report file from the input folder into one combined frame.
df = read_and_bind_df(input_path)
# Optional export of the combined raw data (disabled):
# df.to_csv(os.path.join('/Users/minji.kang/','report_all.csv'),index=False)

In [None]:
# df.to_csv(os.path.join('/Users/minji.kang/','report_all.csv'),index=False)

### Add Timezone offset

In [None]:
def add_timezone_offset(mydata, columntoaddto):
    """Return *columntoaddto* shifted by each row's ``timezone_offset``.

    Both columns are coerced to numbers; entries that cannot be parsed become
    NaN. A missing or unparseable offset counts as 0. The offset is multiplied
    by 60 * 1000 before being added.
    NOTE(review): this looks like minutes -> milliseconds applied to epoch-ms
    timestamps - confirm the units against the export format.

    Args:
        mydata: DataFrame holding *columntoaddto* and ``timezone_offset``.
        columntoaddto: Name of the column to shift.

    Returns:
        A numeric Series with the shifted values (NaN where the source value
        could not be parsed).
    """
    timestamps = pd.to_numeric(mydata[columntoaddto], errors="coerce")
    offsets = pd.to_numeric(mydata["timezone_offset"], errors="coerce").fillna(0)
    shift = offsets * 60 * 1000
    return timestamps + shift


# Store the offset-adjusted copy of each raw activity timestamp column under
# its own column name.
for _shifted_col, _raw_col in (
    ("start_Time", "activity_start_time"),
    ("end_Time", "activity_end_time"),
    ("schedule_Time", "activity_scheduled_time"),
):
    df[_shifted_col] = add_timezone_offset(df, _raw_col)

### Group by min start_Time & max end_Time

In [None]:
# dat_processed = df.groupby(['secret_user_id', 'activity_flow_id', 'activity_scheduled_time'], group_keys=True).apply(lambda x: x.assign(start_Time=x['start_Time'].min(), end_Time=x['end_Time'].max())).reset_index(drop=True)

### Score options replacements and removing unnecessary characters

In [None]:
def val_score_mapping(data):
    """Map each response to the score(s) defined in its ``options`` text.

    For every row:

    * a non-string response yields NaN;
    * if ``options`` is a string containing ``"score: "`` and the response
      has a ``": "`` separator, each comma-separated response value is
      replaced by its score (``"N/A"`` when the value has no score) and the
      scores are joined with ``", "``;
    * otherwise the response is returned with the ``"value: "`` / ``"geo: "``
      markers removed.

    Rows are processed by position, so *data* may carry any index; the
    result is aligned to ``data.index``.

    Args:
        data: DataFrame with ``options`` and ``response`` columns.

    Returns:
        A Series of mapped/cleaned responses indexed like *data*.
    """
    results = []

    for options, response in zip(data["options"], data["response"]):
        if not isinstance(response, str):
            results.append(np.nan)
            continue

        cleaned = re.sub(r"value: |geo: ", "", response)

        if not isinstance(options, str) or not re.search(r"score: ", options):
            results.append(cleaned)
            continue

        response_parts = response.strip().split(": ")
        if len(response_parts) < 2:
            # No "<label>: <values>" structure to map; keep the cleaned text
            # instead of failing on the missing separator.
            results.append(cleaned)
            continue

        # Build {option value -> score} from "<name>: <value> (score: <n>)".
        scores = {}
        for option in options.strip().split("),"):
            pieces = option.split("(score")
            if len(pieces) != 2 or ": " not in pieces[0] or ": " not in pieces[1]:
                continue
            # The value follows the last ": ", so a name containing ": "
            # does not shift it.
            value = pieces[0].rsplit(": ", 1)[1].strip()
            scores[value] = pieces[1].split(": ")[1].strip(" )")

        # Keyed by value: a value listed twice contributes a single score.
        mapped = {
            token.strip(): scores.get(token.strip(), "N/A")
            for token in response_parts[1].split(",")
        }
        results.append(", ".join(mapped.values()))

    return pd.Series(results, index=data.index)


# df['response_scores'] = val_score_mapping(df)

### Formatting time and time_range items

In [None]:
_FR_TIME_RE = re.compile(r"time:\s*hr\s*(\d{1,2}),\s*min\s*(\d{1,2})\s*$")
_FR_TIME_RANGE_NOISE_RE = re.compile(r"[a-zA-Z\s+(\)_:]")


def _fr_format_time(response):
    """Return a ``time: hr H, min M`` entry as ``HH:MM``, or NaN if malformed."""
    match = _FR_TIME_RE.search(response)
    if match is None:
        return np.nan
    hour, minute = int(match.group(1)), int(match.group(2))
    if hour > 23 or minute > 59:
        return np.nan
    return f"{hour:02d}:{minute:02d}"


def _fr_format_time_range(response):
    """Return a ``time_range:`` entry as ``HH:MM/HH:MM``, or NaN if malformed."""
    # Drop letters, whitespace and punctuation so only "H,M/H,M" is left.
    compact = _FR_TIME_RANGE_NOISE_RE.sub("", response).replace(",", ":")
    formatted_parts = []
    for part in compact.split("/"):
        pieces = part.split(":")
        if len(pieces) != 2:
            return np.nan
        hours, minutes = pieces
        formatted_parts.append(f"{hours.zfill(2)}:{minutes.zfill(2)}")
    return "/".join(formatted_parts)


def format_responses(data):
    """Normalise ``time:`` and ``time_range:`` responses to zero-padded clocks.

    * ``"time: hr 9, min 5"`` becomes ``"09:05"``.
    * ``"time_range: ..."`` entries become ``"HH:MM/HH:MM"``.
    * Time entries that cannot be parsed become NaN; each row is parsed on
      its own, so a malformed row never reuses the value of an earlier row.
    * Every other response, including non-string values, is returned as is.

    Args:
        data: DataFrame with a ``response`` column.

    Returns:
        A Series of formatted responses indexed like *data*.
    """
    formatted_responses = []

    for response in data["response"]:
        if not isinstance(response, str):
            formatted_responses.append(response)
        elif "time:" in response:
            formatted_responses.append(_fr_format_time(response))
        elif "time_range:" in response:
            formatted_responses.append(_fr_format_time_range(response))
        else:
            formatted_responses.append(response)

    return pd.Series(formatted_responses, index=data.index)


# df['formatted_responses'] = format_responses(df)

### Converting epoch time to regular time

In [None]:
def format_epochtime(data, column_name):
    """Convert a column of epoch milliseconds to pandas timestamps.

    Values are coerced to numbers first; anything that cannot be parsed
    becomes NaT in the result.

    Args:
        data: DataFrame holding *column_name*.
        column_name: Name of the column with epoch-millisecond values.

    Returns:
        A datetime Series indexed like *data*.
    """
    epoch_ms = pd.to_numeric(data[column_name], errors="coerce")
    # Parse as milliseconds directly: dividing by 1000 first produces float
    # seconds, which cannot represent every millisecond exactly.
    return pd.to_datetime(epoch_ms, unit="ms")


# Replace the offset-adjusted epoch values with datetimes, column by column.
for _time_col in ("start_Time", "end_Time", "schedule_Time"):
    df[_time_col] = format_epochtime(df, _time_col)

### Processing Responses Function (combined score mapping+cleaning and formatting)

In [None]:
_PR_GEO_RE = re.compile(r"geo:\s*lat\s*\((.*?)\)\s*/\s*long\s*\((.*?)\)")
_PR_TIME_RE = re.compile(r"time:\s*hr\s*(\d{1,2}),\s*min\s*(\d{1,2})\s*$")
_PR_TIME_RANGE_NOISE_RE = re.compile(r"[a-zA-Z\s+(\)_:]")


def _pr_format_time(response):
    """Return a ``time: hr H, min M`` entry as ``HH:MM``, or NaN if malformed."""
    match = _PR_TIME_RE.search(response)
    if match is None:
        return np.nan
    hour, minute = int(match.group(1)), int(match.group(2))
    if hour > 23 or minute > 59:
        return np.nan
    return f"{hour:02d}:{minute:02d}"


def _pr_format_time_range(response):
    """Return a ``time_range:`` entry as ``HH:MM/HH:MM``, or NaN if malformed."""
    # Drop letters, whitespace and punctuation so only "H,M/H,M" is left.
    compact = _PR_TIME_RANGE_NOISE_RE.sub("", response).replace(",", ":")
    formatted_parts = []
    for part in compact.split("/"):
        pieces = part.split(":")
        if len(pieces) < 2:
            return np.nan
        formatted_parts.append(f"{pieces[0].zfill(2)}:{pieces[1].zfill(2)}")
    return "/".join(formatted_parts)


def _pr_map_scores(response, options):
    """Return the joined scores for *response*, or None if it cannot be mapped."""
    response_parts = response.strip().split(": ")
    if len(response_parts) < 2:
        return None

    # Build {option value -> score} from "<name>: <value> (score: <n>)".
    scores = {}
    for option in options.strip().split("),"):
        pieces = option.split("(score")
        if len(pieces) != 2 or ": " not in pieces[0] or ": " not in pieces[1]:
            continue
        # The value follows the last ": ", so a name containing ": " does
        # not shift it.
        value = pieces[0].rsplit(": ", 1)[1].strip()
        scores[value] = pieces[1].split(": ")[1].strip(" )")

    # Keyed by value: a value listed twice contributes a single score.
    mapped = {
        token.strip(): scores.get(token.strip(), "N/A")
        for token in response_parts[1].split(",")
    }
    return ", ".join(mapped.values())


def _pr_process_one(response, options, clean, map_scores, format_time):
    """Return the single processed value for one response/options pair."""
    if not isinstance(response, str):
        if pd.isna(response):
            return np.nan
        response = str(response)

    if clean:
        geo_match = _PR_GEO_RE.search(response)
        if geo_match:
            return f"{geo_match.group(1).strip()}/{geo_match.group(2).strip()}"

    if format_time:
        if "time:" in response:
            return _pr_format_time(response)
        if "time_range:" in response:
            return _pr_format_time_range(response)

    if map_scores and isinstance(options, str) and re.search(r"score: ", options):
        mapped = _pr_map_scores(response, options)
        if mapped is not None:
            return mapped

    if clean:
        return re.sub(r"value:", "", response).strip()
    return response


def process_responses(data, clean=True, map_scores=True, format_time=True):
    """Clean, time-format and score-map the ``response`` column in one pass.

    Exactly one value is produced per input row, chosen by the first rule
    that applies:

    1. Missing responses become NaN; other non-string values are converted
       with ``str`` and then treated like any string.
    2. With ``clean``: ``geo: lat (..) / long (..)`` becomes ``"lat/long"``.
    3. With ``format_time``: ``time:`` entries become ``HH:MM`` and
       ``time_range:`` entries become ``HH:MM/HH:MM``; entries that cannot
       be parsed become NaN.
    4. With ``map_scores``: when ``options`` contains ``"score: "`` and the
       response has a ``": "`` separator, each comma-separated value is
       replaced by its score (``"N/A"`` if it has none), joined by ``", "``.
    5. Otherwise the response is returned, with the ``value:`` marker and
       surrounding whitespace removed when ``clean`` is set.

    Args:
        data: DataFrame with a ``response`` column and, optionally, an
            ``options`` column.
        clean: Apply the geo and ``value:`` clean-up.
        map_scores: Replace option values by their scores.
        format_time: Normalise time and time-range entries.

    Returns:
        A Series with one processed value per row, indexed like *data* so it
        can be assigned back to *data* (or a filtered subset) without
        misalignment.
    """
    responses = data["response"]
    options_col = data["options"] if "options" in data else [None] * len(responses)

    processed_responses = [
        _pr_process_one(response, options, clean, map_scores, format_time)
        for response, options in zip(responses, options_col)
    ]
    return pd.Series(processed_responses, index=data.index)


# Working copy of the data with the processed responses added as a column.
dat_processed = df.assign(new_responses=process_responses(df))

### Widening Function

In [None]:
# Working copy of the data with the processed responses added as a column.
dat_processed = df.assign(new_responses=process_responses(df))

# Column whose values fill the cells of the widened table.
myresponse_column_name = "new_responses"

# Columns that identify one row of the widened table.
mycolumn_list = [
    # respondent
    "userId",
    "secret_user_id",
    # source user
    "source_user_secret_id",
    "source_user_nickname",
    "source_user_tag",
    "source_user_relation",
    # target user
    "target_user_secret_id",
    "target_user_nickname",
    "target_user_tag",
    # input user
    "input_user_secret_id",
    "input_user_nickname",
    # timing
    "schedule_Time",
    "start_Time",
    "end_Time",
    # flow / event / version
    "activity_flow_id",
    "activity_flow_name",
    "event_id",
    "version",
]


def widen_data(data, column_list, response_column_name):
    """Pivot long-format responses into one row per *column_list* combination.

    Steps:

    1. Missing values are filled so no row is lost as a NaN group key:
       datetime columns get the sentinel ``1900-01-01``, the remaining
       *column_list* columns get ``""``.
    2. Within each (``secret_user_id``, ``activity_flow_id``,
       ``activity_scheduled_time``) group, ``start_Time`` is set to the
       group minimum and ``end_Time`` to the group maximum.
    3. One output column is created per ``activity_id``/``item_id``/``item``
       combination (joined with ``"_"``), holding the last response seen.
    4. The ``id`` values of the rows behind each output row are joined with
       ``"|"`` into an ``id`` column.

    The input frame is not modified.

    Args:
        data: Long-format DataFrame. Besides *column_list* and
            *response_column_name* it must contain ``id``, ``activity_id``,
            ``item_id``, ``item``, ``secret_user_id``, ``activity_flow_id``,
            ``activity_scheduled_time``, ``start_Time`` and ``end_Time``.
        column_list: Columns that identify one row of the wide table.
        response_column_name: Column whose values fill the wide cells.

    Returns:
        The wide DataFrame: *column_list*, one column per item, and ``id``.
    """
    # Work on a private copy so the caller's frame is never modified.
    data = data.copy()

    # Fill datetime columns with the sentinel first, and only then fill the
    # remaining key columns with "", so no datetime column is ever asked to
    # hold an empty string.
    datetime_cols = data.select_dtypes(include=["datetime"]).columns
    if len(datetime_cols):
        data[datetime_cols] = data[datetime_cols].fillna(pd.Timestamp("1900-01-01"))
    datetime_names = set(datetime_cols)
    text_fill_cols = [col for col in column_list if col not in datetime_names]
    if text_fill_cols:
        data[text_fill_cols] = data[text_fill_cols].fillna("")

    session_keys = ["secret_user_id", "activity_flow_id", "activity_scheduled_time"]
    # Rows with a missing session key cannot be grouped and are excluded,
    # exactly as a default groupby would exclude them.
    # NOTE(review): confirm that dropping such rows is intended.
    data = data.dropna(subset=session_keys).reset_index(drop=True)

    # transform() broadcasts the per-session min/max back onto every row
    # without a DataFrame-level groupby.apply.
    sessions = data.groupby(session_keys)
    session_start = sessions["start_Time"].transform("min")
    session_end = sessions["end_Time"].transform("max")
    data["start_Time"] = session_start
    data["end_Time"] = session_end

    answers = (
        data.groupby(column_list, group_keys=False)["id"]
        .apply(lambda ids: "|".join(ids.astype(str)))
        .reset_index()
    )

    data["combined_cols"] = (
        data["activity_id"].astype(str)
        + "_"
        + data["item_id"].astype(str)
        + "_"
        + data["item"].astype(str)
    )

    dat_subset = data[column_list + ["combined_cols", response_column_name]]
    dat_wide = pd.pivot_table(
        dat_subset,
        index=column_list,
        columns="combined_cols",
        values=response_column_name,
        aggfunc="last",
    ).reset_index()

    # Outer merge keeps key combinations that have ids but no pivoted row.
    return pd.merge(dat_wide, answers, on=column_list, how="outer")


# Widen the processed data and write the result to the output folder.
_wide_all_csv = os.path.join(output_path, "data_wide_all.csv")
data_wide = widen_data(dat_processed, mycolumn_list, myresponse_column_name)
data_wide.to_csv(_wide_all_csv, index=False)

data_wide.head()

In [None]:
# Keep only the rows that have no activity_flow_id.
# .copy() detaches the subset from df so the column assignment below writes
# to an independent frame, and reset_index gives it a 0..n-1 index so that a
# Series carrying a default 0..n-1 index lines up row-for-row when assigned.
activities_only = df[df["activity_flow_id"].isna()].copy().reset_index(drop=True)
activities_only["new_responses"] = process_responses(activities_only)

# Columns that identify one row of the widened table.
mycolumn_list = [
    "userId",
    "secret_user_id",
    "source_user_secret_id",
    "source_user_nickname",
    "source_user_tag",
    "source_user_relation",
    "target_user_secret_id",
    "target_user_nickname",
    "target_user_tag",
    "input_user_secret_id",
    "input_user_nickname",
    "schedule_Time",
    "start_Time",
    "end_Time",
    "activity_flow_id",
    "activity_flow_name",
    "event_id",
    "version",
]
# Column whose values fill the cells of the widened table.
myresponse_column_name = "new_responses"

data_wide2 = widen_data(activities_only, mycolumn_list, myresponse_column_name)
data_wide2.to_csv(
    os.path.join(output_path, "data_wide_activities_only.csv"), index=False
)

data_wide2.head()