In [36]:
import re
from pathlib import Path
from dataclasses import dataclass

import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [24]:
@dataclass
class Config:
    """Filesystem locations used throughout the notebook.

    ``data_dir`` and ``output_dir`` are derived from ``REPO_ROOT`` when they
    are not passed explicitly, so ``Config(REPO_ROOT=...)`` relocates all
    three paths together. Values passed explicitly are kept as given
    (coerced to ``Path``).
    """

    # Root directory every derived path hangs off.
    REPO_ROOT: Path = Path("./")
    # Input data directory; ``None`` means "<REPO_ROOT>/data".
    data_dir: "Path | None" = None
    # Output directory; ``None`` means "<REPO_ROOT>/outputs".
    output_dir: "Path | None" = None

    def __post_init__(self):
        """Coerce the paths and fill in the ones derived from ``REPO_ROOT``."""
        # Derive here (per instance) rather than in the class body: class-body
        # defaults are evaluated once and would ignore a REPO_ROOT override.
        self.REPO_ROOT = Path(self.REPO_ROOT)
        if self.data_dir is None:
            self.data_dir = self.REPO_ROOT / "data"
        else:
            self.data_dir = Path(self.data_dir)
        if self.output_dir is None:
            self.output_dir = self.REPO_ROOT / "outputs"
        else:
            self.output_dir = Path(self.output_dir)


config = Config()

### Bike Rentals (Regression)

In [4]:
def preprocess_bike_data(data_dir="./"):
    """Read the daily bike sharing table and return it in analysis-ready form.

    Args:
        data_dir: Directory that contains ``bike+sharing+dataset/day.csv``.

    Returns:
        A DataFrame in which
        * ``weekday``, ``holiday``, ``workingday``, ``season``, ``weathersit``
          and ``mnth`` are ordered categoricals with text labels,
        * ``yr`` is a categorical of calendar years (0 -> 2011, else 2012),
        * ``days_since_2011`` counts days from the earliest ``dteday``,
        * ``temp``, ``windspeed`` and ``hum`` are rescaled with the constants
          used below,
        * ``instant``, ``dteday``, ``registered``, ``casual`` and ``atemp``
          are removed.
    """
    # column -> (raw integer codes, labels listed in the same order)
    label_table = {
        "weekday": (range(7), ["SUN", "MON", "TUE", "WED", "THU", "FRI", "SAT"]),
        "holiday": ([0, 1], ["NO HOLIDAY", "HOLIDAY"]),
        "workingday": ([0, 1], ["NO WORKING DAY", "WORKING DAY"]),
        "season": (range(1, 5), ["WINTER", "SPRING", "SUMMER", "FALL"]),
        "weathersit": (range(1, 4), ["GOOD", "MISTY", "RAIN/SNOW/STORM"]),
        "mnth": (
            range(1, 13),
            ["JAN", "FEB", "MAR", "APR", "MAY", "JUN",
             "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"],
        ),
    }
    dropped_columns = ["instant", "dteday", "registered", "casual", "atemp"]

    csv_path = f"{data_dir}/bike+sharing+dataset/day.csv"
    bike = pd.read_csv(csv_path)

    # Codes outside the listed categories become NaN (pd.Categorical semantics).
    for column, (codes, labels) in label_table.items():
        coded = pd.Categorical(bike[column], categories=codes, ordered=True)
        bike[column] = coded.rename_categories(labels)

    calendar_year = np.where(bike["yr"] == 0, 2011, 2012)
    bike["yr"] = pd.Categorical(calendar_year)

    day = pd.to_datetime(bike["dteday"])
    bike["dteday"] = day
    bike["days_since_2011"] = (day - day.min()).dt.days

    # Undo the normalisation of the weather columns.
    temp_low, temp_high = -8, 39
    atemp_low, atemp_high = 16, 50
    bike["temp"] = bike["temp"] * (temp_high - temp_low) + temp_low
    # NOTE: "atemp" is rescaled for parity with "temp" but dropped on return.
    bike["atemp"] = bike["atemp"] * (atemp_high - atemp_low) + atemp_low
    bike["windspeed"] = 67 * bike["windspeed"]
    bike["hum"] = 100 * bike["hum"]

    return bike.drop(columns=dropped_columns)


# TODO: is this list still needed? Nothing in this cell reads it.
bike_features_of_interest = [
    "season", "holiday", "workingday", "weathersit",
    "temp", "hum", "windspeed", "days_since_2011",
]

# Load the preprocessed table, then separate the target ("cnt") from the features.
df_bike = preprocess_bike_data(data_dir=config.data_dir)
df_bike_y = df_bike["cnt"]
df_bike_X = df_bike.drop(columns=["cnt"])

# 80/20 hold-out split with a fixed seed so the partition is repeatable.
(
    df_bike_X_train,
    df_bike_X_test,
    df_bike_y_train,
    df_bike_y_test,
) = train_test_split(df_bike_X, df_bike_y, test_size=0.2, random_state=42)

In [6]:
df_bike.columns
# memo
# "mnth" and "weekday" are included as extra columns

Index(['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday',
       'weathersit', 'temp', 'hum', 'windspeed', 'cnt', 'days_since_2011'],
      dtype='object')

### YouTube Spam Comments (Text Classification)

In [25]:
def clean_ycomments(html_string):
    """Remove HTML-style tags from one comment.

    Args:
        html_string: The comment text, or a missing value (``None``/``NaN``).

    Returns:
        The text with every ``<...>`` span deleted. Missing values are
        returned unchanged so they stay recognisable as missing.
    """
    if pd.isna(html_string):
        return html_string
    # "[^>]*" also crosses line breaks, so a tag that is split over several
    # lines is removed too ("." in "<.*?>" stops at a newline and would leave
    # such a tag in place).
    return re.sub(r"<[^>]*>", "", html_string)


# TODO: check whether the 5 MV files should be combined into a single file.
ycomments_csv = f"{config.data_dir}/youtube+spam+collection/Youtube01-Psy.csv"
df_ycomments = pd.read_csv(ycomments_csv)

# Step 1: strip markup from every comment.
content_without_tags = df_ycomments["CONTENT"].apply(clean_ycomments)
df_ycomments["CONTENT"] = content_without_tags

# Step 2: keep ASCII only; characters that cannot be encoded are discarded.
content_as_text = df_ycomments["CONTENT"].astype(str)
ascii_bytes = content_as_text.str.encode("ascii", "ignore")
df_ycomments["CONTENT"] = ascii_bytes.str.decode("ascii")

In [28]:
df_ycomments.head()
# memo
# "COMMENT_ID", "AUTHOR" and "DATE" are included as extra columns

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^,1
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .,1


### Risk Factors for Cervical Cancer (Classification)

In [41]:
# Feature columns kept for modelling. The other screening-result columns
# ("Citology", "Schiller", "Hinselmann") are left out simply by not being
# listed here, so no separate drop step is needed.
rfcc_feature_cols = [
    "Age",
    "Number of sexual partners",
    "First sexual intercourse",
    "Num of pregnancies",
    "Smokes",
    "Smokes (years)",
    "Hormonal Contraceptives",
    "Hormonal Contraceptives (years)",
    "IUD",
    "IUD (years)",
    "STDs",
    "STDs (number)",
    "STDs: Number of diagnosis",
    "STDs: Time since first diagnosis",
    "STDs: Time since last diagnosis",
]
rfcc_target_col = "Biopsy"

# The public UCI release of this file writes missing entries as "?". Parsing
# them as NaN keeps the feature columns numeric and lets the imputer below
# actually see them as missing; without it "?" is treated as an ordinary
# (possibly most frequent) value. Harmless if the file contains no "?".
df_rfcc_raw = pd.read_csv(
    f"{config.data_dir}/risk_factors_cervical_cancer.csv", na_values="?"
)

# Target as an ordered categorical: 0 -> "Healthy", 1 -> "Cancer".
rfcc_target = pd.Series(
    pd.Categorical(
        df_rfcc_raw[rfcc_target_col], categories=[0, 1], ordered=True
    ).rename_categories(["Healthy", "Cancer"]),
    index=df_rfcc_raw.index,
    name=rfcc_target_col,
)

# Impute missing values using the most frequent value (mode).
# The imputer is fitted on the feature columns only, never on the target.
imputer = SimpleImputer(strategy="most_frequent")
df_rfcc_imputed = pd.DataFrame(
    imputer.fit_transform(df_rfcc_raw[rfcc_feature_cols]),
    columns=rfcc_feature_cols,
    # Reuse the source index so the concat below aligns row-for-row.
    index=df_rfcc_raw.index,
)

# Concatenate the imputed features and the target back together.
df_rfcc = pd.concat([df_rfcc_imputed, rfcc_target], axis=1)