In [1]:
import pandas as pd
# Load ShellData.csv into a DataFrame and preview its first rows.
df = pd.read_csv("ShellData.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,TagName,EventTime,Status,Value
0,0,A2PS64V0J.:ZUX09R,2024-01-02 20:03:46.000,Good,0.34
1,1,A2PS64V0J.:ZUX09R,2024-01-02 16:00:12.000,Good,0.15
2,2,A2PS64V0J.:ZUX09R,2024-01-02 11:56:42.000,Good,0.13
3,3,A2PS64V0J.:ZUX09R,2024-01-02 07:53:11.000,Good,0.12
4,4,A2PS64V0J.:ZUX09R,2024-01-02 03:49:45.000,Good,0.13


In [2]:
# Remove the "Unnamed: 0" column. errors="ignore" keeps this cell re-runnable:
# df is rebound to the result, so on a second run the column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [3]:
def separate_textual_attrivutes_from_numerical_column(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Split a mixed text/number column into a numeric column and a text column.

    Every value of ``column`` is handled as follows:
      - strings that ``float()`` can parse (e.g., "3.14", "1e-5") become floats
      - any other string is copied to the new column ``<column>_str`` and is
        replaced by the sentinel -1 in ``column``
      - non-string values are kept as they are

    ``<column>_str`` is always created; rows without a textual value hold the
    string "NaN" there.

    Note: the -1 sentinel cannot be told apart from a genuine -1 value by
    looking at ``column`` alone; use ``<column>_str`` to identify replaced rows.

    Parameters:
        df: input DataFrame; it is not modified.
        column: name of the column to split.

    Returns:
        A modified copy of ``df`` with ``column`` cleaned and ``<column>_str`` added.

    Raises:
        ValueError: if ``column`` is not a column of ``df``.
    """

    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")

    # Create a copy so the original DataFrame is not modified
    df_copy = df.copy()

    # Helper: parse a value exactly once; numeric-looking strings become floats,
    # everything else is handed back unchanged.
    def parse_if_numeric(x):
        if not isinstance(x, str):
            return x
        try:
            return float(x)
        except ValueError:
            return x

    parsed = df_copy[column].apply(parse_if_numeric)

    # After parsing, any value that is still a string is non-numeric by
    # construction, so a plain isinstance check is enough. The mask is computed
    # once and reused. astype(bool) keeps the mask boolean for empty input.
    is_text = parsed.apply(lambda x: isinstance(x, str)).astype(bool)

    # Text column: keep the textual entries, use the string "NaN" everywhere else.
    # Casting to object first lets string fillers sit next to any original dtype.
    df_copy[f"{column}_str"] = parsed.astype(object).where(is_text, "NaN")

    # Replace non-numeric strings in the main column with -1. apply() is kept
    # here so pandas re-infers the dtype of the cleaned column.
    df_copy[column] = parsed.apply(lambda x: -1 if isinstance(x, str) else x)

    return df_copy

In [4]:
# Split "Value" into a numeric column (textual entries become -1) plus a
# "Value_str" column holding those textual entries; df itself is left unchanged.
value_sep_df = separate_textual_attrivutes_from_numerical_column(df, "Value")

In [5]:
# Count how often each textual entry occurs. Rows without a textual value
# appear under the filler string "NaN".
value_sep_df["Value_str"].value_counts()

Value_str
NaN              214500000
Calc Failed         231627
Bad Input            98155
Failed               47516
No Result            47516
Out of Serv          43917
Bad                  19920
Scan Timeout          1452
Comm Fail              694
I/O Timeout            208
Doubtful                41
Not Connect             34
Pt Created              12
Invalid Float            8
Scan Off                 2
Name: count, dtype: int64

In [6]:
def convert_string_column_to_datetime_inplace(
    df: pd.DataFrame,
    column: str,
    fmt_with_frac: str = "%Y-%m-%d %H:%M:%S.%f",
    out_col: str | None = None,
    strip_trailing_000: bool = True,
) -> pd.DataFrame:
    """
    Converts a string-based timestamp column to datetime (in-place result).
    - If 'strip_trailing_000' is True, values ending with '.000' are trimmed
      and parsed without fractional seconds.
    - All other values are parsed using the provided 'fmt_with_frac' format.
    - Invalid entries are converted to NaT.
    """

    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in the DataFrame.")
    out_col = out_col or f"{column}_DT"

    # Convert column to string once (avoids dtype mix issues)
    s = df[column].astype(str)
    # Prepare result column
    result = pd.Series(pd.NaT, index=df.index, dtype="datetime64[ns]")

    if strip_trailing_000:
        # Identify entries that end with '.000'
        mask000 = s.str.endswith(".000")

        # Fast path: remove '.000' and parse without microseconds
        if mask000.any():
            result.loc[mask000] = pd.to_datetime(
                s.loc[mask000].str[:-4],
                format="%Y-%m-%d %H:%M:%S",
                errors="coerce",
            )

        # Remaining values: parse using the full format (with %f)
        other = ~mask000
        if other.any():
            result.loc[other] = pd.to_datetime(
                s.loc[other],
                format=fmt_with_frac,
                errors="coerce",
            )
    else:
        # Single-pass parsing if we do not strip trailing zeros
        result[:] = pd.to_datetime(s, format=fmt_with_frac, errors="coerce")

    # Assign result column directly into the same DataFrame (in-place)
    df[out_col] = result
    return df


In [7]:
# Parse "EventTime" into a new "EventTime_DT" datetime column. The function
# writes into value_sep_df and returns it, so dt_df refers to the same object.
dt_df = convert_string_column_to_datetime_inplace(value_sep_df, "EventTime", "%Y-%m-%d %H:%M:%S.%f")

In [8]:
# Inspect the parsed datetime column.
dt_df["EventTime_DT"]

0           2024-01-02 20:03:46
1           2024-01-02 16:00:12
2           2024-01-02 11:56:42
3           2024-01-02 07:53:11
4           2024-01-02 03:49:45
                    ...        
214991097   2024-06-29 15:57:58
214991098   2024-06-29 15:35:41
214991099   2024-06-29 15:35:41
214991100   2024-06-29 15:35:41
214991101   2024-06-29 15:35:41
Name: EventTime_DT, Length: 214991102, dtype: datetime64[ns]

In [9]:
def one_hot_encode(df: pd.DataFrame, column: str, sparse: bool = False) -> pd.DataFrame:
    """
    One-hot encode a single column of ``df`` via ``pd.get_dummies``.

    Parameters:
        df: input DataFrame.
        column: name of the column to encode.
        sparse: forwarded to ``pd.get_dummies``.

    Returns:
        The DataFrame produced by ``pd.get_dummies`` for that one column.
    """
    columns_to_encode = [column]
    encoded = pd.get_dummies(data=df, columns=columns_to_encode, sparse=sparse)
    return encoded

In [10]:
# One-hot encode "Status" into Status_* indicator columns.
ohe_df = one_hot_encode(dt_df, "Status")

In [11]:
# Display the encoded frame.
ohe_df

Unnamed: 0,TagName,EventTime,Value,Value_str,EventTime_DT,Status_Bad,Status_Good,Status_Questionable,"Status_Substituted, Good"
0,A2PS64V0J.:ZUX09R,2024-01-02 20:03:46.000,0.34,,2024-01-02 20:03:46,False,True,False,False
1,A2PS64V0J.:ZUX09R,2024-01-02 16:00:12.000,0.15,,2024-01-02 16:00:12,False,True,False,False
2,A2PS64V0J.:ZUX09R,2024-01-02 11:56:42.000,0.13,,2024-01-02 11:56:42,False,True,False,False
3,A2PS64V0J.:ZUX09R,2024-01-02 07:53:11.000,0.12,,2024-01-02 07:53:11,False,True,False,False
4,A2PS64V0J.:ZUX09R,2024-01-02 03:49:45.000,0.13,,2024-01-02 03:49:45,False,True,False,False
...,...,...,...,...,...,...,...,...,...
214991097,SHRQHC:2VR_XN7_V,2024-06-29 15:57:58.000,-1.00,Calc Failed,2024-06-29 15:57:58,True,False,False,False
214991098,Ceia3wAoRc35lrPLMr1.lP.xSvRDPiw: K0Fde.sprEeT,2024-06-29 15:35:41.000,-1.00,No Result,2024-06-29 15:35:41,True,False,False,False
214991099,.EF.KiPDrteoLvpar5Td3eePRLd3RRctC0cai:PS.1,2024-06-29 15:35:41.000,-1.00,Failed,2024-06-29 15:35:41,True,False,False,False
214991100,:oCLE1FqT.3pc5.e0PRK.TriS3RP,2024-06-29 15:35:41.000,-1.00,No Result,2024-06-29 15:35:41,True,False,False,False


In [14]:
def add_time_features(df: pd.DataFrame, datetime_col: str) -> pd.DataFrame:
    """
    Return a copy of ``df`` enriched with calendar features of ``datetime_col``.

    When the column is not already of a datetime dtype it is parsed first,
    trying increasingly lenient strategies: ISO 8601, then per-element format
    inference ("mixed"), and finally generic parsing in which unparseable
    entries become NaT. In the returned copy the column holds the parsed values.

    Columns added (each prefixed with ``<datetime_col>_``):
      - ``day``:     day of the month
      - ``week``:    ISO calendar week, nullable Int16
      - ``seconds``: seconds elapsed since midnight, nullable Int32
      - ``month``:   ordered categorical "January" .. "December"
      - ``weekday``: ordered categorical "Monday" .. "Sunday" (0=Mon .. 6=Sun)

    Raises:
        KeyError: if ``datetime_col`` is not a column of ``df``.
    """
    if datetime_col not in df.columns:
        raise KeyError(f"Column '{datetime_col}' not found in DataFrame.")

    month_labels = [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    ]
    weekday_labels = [
        "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
    ]

    def _to_ordered_categorical(zero_based, labels):
        # Missing positions are mapped to code -1, which from_codes treats as
        # "no category", so no per-row label strings are allocated.
        nullable_codes = zero_based.astype("Int8")
        filled_codes = nullable_codes.where(nullable_codes.notna(), other=-1)
        return pd.Categorical.from_codes(
            codes=filled_codes.astype("int8"),
            categories=labels,
            ordered=True,
        )

    enriched = df.copy()

    # Parse only when needed: try the strict formats in order and fall back to
    # coercion (bad rows -> NaT) if every one of them raised.
    if not pd.api.types.is_datetime64_any_dtype(enriched[datetime_col]):
        for parse_format in ("ISO8601", "mixed"):
            try:
                enriched[datetime_col] = pd.to_datetime(enriched[datetime_col], format=parse_format)
            except Exception:
                continue
            break
        else:
            enriched[datetime_col] = pd.to_datetime(enriched[datetime_col], errors="coerce")

    stamps = enriched[datetime_col].dt

    # Plain numeric parts
    enriched[f"{datetime_col}_day"] = stamps.day
    iso_week = stamps.isocalendar().week
    enriched[f"{datetime_col}_week"] = iso_week.astype("Int16")

    # Seconds since midnight; NaT rows end up as <NA>
    seconds_of_day = stamps.hour * 3600 + stamps.minute * 60 + stamps.second
    enriched[f"{datetime_col}_seconds"] = seconds_of_day.astype("Int32")

    # Month and weekday names as ordered categoricals
    enriched[f"{datetime_col}_month"] = _to_ordered_categorical(stamps.month - 1, month_labels)
    enriched[f"{datetime_col}_weekday"] = _to_ordered_categorical(stamps.dayofweek, weekday_labels)

    return enriched


In [15]:
# Derive calendar features from "EventTime". add_time_features parses the
# column if it is not already datetime and returns a copy in which
# "EventTime" holds the parsed values.
# NOTE(review): this rebinds dt_df, which previously named the frame returned
# by the datetime-conversion cell, and it targets "EventTime" although
# "EventTime_DT" already holds parsed timestamps. Confirm whether passing
# "EventTime_DT" was intended.
dt_df = add_time_features(ohe_df, "EventTime")
dt_df

Unnamed: 0,TagName,EventTime,Value,Value_str,EventTime_DT,Status_Bad,Status_Good,Status_Questionable,"Status_Substituted, Good",EventTime_day,EventTime_week,EventTime_seconds,EventTime_month,EventTime_weekday
0,A2PS64V0J.:ZUX09R,2024-01-02 20:03:46,0.34,,2024-01-02 20:03:46,False,True,False,False,2,1,72226,January,Tuesday
1,A2PS64V0J.:ZUX09R,2024-01-02 16:00:12,0.15,,2024-01-02 16:00:12,False,True,False,False,2,1,57612,January,Tuesday
2,A2PS64V0J.:ZUX09R,2024-01-02 11:56:42,0.13,,2024-01-02 11:56:42,False,True,False,False,2,1,43002,January,Tuesday
3,A2PS64V0J.:ZUX09R,2024-01-02 07:53:11,0.12,,2024-01-02 07:53:11,False,True,False,False,2,1,28391,January,Tuesday
4,A2PS64V0J.:ZUX09R,2024-01-02 03:49:45,0.13,,2024-01-02 03:49:45,False,True,False,False,2,1,13785,January,Tuesday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214991097,SHRQHC:2VR_XN7_V,2024-06-29 15:57:58,-1.00,Calc Failed,2024-06-29 15:57:58,True,False,False,False,29,26,57478,June,Saturday
214991098,Ceia3wAoRc35lrPLMr1.lP.xSvRDPiw: K0Fde.sprEeT,2024-06-29 15:35:41,-1.00,No Result,2024-06-29 15:35:41,True,False,False,False,29,26,56141,June,Saturday
214991099,.EF.KiPDrteoLvpar5Td3eePRLd3RRctC0cai:PS.1,2024-06-29 15:35:41,-1.00,Failed,2024-06-29 15:35:41,True,False,False,False,29,26,56141,June,Saturday
214991100,:oCLE1FqT.3pc5.e0PRK.TriS3RP,2024-06-29 15:35:41,-1.00,No Result,2024-06-29 15:35:41,True,False,False,False,29,26,56141,June,Saturday
