In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Read the raw records from the CSV file into the working frame.
df = pd.read_csv(filepath_or_buffer='data/test_data.csv')
# Unix timestamps are only accepted when they fall inside this window.
START_DATE = pd.to_datetime('1990-01-01')
END_DATE = pd.to_datetime('2050-01-01')

# Candidate string layouts, tried in order; the first one that parses wins.
STRING_FORMATS = [
    '%Y-%m-%d %H:%M:%S',
    '%d-%m-%Y',
    # 12-hour clock. strptime only applies the AM/PM marker (%p) when the hour
    # is parsed with %I, so this variant must be tried before the %H one.
    '%m/%d/%Y %I:%M %p',
    # Fallback for 24-hour values that still carry an AM/PM marker
    # (e.g. "00:00 AM", "13:00 PM"), which %I rejects. %p is ignored here.
    '%m/%d/%Y %H:%M %p',
]

def datetime_parser(date_value):
    """
    Parse a single value that may be a Unix timestamp (in seconds) or a
    date string in one of ``STRING_FORMATS``.

    Parameters
    ----------
    date_value : object
        The raw value. Strings are stripped of surrounding whitespace before
        any parsing is attempted.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed timestamp. ``pd.NaT`` is returned when the value is
        neither a number of seconds that lands between ``START_DATE`` and
        ``END_DATE`` (inclusive) nor a string matching one of the formats.
    """
    if isinstance(date_value, str):
        # Padding would make every strptime format fail, so remove it first.
        date_value = date_value.strip()
        if not date_value:
            return pd.NaT

    # First interpretation: seconds since the Unix epoch.
    numeric_val = pd.to_numeric(date_value, errors='coerce')
    if pd.notna(numeric_val):
        try:
            ts = pd.to_datetime(numeric_val, unit='s')
            if START_DATE <= ts <= END_DATE:
                return ts
        except (ValueError, pd.errors.OutOfBoundsDatetime):
            # Not representable as a timestamp; fall through to the string formats.
            pass

    # Second interpretation: one of the known string layouts.
    if isinstance(date_value, str):
        for fmt in STRING_FORMATS:
            try:
                return pd.to_datetime(date_value, format=fmt)
            except (ValueError, TypeError):
                continue

    return pd.NaT

# Parse the raw timestamp values; anything datetime_parser cannot interpret
# comes back as NaT.
df['timestamp'] = df['timestamp'].apply(datetime_parser)

# Trim and lower-case every text column BEFORE ids are grouped or compared,
# so spellings that differ only by case or padding count as the same id.
string_cols = list(df.select_dtypes('object').columns)
for col in string_cols:
    df[col] = df[col].str.strip().str.lower()

# Rows without an organisation or a user are dropped.
df = df.dropna(subset=['org_id', 'user_id'])

# Keep only the rows that belong to each user's first organisation.
# sort_values places NaT last by default, so "first" is the row with the
# earliest successfully parsed timestamp.
df = df.sort_values(['user_id', 'timestamp'])
first_orgs = df.groupby('user_id')['org_id'].transform('first')
df = df[df['org_id'] == first_orgs].copy()  # copy: the frame is assigned to below

# Fill the remaining gaps with fixed defaults. This happens after the
# first-organisation filter so placeholder timestamps cannot influence it.
df['timestamp'] = df['timestamp'].fillna(pd.to_datetime('1990-01-01 00:00:00'))
df['credits'] = df['credits'].fillna(1.0)
df['credit_type'] = df['credit_type'].fillna('default')

# Final layout: text columns first, then credits and timestamp.
df = df[string_cols + ['credits', 'timestamp']]
df = df.sort_values(['org_id', 'user_id'])

df

Unnamed: 0,org_id,user_id,credit_type,action,credits,timestamp
121,org_000,user_001,report_download,deduct,58.0,2024-10-03 00:00:00
141,org_000,user_001,data_export,add,10.0,2024-10-08 12:00:00
341,org_000,user_001,data_export,add,17.0,2024-10-13 00:00:00
181,org_000,user_001,api_call,deduct,62.0,2024-10-14 12:00:00
221,org_000,user_001,report_download,deduct,86.0,2024-10-16 00:00:00
...,...,...,...,...,...,...
669,org_009,user_009,api_call,add,88.0,1990-01-01 00:00:00
709,org_009,user_009,data_export,deduct,20.0,1990-01-01 00:00:00
749,org_009,user_009,report_download,deduct,63.0,1990-01-01 00:00:00
769,org_009,user_009,report_download,add,18.0,1990-01-01 00:00:00


In [3]:
# df['user_id'] = df['user_id'].fillna('user_999')
# df

In [4]:
# df = df.drop_duplicates(subset=['org_id', 'user_id', 'credit_type', 'timestamp'], keep='first')
# df

In [5]:
# Re-apply the "first organisation per user" filter on the cleaned frame.
df = df.sort_values(['user_id', 'timestamp'])

# Missing timestamps were replaced by this placeholder earlier in the notebook.
# Placeholder rows sort first, so they must not decide which organisation
# counts as a user's first one.
placeholder_ts = pd.to_datetime('1990-01-01 00:00:00')
dated_org_ids = df['org_id'].where(df['timestamp'] != placeholder_ts)

# 'first' skips nulls, so this is the organisation of the earliest dated row.
first_orgs = dated_org_ids.groupby(df['user_id']).transform('first')
# Users with no dated rows at all fall back to their first row.
first_orgs = first_orgs.fillna(df.groupby('user_id')['org_id'].transform('first'))

df = df[df['org_id'] == first_orgs]
df

Unnamed: 0,org_id,user_id,credit_type,action,credits,timestamp
600,org_001,user_000,default,deduct,82.0,1990-01-01 00:00:00
700,org_001,user_000,data_export,deduct,35.0,1990-01-01 00:00:00
740,org_001,user_000,api_call,deduct,31.0,1990-01-01 00:00:00
760,org_001,user_000,data_export,add,27.0,1990-01-01 00:00:00
800,org_001,user_000,api_call,add,47.0,1990-01-01 00:00:00
...,...,...,...,...,...,...
799,org_004,user_019,api_call,add,66.0,2024-12-22 00:00:00
419,org_004,user_019,data_export,deduct,69.0,2024-12-23 00:00:00
519,org_004,user_019,api_call,deduct,86.0,2024-12-23 12:00:00
859,org_004,user_019,report_download,add,74.0,2024-12-27 12:00:00


In [6]:
# Show the dtype of each column of the working frame.
df.dtypes

org_id                 object
user_id                object
credit_type            object
action                 object
credits               float64
timestamp      datetime64[ns]
dtype: object

In [13]:
# Distinct organisation ids: print how many there are, then display them.
org_ids = df['org_id'].unique()
print(len(org_ids))
org_ids

10


array(['org_001', 'org_000', 'org_002', 'org_003', 'org_004', 'org_005',
       'org_006', 'org_007', 'org_008', 'org_009'], dtype=object)

In [14]:
# Distinct user ids: print how many there are, then display them.
user_ids = df['user_id'].unique()
print(len(user_ids))
user_ids

20


array(['user_000', 'user_001', 'user_002', 'user_003', 'user_004',
       'user_005', 'user_006', 'user_007', 'user_008', 'user_009',
       'user_010', 'user_011', 'user_012', 'user_013', 'user_014',
       'user_015', 'user_016', 'user_017', 'user_018', 'user_019'],
      dtype=object)