In [None]:
# Load IPython's autoreload extension and switch it to mode 2, so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import re

import dill
import pandas as pd

import unimib_snowit_project.utils as u

# Setup

In [None]:
# Base Params

# Directory (relative to the project root) holding the raw CSV exports.
DATA_IN_DIR = "data_input"

# One raw CSV per entity.
USERS_IN_FILENAME = "users.csv"
PROFILES_IN_FILENAME = "profiles.csv"
CARDS_IN_FILENAME = "cards.csv"
ORDERS_IN_FILENAME = "orders.csv"
ORDER_DETAILS_IN_FILENAME = "order_details.csv"

# Directory (relative to the project root) receiving the pickled, cleaned frames.
DATA_PKL_DIR = "data_loaded"

# One pickle per entity.
USERS_PKL_FILENAME = "users.pkl"
PROFILES_PKL_FILENAME = "profiles.pkl"
CARDS_PKL_FILENAME = "cards.pkl"
ORDERS_PKL_FILENAME = "orders.pkl"
ORDER_DETAILS_PKL_FILENAME = "order_details.pkl"

# Literal strings treated as missing values by the typed `read_csv` calls.
# NOTE: the bare token 'NA' is deliberately NOT part of this list.
NA_VALUES = [
    # empty / blank cells
    "", " ", '""',
    # "not available" spellings
    "#N/A", "#N/A N/A", "#NA", "N/A", "<NA>", "n/a",
    # indeterminate-number markers
    "-1.#IND", "1.#IND",
    # NaN spellings
    "-1.#QNAN", "-NaN", "-nan", "-NAN", "1.#QNAN", "NaN", "nan", "NAN",
    # null spellings
    "NULL", "Null", "null",
    # none spellings
    "NONE", "None", "none",
]

In [None]:
# Base paths

# Project root as reported by the project's utils module.
root_dir_path = u.get_root_dir()

# Raw CSV inputs: <root>/<DATA_IN_DIR>/<file>
data_in_dir_path = root_dir_path / DATA_IN_DIR
users_in_path = data_in_dir_path / USERS_IN_FILENAME
profiles_in_path = data_in_dir_path / PROFILES_IN_FILENAME
cards_in_path = data_in_dir_path / CARDS_IN_FILENAME
orders_in_path = data_in_dir_path / ORDERS_IN_FILENAME
order_details_in_path = data_in_dir_path / ORDER_DETAILS_IN_FILENAME

# Pickled outputs: <root>/<DATA_PKL_DIR>/<file>
data_pkl_dir_path = root_dir_path / DATA_PKL_DIR
users_pkl_path = data_pkl_dir_path / USERS_PKL_FILENAME
profiles_pkl_path = data_pkl_dir_path / PROFILES_PKL_FILENAME
cards_pkl_path = data_pkl_dir_path / CARDS_PKL_FILENAME
orders_pkl_path = data_pkl_dir_path / ORDERS_PKL_FILENAME
order_details_pkl_path = data_pkl_dir_path / ORDER_DETAILS_PKL_FILENAME

# Load

## Load Users

In [None]:
# Raw exploratory read of the users CSV: every column as string, and no value
# is converted to NA (default NA markers off, empty custom list).
safeload_users_df = pd.read_csv(
    users_in_path,
    keep_default_na=False,
    na_values=[],
    dtype='string',
)

In [None]:
# Show the column names found in the raw users CSV.
safeload_users_df.columns

In [None]:
# col_to_check = 'favouriteZones'
# safeload_users_df[col_to_check].drop_duplicates()

In [None]:
# Read and fix
# Typed read of the users CSV: explicit nullable dtypes per column, with only
# the strings in NA_VALUES interpreted as missing.
users_df = pd.read_csv(
    users_in_path,
    dtype={
        'user.uid': 'string',
        'createdAt': 'string',
        'source': 'string',
        'isAnonymous': 'boolean',
        'referralsCount': 'Int64',
        'city': 'string',
        'language': 'string',
        'googleId': 'boolean',
        'appleId': 'boolean',
        'facebookId': 'boolean',
        'referral.medium': 'string',
        'referral.source': 'string',
        'referral.type': 'Int64',
        'favouriteZones': 'string',
    },
    na_values=NA_VALUES,
    keep_default_na=False,
)

# createdAt is read as text above and parsed into datetimes here.
users_df['createdAt'] = pd.to_datetime(users_df['createdAt'])

# Free-text columns: pass every non-null value through u.clean_str(..., 'lower');
# missing values become None.
for text_col in ('city', 'referral.medium', 'referral.source'):
    users_df[text_col] = users_df[text_col].apply(
        lambda raw: None if pd.isnull(raw) else u.clean_str(raw, 'lower')
    )

# favouriteZones: convert the string via u.get_list_from_str; missing values
# become an empty list.
users_df['favouriteZones'] = users_df['favouriteZones'].apply(
    lambda raw: [] if pd.isnull(raw) else u.get_list_from_str(raw)
)

In [None]:
# CHECK PK VALIDITY

# SELECT count(1) as num_rows
# FROM users_df
# WHERE user.uid IS NULL

# Number of user rows whose primary key is missing.
display(
    len(users_df[users_df['user.uid'].isnull()])
)

# SELECT user.uid, count(1) as num_rows
# FROM users_df
# GROUP BY user.uid
# HAVING num_rows > 1

# Primary-key values that occur on more than one row (NA is kept as its own group).
display(
    users_df
    .groupby(['user.uid'], dropna=False)
    .size()
    .to_frame('num_rows')
    .loc[lambda counts: counts['num_rows'] > 1]
)

## Load Profiles

In [None]:
# Raw exploratory read of the profiles CSV: every column as string, and no
# value is converted to NA (default NA markers off, empty custom list).
safeload_profiles_df = pd.read_csv(
    profiles_in_path,
    keep_default_na=False,
    na_values=[],
    dtype='string',
)

In [None]:
# Show the column names found in the raw profiles CSV.
safeload_profiles_df.columns

In [None]:
# col_to_check = 'types'
# safeload_profiles_df[col_to_check].drop_duplicates()

In [None]:
# Read and fix
# Typed read of the profiles CSV: explicit nullable dtypes per column, with
# only the strings in NA_VALUES interpreted as missing.
profiles_df = pd.read_csv(
    profiles_in_path,
    dtype={
        'user.uid': 'string',
        'profile.uid': 'string',
        'birthday': 'string',
        'sex': 'string',
        'city': 'string',
        'height': 'Float64',
        'weight': 'Float64',
        'skibootsSize': 'Float64',
        'level': 'string',
        'types': 'string',
    },
    na_values=NA_VALUES,
    keep_default_na=False,
)

# birthday is read as text above and parsed into datetimes here.
profiles_df['birthday'] = pd.to_datetime(profiles_df['birthday'])

def clean_profile_sex(sex: str) -> str | None:
    clean = u.clean_str(sex, 'upper')
    if clean in ['M', 'F']:
        return clean
    elif clean == ['UOMO', 'MASCHIO']:
        return 'M'
    elif clean == ['DONNA', 'FEMMINA']:
        return 'F'
    else:
        None
# sex: map every non-null value through clean_profile_sex; missing stays None.
profiles_df['sex'] = profiles_df['sex'].apply(
    lambda raw: None if pd.isnull(raw) else clean_profile_sex(raw)
)

# city: pass every non-null value through u.clean_str(..., 'lower'); missing stays None.
profiles_df['city'] = profiles_df['city'].apply(
    lambda raw: None if pd.isnull(raw) else u.clean_str(raw, 'lower')
)

# types: convert the string via u.get_list_from_str; missing values become
# an empty list.
profiles_df['types'] = profiles_df['types'].apply(
    lambda raw: [] if pd.isnull(raw) else u.get_list_from_str(raw)
)

In [None]:
# CHECK PK VALIDITY

# SELECT count(1) as num_rows
# FROM profiles_df
# WHERE profile.uid IS NULL

# Number of profile rows whose primary key is missing.
display(
    len(profiles_df[profiles_df['profile.uid'].isnull()])
)

# SELECT profile.uid, count(1) as num_rows
# FROM profiles_df
# GROUP BY profile.uid
# HAVING num_rows > 1

# Primary-key values that occur on more than one row (NA is kept as its own group).
display(
    profiles_df
    .groupby(['profile.uid'], dropna=False)
    .size()
    .to_frame('num_rows')
    .loc[lambda counts: counts['num_rows'] > 1]
)

In [None]:
# CHECK FK VALIDITY

# SELECT
#   B.in_users,
#   count(1) as num_rows
# FROM
#   (SELECT DISTINCT user.uid
#   FROM profiles_df
#   WHERE user.uid IS NOT NULL) AS A
#   LEFT JOIN
#   (SELECT user.uid, 1.0 AS in_users
#   FROM users_df) AS B
#   ON A.user.uid = B.user.uid
# GROUP BY B.in_users

# Count the distinct, non-null profile user.uid values by whether they match a
# row in users_df (in_users == 1.0) or not (in_users is NaN).
# No `num_rows > 1` filter here: it would hide the NaN group whenever exactly
# one user.uid is orphaned, making a broken foreign key look valid.
(profiles_df
 [['user.uid']]
 .loc[lambda tbl: tbl['user.uid'].notnull()]
 .drop_duplicates()
 .merge(users_df[['user.uid']].assign(in_users=1.0),
        how='left',
        on='user.uid'
        )
 .assign(aux=1.0)
 .groupby(['in_users'], dropna=False)
 .agg(num_rows=('aux', pd.Series.count))
)

In [None]:
# Distinct, non-null profile user.uid values that have no match in users_df
# (the left join leaves in_users empty for them).
(profiles_df
 .loc[profiles_df['user.uid'].notna(), ['user.uid']]
 .drop_duplicates()
 .merge(users_df[['user.uid']].assign(in_users=1.0),
        on='user.uid',
        how='left')
 .loc[lambda linked: linked['in_users'].isna()]
)

In [None]:
# user.uid values referenced by profiles but absent from users_df.
profile_fail_useruids = (
    profiles_df
    .loc[profiles_df['user.uid'].notna(), ['user.uid']]
    .drop_duplicates()
    .merge(users_df[['user.uid']].assign(in_users=1.0),
           on='user.uid',
           how='left')
    .loc[lambda linked: linked['in_users'].isna(), 'user.uid']
)

display(profile_fail_useruids)

# Every profile row that points at one of those unknown users.
profile_fail_useruid_df = profiles_df[profiles_df['user.uid'].isin(profile_fail_useruids)]

display(profile_fail_useruid_df)

In [None]:
# FIX FK ISSUE
# Remove the profile rows whose user.uid is not present in users_df.
# Reassign instead of dropping in place, and ignore labels that are already
# gone, so re-running this cell is a no-op rather than a KeyError.
profiles_df = profiles_df.drop(index=profile_fail_useruid_df.index, errors='ignore')

In [None]:
# Re-run the FK check after the fix: count the distinct, non-null profile
# user.uid values by whether they match a row in users_df (in_users == 1.0)
# or not (in_users is NaN).
# No `num_rows > 1` filter here: it would hide the NaN group whenever exactly
# one orphaned user.uid is left.
display(
    profiles_df
    [['user.uid']]
    .loc[lambda tbl: tbl['user.uid'].notnull()]
    .drop_duplicates()
    .merge(users_df[['user.uid']].assign(in_users=1.0),
            how='left',
            on='user.uid'
            )
    .assign(aux=1.0)
    .groupby(['in_users'], dropna=False)
    .agg(num_rows=('aux', pd.Series.count))
)

## Load Cards

In [None]:
# Raw exploratory read of the cards CSV: every column as string, and no value
# is converted to NA (default NA markers off, empty custom list).
safeload_cards_df = pd.read_csv(
    cards_in_path,
    keep_default_na=False,
    na_values=[],
    dtype='string',
)

In [None]:
# Show the column names found in the raw cards CSV.
safeload_cards_df.columns

In [None]:
# ...

In [None]:
# Read and fix
# TODO: placeholder — `...` is the Ellipsis object, not a DataFrame. Replace it
# with the typed read of cards_in_path; the save cell pickles `cards_df` as-is.
cards_df = ...

## Load Orders

In [None]:
# Raw exploratory read of the orders CSV: every column as string, and no value
# is converted to NA (default NA markers off, empty custom list).
safeload_orders_df = pd.read_csv(
    orders_in_path,
    keep_default_na=False,
    na_values=[],
    dtype='string',
)

In [None]:
# Show the column names found in the raw orders CSV.
safeload_orders_df.columns

In [None]:
# col_to_check = 'clientInfo'
# safeload_orders_df[col_to_check].drop_duplicates()

In [None]:
# Read and fix
# TODO: placeholder — `...` is the Ellipsis object, not a DataFrame. Replace it
# with the typed read of orders_in_path; the save cell pickles `orders_df` as-is.
orders_df = ...

## Load Order Details

In [None]:
# Raw exploratory read of the order-details CSV: every column as string, and
# no value is converted to NA (default NA markers off, empty custom list).
safeload_order_details_df = pd.read_csv(
    order_details_in_path,
    keep_default_na=False,
    na_values=[],
    dtype='string',
)

In [None]:
# Show the column names found in the raw order-details CSV.
safeload_order_details_df.columns

In [None]:
# ...

In [None]:
# Read and fix
# TODO: placeholder — `...` is the Ellipsis object, not a DataFrame. Replace it
# with the typed read of order_details_in_path; the save cell pickles
# `order_details_df` as-is.
order_details_df = ...

# Save

In [None]:
# Save Cleaned Dataset

# Create the output directory first: opening a file for writing inside a
# directory that does not exist raises FileNotFoundError.
data_pkl_dir_path.mkdir(parents=True, exist_ok=True)

# (label used in the log line, object to pickle, destination path)
for label, frame, pkl_path in (
    ('users', users_df, users_pkl_path),
    ('profiles', profiles_df, profiles_pkl_path),
    ('cards', cards_df, cards_pkl_path),
    ('orders', orders_df, orders_pkl_path),
    ('order details', order_details_df, order_details_pkl_path),
):
    # Serialise with dill; the context manager closes the handle even on error.
    with pkl_path.open('wb') as fh:
        dill.dump(frame, fh)
    print(f"Save {label} data in {pkl_path.as_posix()}")