In [1]:
### --- System and Path --- ###
import os
import sys
# REPO_PATH is the absolute form of '..', i.e. the parent of the current working
# directory (os.path.join with a single argument returns it unchanged).
# Presumably the notebook lives one level below the repository root -- confirm
# before running it from another location.
REPO_PATH = os.path.abspath(os.path.join('..')) # depends on this specific directory structure
# Append the path to sys.path only once, so re-running the cell does not add duplicates.
if REPO_PATH not in sys.path:
    sys.path.append(REPO_PATH)
import warnings
# NOTE(review): this silences every warning category for the whole session.
warnings.filterwarnings('ignore')

### --- Data Manipulation --- ###
import pandas as pd
import numpy as np
# Seed NumPy's global random state.
np.random.seed(42)

### --- Visualization --- ###
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8-whitegrid')
# plt.rcParams["font.family"] = "tahoma" # TH font

import seaborn as sns
# Applied after plt.style.use above; statement order is kept as-is.
sns.set_theme(style="whitegrid")


In [2]:
# Load the training table. The path is assembled with os.path.join instead of
# concatenating a hard-coded '/' separator onto REPO_PATH.
df_train = pd.read_csv(os.path.join(REPO_PATH, 'data', 'table_FUNDRAISE_R2_TRAIN.csv'))

# Quality check

In [3]:
def inconsistent(df):
    """Return a copy of ``df`` with the gift-history columns parsed as numbers.

    The columns 'Recency', 'Frequency', 'Seniority', 'TotalGift', 'MinGift'
    and 'MaxGift' have surrounding whitespace stripped from their string
    values and are then converted with ``pd.to_numeric(errors='coerce')``,
    so every value that cannot be parsed becomes missing.

    Columns that are already numeric are passed straight to ``pd.to_numeric``,
    which makes the function safe to apply more than once (a bare
    ``.str.strip()`` raises on a non-string column).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all six columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unmodified.

    Raises
    ------
    KeyError
        If one of the six columns is missing from ``df``.
    """
    inconsistent_cols = ['Recency', 'Frequency', 'Seniority', 'TotalGift', 'MinGift', 'MaxGift']

    # Work on a copy so the caller's frame is not silently changed.
    df = df.copy()

    for col in inconsistent_cols:
        values = df[col]
        if not pd.api.types.is_numeric_dtype(values):
            # Strip only real strings; other objects (None, NaN, stray
            # numbers) are handed to to_numeric untouched.
            values = values.map(lambda v: v.strip() if isinstance(v, str) else v)
        # Unparseable entries are coerced to missing values.
        df[col] = pd.to_numeric(values, errors='coerce')

    return df

def asdtype(df):
    """Cast the columns of ``df`` to the dtypes used in this notebook.

    Every column named in the groups below must be present in ``df``.
    The columns 'GaveThisYear' and 'AmtThisYear' are optional: each is
    cast (to bool and float respectively) only when ``df`` has it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns listed in the groups below.

    Returns
    -------
    pandas.DataFrame
        The result of ``df.astype`` with the assembled column -> dtype mapping.
    """
    # Columns grouped by their target dtype.
    text_cols = ('ID', 'LastName', 'FirstName', 'Education', 'City')
    flag_cols = ('Contact', 'Woman', 'GaveLastYear')
    int_cols = ('Age', 'Salary', 'SeniorList', 'NbActivities', 'Referrals', 'AmtLastYear')
    # 'Int64' is the nullable integer type, so these columns may hold missing values.
    nullable_int_cols = ('Recency', 'Frequency', 'Seniority', 'TotalGift', 'MinGift', 'MaxGift')

    schema = {}
    schema.update(dict.fromkeys(text_cols, object))
    schema.update(dict.fromkeys(flag_cols, bool))
    schema.update(dict.fromkeys(int_cols, int))
    schema.update(dict.fromkeys(nullable_int_cols, 'Int64'))

    # Optional columns: include them in the cast only when the frame has them.
    for optional_col, kind in (('GaveThisYear', bool), ('AmtThisYear', float)):
        if optional_col in df.columns:
            schema[optional_col] = kind

    return df.astype(schema)

In [4]:
# Parse the gift-history columns to numbers first, then apply the column dtypes.
df_train = df_train.pipe(inconsistent).pipe(asdtype)

In [5]:
# Count rows that are exact repeats of an earlier row once the ID column is ignored.
n_duplicates = df_train.drop(columns=['ID']).duplicated().sum()
print("#Duplicate:", n_duplicates)

#Duplicate: 0


In [6]:
# Number of missing values in each column.
df_train.isna().sum()

ID                   0
Contact              0
GaveThisYear         0
AmtThisYear          0
LastName            23
FirstName            4
Woman                0
Age                  0
Salary               0
Education            0
City                 0
SeniorList           0
NbActivities         0
Referrals            0
Recency         663666
Frequency       663666
Seniority       663666
TotalGift       663666
MinGift         663666
MaxGift         663666
GaveLastYear         0
AmtLastYear          0
dtype: int64

# Univariate analysis

In [17]:
# # plot scatter of AmtThisYear
# plt.figure(figsize=(15, 10))
# sns.scatterplot(data=df_train, x='ID', y='AmtThisYear')
# plt.title('Scatter plot of AmtThisYear')
# plt.show()

In [18]:
# # plot distribution of AmtThisYear
# plt.figure(figsize=(15, 10))
# sns.histplot(data=df_train, x='AmtThisYear', bins=100)
# plt.title('Distribution of AmtThisYear')
# plt.show()

# Bivariate analysis

In [35]:
# # plot bar of target vs categorical columns
# categorical_cols = ['Contact', 'City', 'Woman', 'Education', 'GaveLastYear']

# target = 'GaveThisYear'
# for col in categorical_cols:
#     plt.figure(figsize=(15, 10))
#     sns.countplot(data=df_train, x=col, hue=target)
#     plt.title(f'Count plot of {col} vs {target}')
#     plt.show()