<a href="https://colab.research.google.com/github/sangho24/insight-12th/blob/main/EC5320_2024_2_Week9a_messy_tabular_data_v2_for_students_20200572.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EC5320 Week9a codes: Working with messy tabular data

2024.10.29.<br>

Author: Hyunjoo Yang (hyang@sogang.ac.kr)<br><br>

This notebook uses Fastai and XGBoost to do classification with messy tabular data.<br><br>

Data source:<br>
https://docs.fast.ai/tutorial.tabular.html <br>
http://archive.ics.uci.edu/ml/datasets/Adult



# 기말고사 project
코랩 파일, 워드 파일

1. 문제의식
2. 뭘 할 거
3. 사용하는 머신러닝 모델
4. 데이터 전처리
5. 결과
6. 의미와 결론
7.

재미있는 아이디어, 어떤 데이터를 모았는지, motivation은 뭐였는지 -> 기술 측면과 아이디어!

- 중간고사 내용  
## part1)
1.4 augmentation 과적합을 막기 위해 쓸 수 있다 or 방향을 고려해야 하는 데이터의 경우 augmentation을 진행하면 안 된다.
1.5 epoch는 시간 제약에 따라 자유롭게 설정할 수 있음. But patience를 늘릴수록 global maximum 에 도달할 확률이 높음, 돌리기 전에는 알 수 없음.
1.8 accuracy 정도에 따라 결정하면 되는데, domain마다 다름. 도박같은 경우 50.1만 넘어도 쓸만한 모델일 것.
1.11 validation set의 accuracy가 떨어지는 부분이 있다면 overfitting sign

## part2)
주어진 data에 rotation이 많으면 XGBoost 모델 쓰기 어려움
2.9 CNN, XGBoost 모두 사용, class별 확률을 평균을 낸다...?



# 1. Prepare data

In [None]:
import fastai
print(fastai.__version__)

#from fastai.vision.all import *
#from fastai.text.all import *
#from fastai.collab import *
from fastai.tabular.all import *

from matplotlib.pyplot import imshow

In [None]:
""" upload adult.csv """

In [None]:
df = pd.read_csv('adult.csv', low_memory=False)
df.sample(10)

In [None]:
df.shape

In [None]:
# check missing values

print(df.isnull().sum())

# 2. Classification with tabular data (FASTAI version)

In [None]:
# Prepare the fastai tabular dataset.
# - procs: Categorify, FillMissing and Normalize are the preprocessing steps applied.
# - cat_names / cont_names: the categorical and continuous feature columns used.
# - y_names / y_block: 'salary' is the target and is treated as a category
#   (classification).
# - splits: RandomSplitter holds out 20% of the rows for validation. No seed is
#   passed here, so the split presumably differs between runs -- confirm if
#   reproducibility matters.

to = TabularPandas(df, procs=[Categorify, FillMissing, Normalize],
                   cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race'],
                   cont_names = ['age', 'education-num'],
                   y_names='salary',
                   y_block = CategoryBlock,
                   splits=RandomSplitter(valid_pct=0.2)(range_of(df))
                   )

In [None]:
len(to.train), len(to.valid)

In [None]:
to.show(3)

In [None]:
to.items.head(3)

In [None]:
to.classes

In [None]:
# check X variables (xs)
to.xs.head()

In [None]:
# check y variables (ys)
to.ys.head()

In [None]:
# load tabular data to dataloader
dls = to.dataloaders(bs=64)

In [None]:
# Train the fastai tabular model.

# beta passed to the FBeta metric below
beta = 2
# name of the metric that both callbacks track; alternatives listed on the right
what_to_monitor = 'fbeta_score' # precision_score, recall_score, fbeta_score

# Learner that reports accuracy, precision, recall and F-beta; to_fp16() enables
# mixed-precision training.
learn = tabular_learner(dls,
                        metrics=[accuracy, Precision(), Recall(), FBeta(beta)]).to_fp16()

# optional tabular_learner argument for custom hidden-layer sizes: layers=[500, 250]

# presumably makes SaveModelCallback write its files relative to the current
# working directory -- confirm
learn.path = Path('./')

# Train for at most 20 epochs. EarlyStoppingCallback stops after 5 epochs without
# improvement in the monitored metric; SaveModelCallback tracks the same metric
# to checkpoint the model.
learn.fit_one_cycle(20, cbs=[EarlyStoppingCallback(monitor=what_to_monitor, patience=5),
                                       SaveModelCallback(monitor=what_to_monitor)])

In [None]:
#learn.show_results()

In [None]:
# confusion matrix

fastai_interp = ClassificationInterpretation.from_learner(learn)
fastai_interp.plot_confusion_matrix()

In [None]:
# confusion matrix (normalized)

fastai_interp.plot_confusion_matrix(normalize=True)

In [None]:
# predict using an observation
df.iloc[0]

In [None]:
row, clas, probs = learn.predict(df.iloc[0])

In [None]:
row.show()

In [None]:
clas, probs

In [None]:
# Score every row of df: build a label-free copy of the data and wrap it in a
# test DataLoader that reuses the learner's preprocessing.

test_df = df.drop(columns=['salary'])
dl = learn.dls.test_dl(test_df)

In [None]:
# get predictions

learn.get_preds(dl=dl)

In [None]:
# export preprocessed dfs

X_train_fastai, y_train_fastai = to.train.xs, to.train.ys.values.ravel()
X_valid_fastai, y_valid_fastai = to.valid.xs, to.valid.ys.values.ravel()

In [None]:
X_train_fastai

# 3. Classification with XGBoost

## 3.1 install and import xgboost

In [None]:
import xgboost as xgb
from xgboost import cv

In [None]:
# set XGBoost classifier parameters (quick test model; all other settings are
# left at the XGBClassifier defaults)

my_random_seed = 128

# use early stopping: number of rounds passed as early_stopping_rounds
early_stop_rounds = 10

xgb_classify_test = xgb.XGBClassifier(random_state=my_random_seed,
                                 early_stopping_rounds=early_stop_rounds)

In [None]:
%%time

## train

xgb_classify_test.fit(X_train_fastai, y_train_fastai,
            eval_set=[(X_valid_fastai, y_valid_fastai)])

## 3.2 Prepare train, valid, test dataset

In [None]:
# shuffle data set
my_seed = 42

from sklearn.utils import shuffle
df_shuffled = shuffle(df, random_state=my_seed)
df_shuffled

In [None]:
# Choose how many rows go into the train / validation / test sets.

df_num_obs = df_shuffled.shape[0]

# Option 1: derive the sizes from fractions of the full sample (0.1, 0.3, 0.5, etc).
# np.fix rounds to the nearest integer towards zero.
#   num_train_samples = int(np.fix(df_num_obs * 0.6))
#   num_val_samples = int(np.fix(df_num_obs * 0.2))
#   num_test_samples = int(np.fix(df_num_obs * 0.2))

# Option 2: set fixed row counts.
num_train_samples = 20000
num_val_samples = 3000
num_test_samples = 3000

print(num_train_samples, num_val_samples, num_test_samples)

num_total_samples = num_train_samples + num_val_samples + num_test_samples
num_train_val_samples = num_train_samples + num_val_samples

# Positional slicing silently truncates when the requested rows run past the end
# of the data, which would yield smaller splits than the sizes printed above.
# Fail loudly instead.
if num_total_samples > df_num_obs:
    raise ValueError(
        'requested {} samples but the dataset only has {} rows'.format(
            num_total_samples, df_num_obs))


In [None]:
# Carve the shuffled frame into three consecutive, non-overlapping positional
# slices: train first, then validation, then test.
# (Alternative for the test set: take the last num_test_samples rows instead.)

df_train = df_shuffled.iloc[:num_train_samples]
df_valid = df_shuffled.iloc[num_train_samples:num_train_val_samples]
df_test = df_shuffled.iloc[num_train_val_samples:num_total_samples]

# sanity check: row counts of the three splits
print(*(len(part) for part in (df_train, df_valid, df_test)))

In [None]:
# Prepare the ground truth by converting the 'salary' labels ('<50k', '>=50k')
# to integer codes 0 and 1. The mapping is shown by le.classes_ below.

print(df['salary'].value_counts())

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Encode the labels in the row order of df_shuffled, not df. The feature splits
# (df_train / df_valid / df_test) are positional slices of df_shuffled and y is
# sliced with the same positions, so encoding the unshuffled df would pair each
# feature row with another row's label.
y = le.fit_transform(df_shuffled['salary'])

print('')
print('classes: {}'.format(le.classes_))
print('')

y

In [None]:
df['salary']

In [None]:
# Split y into train, validation and test sets using the same positional
# boundaries as the feature splits.
# (Alternative for the test labels: take the last num_test_samples entries.)

y_train = y[0:num_train_samples]

y_valid = y[num_train_samples:num_train_samples + num_val_samples]

y_test = y[num_train_samples + num_val_samples: num_total_samples]

# Report the sizes of the label arrays themselves, so a mismatch with the
# feature splits is actually visible.
print(y_train.shape[0], y_valid.shape[0], y_test.shape[0])

y_train

## 3.3 preprocess variables

In [None]:
df_train.head()

In [None]:
print(df.isnull().sum())

How to deal with missing values? <br>


1.   Drop missing observations (df.dropna(subset=["education-num"]))
2.   Replace missing values with some stat (e.g., median or mean) using simple imputer



In [None]:
numerical_vars = ['age', 'education-num']
categorical_vars = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']

In [None]:
# get the list of columns with missing values

cols_with_missing = [col for col in df_train.columns
                                 if df_train[col].isnull().any()]
cols_with_missing

In [None]:
# Data preprocessing pipeline for the XGBoost models.

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

numeric_vars = ['age', 'education-num']
categorical_vars = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']

# Numeric columns: fill missing values with the median (add_indicator=True also
# appends a missing-value indicator column for features that had missing values
# during fit), then standardize.
numeric_pipeline = Pipeline([
                             ('num imputer', SimpleImputer(strategy="median", add_indicator=True)),
                             ('std_scalder', StandardScaler())
                             ])

# Categorical columns: replace missing values with the constant 'Missing', then
# one-hot encode into a dense integer array.
# handle_unknown='ignore' makes a category that never appeared in the training
# slice encode as all zeros, instead of raising when the validation or test data
# is transformed.
cat_pipeline = Pipeline([
                         ('cat imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
                         ('cat 1hot', OneHotEncoder(sparse_output=False, dtype=int,
                                                    handle_unknown='ignore'))
])
# alternative: OrdinalEncoder

# Apply each sub-pipeline to its own column list. Columns not listed in either
# group are not passed to the transformers.
full_pipeline = ColumnTransformer([
                                   ("num", numeric_pipeline, numeric_vars),
                                   ("cat", cat_pipeline, categorical_vars)
                                   ])

In [None]:
df_train_prepared = pd.DataFrame(full_pipeline.fit_transform(df_train))
df_train_prepared

In [None]:
df_valid_prepared = pd.DataFrame(full_pipeline.transform(df_valid))
df_valid_prepared

## 3.4 train using xgboost

In [None]:
# set XGBoost classifier parameters

my_seed = 42

# number of rounds passed as early_stopping_rounds in params below
early_stop_rounds = 20


""" FOR REGRESSION
params = {'objective':'reg:squarederror', 'eval_metric':'rmse',
          'random_state':my_seed, 'nthread':-1, 'n_estimators':300
        }
xgb_reg = xgb.XGBRegressor(**params)
"""
# if gpu is used, add 'device':'cuda' below

# binary classification with logistic objective; 'error' is the evaluation
# metric and at most 300 trees are built
params = {'objective':'binary:logistic', 'eval_metric':'error',
          'random_state':my_seed, 'nthread':-1, 'n_estimators':300,
          'early_stopping_rounds':early_stop_rounds
          }

# for multiclass classification: 'objective':'multi:softprob' (or 'multi:softmax')
# with 'eval_metric':'merror'
# check: https://xgboost.readthedocs.io/en/stable/parameter.html

xgb_classify = xgb.XGBClassifier(**params)

#print(xgb_reg)

In [None]:
%%time

# train

xgb_classify.fit(df_train_prepared, y_train,
            eval_set=[(df_valid_prepared, y_valid)], )

In [None]:
# get validation set prediction results

y_pred_xgb = xgb_classify.predict(df_valid_prepared)
y_pred_xgb

In [None]:
# get validation set prediction results

y_pred_proba_xgb = xgb_classify.predict_proba(df_valid_prepared)
y_pred_proba_xgb

In [None]:
# confusion matrix plot

import itertools

def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Plot a confusion matrix as an annotated heat map.

    Arguments
    ---------
    cm:           confusion matrix, e.g. from sklearn.metrics.confusion_matrix;
                  rows are labelled as the true classes and columns as the
                  predicted classes (any array-like is accepted)
    target_names: class names used as tick labels, for example
                  ['high', 'medium', 'low']; pass None to keep default ticks
    title:        the text to display at the top of the matrix
    cmap:         the colormap passed to plt.imshow, e.g.
                  plt.get_cmap('jet') or plt.cm.Blues; 'Blues' is used when None
                  see http://matplotlib.org/examples/color/colormaps_reference.html
    normalize:    If False, plot the raw numbers
                  If True, plot each row as proportions of that row's total

    Notes
    -----
    The accuracy and misclassification rate shown under the x axis are always
    computed from the raw counts. When normalizing, a row whose total is zero
    is shown as zeros rather than NaN.

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """

    # Accept lists / nested sequences as well as ndarrays.
    cm = np.asarray(cm)

    accuracy = np.trace(cm) / np.sum(cm).astype('float')
    misclass = 1 - accuracy

    plt.figure(figsize=(12, 9))

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    if normalize:
        # Divide each row by its total; rows that sum to zero stay all-zero
        # instead of producing NaN cells (and a NaN colour threshold below).
        row_totals = cm.sum(axis=1)[:, np.newaxis].astype('float')
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # Cells above the threshold get white text so the annotation stays readable
    # on dark colours.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.2f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.3f}; misclass={:0.3f}'.format(accuracy, misclass))
    plt.show()

In [None]:
# confusion matrix of the validation labels (y_valid) against the XGBoost
# predictions (y_pred_xgb)

from sklearn.metrics import confusion_matrix

conf_mx = confusion_matrix(y_valid, y_pred_xgb)

In [None]:
# plot confusion matrix (raw)

my_classes = [0,1]

plot_confusion_matrix(conf_mx, my_classes, cmap=None, normalize=False)

In [None]:
plot_confusion_matrix(conf_mx, my_classes, cmap='Reds', normalize=True)

In [None]:
# set scale pos weight for imbalanced classification problems
# scale_pos_weight = total_negative_examples / total_positive_examples

unique, counts = np.unique(y_train, return_counts=True)
print(np.asarray((unique, counts)).T)

# Derive the ratio from the counts just computed instead of hard-coding them,
# so it stays correct whenever the split sizes, the seed or the data change.
# Label 0 is the negative class and label 1 the positive class; a KeyError here
# means one of the classes is absent from y_train.
class_counts = dict(zip(unique.tolist(), counts.tolist()))
my_scale_pos_weight = class_counts[0] / class_counts[1]
my_scale_pos_weight

In [None]:
# Same settings as the first XGBoost classifier, plus scale_pos_weight to
# re-weight the positive class for the imbalanced labels.
params2 = {'objective':'binary:logistic', 'eval_metric':'error',
           'random_state':my_seed, 'nthread':-1, 'n_estimators':300,
           'early_stopping_rounds':early_stop_rounds,
           'scale_pos_weight':my_scale_pos_weight
        }

# for multiclass classification: 'objective':'multi:softprob' (or 'multi:softmax')
# with 'eval_metric':'merror'
# check: https://xgboost.readthedocs.io/en/stable/parameter.html

xgb_classify2 = xgb.XGBClassifier(**params2)

In [None]:
%%time

# use early stopping: patience, in boosting rounds, on the validation set
early_stop_rounds = 50

# xgb_classify2 was constructed before this assignment, so rebinding the
# variable alone would leave the estimator on its earlier setting. Push the new
# value into the estimator explicitly.
xgb_classify2.set_params(early_stopping_rounds=early_stop_rounds)

# train; the validation set is the evaluation set used for early stopping
xgb_classify2.fit(df_train_prepared, y_train,
            eval_set=[(df_valid_prepared, y_valid)])

In [None]:
# get validation set prediction results

y_pred_xgb2 = xgb_classify2.predict(df_valid_prepared)
y_pred_xgb2

In [None]:
conf_mx2 = confusion_matrix(y_valid, y_pred_xgb2)

In [None]:
plot_confusion_matrix(conf_mx2, my_classes, cmap='Reds', normalize=True)