# About this notebook

## 目的 : fast aiとembedding + UMAPを見ることによって、
## 機械学習がどのように進んでいくかを見ていく。


## Objective: Observe how learning progresses by training a fastai model and visualizing its embeddings with UMAP.

# 1. Import & Load

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# List every file under the Kaggle input directory so the dataset paths used below can be checked.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [None]:
from fastai.tabular.all import *

In [None]:
# Load the training table of the digit-recognizer dataset; its "label" column is used as the target below.
train = pd.read_csv("/kaggle/input/digit-recognizer/train.csv")
train.head()

In [None]:
# Load the test table of the same dataset (not used again in the visible cells).
test = pd.read_csv("/kaggle/input/digit-recognizer/test.csv")
test.head()

In [None]:
# Load the sample submission file (not used again in the visible cells).
submission= pd.read_csv("/kaggle/input/digit-recognizer/sample_submission.csv")
submission.head()

# 2. 初期のUMAP状態を見る
## UMAP of the raw data, before any training

In [None]:
import cudf, cuml, cupy
from cuml import UMAP

In [None]:
# Preview the first rows of `train` again before computing the embedding.
train.head()

In [None]:
# Embed the raw input columns into 2-D with GPU UMAP, before any model is trained.
# The feature columns are selected by name instead of `iloc[:, 1:]`, so the result
# stays the same if this cell is re-run after the `fold` helper column has been
# added to `train` further down (errors="ignore" covers the case where it is absent).
umap = UMAP(random_state=42)
embed_2d = umap.fit_transform(train.drop(columns=["label", "fold"], errors="ignore").values)
# cupy.asnumpy ensures the embedding is a NumPy array for matplotlib.
embed_2d = cupy.asnumpy(embed_2d)

In [None]:
# Scatter the 2-D UMAP embedding of the raw inputs, coloured by label.
# A discrete colormap plus a colorbar makes the colour -> label mapping readable,
# small markers reduce overplotting, and the title/axis labels let the figure stand alone.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(embed_2d[:, 0], embed_2d[:, 1], c=train["label"], cmap="tab10", s=2)
fig.colorbar(points, ax=ax, label="label")
ax.set(title="UMAP of raw inputs (before training)", xlabel="UMAP 1", ylabel="UMAP 2");

## すでにlabelごとにある程度固まっている
## The points already cluster by label to some extent

# 3. 機械学習後のUMAP特徴量
UMAP of the features after training

## 3.1 5kfoldに分けて1fold目で実験
Split the data into 5 stratified folds and experiment with the first fold

In [None]:
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

In [None]:
# Assign every row to one of 5 folds, stratified on "label".
# StratifiedKFold.split yields positional indices, so the fold ids are written into a
# NumPy array by position and attached as an integer column in one step. This avoids
# label-based `.loc` (only correct when `train` has a default RangeIndex) and the
# temporary float/NaN column that needed a cast afterwards.
Fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_ids = np.full(len(train), -1, dtype=int)
for n, (train_index, val_index) in enumerate(Fold.split(train, train["label"])):
    fold_ids[val_index] = n
assert (fold_ids >= 0).all(), "every row must belong to exactly one validation fold"
train['fold'] = fold_ids
# Sanity check: number of rows per (fold, label) pair.
print(train.groupby(['fold', "label"]).size())

# seedの固定 : fix seed

In [None]:
# Seed value passed to fastai's seeding helper below.
seed=42
# set_seed is a helper provided by fastai
set_seed(seed, reproducible=True)

# 数値データ、ターゲットの定義 : define features and target

In [None]:
# Preview `train` again; it now also contains the `fold` column.
train.head()

In [None]:
# Candidate feature columns: everything except the first and the last column of `train`.
train.columns[1:-1]

In [None]:
#catlist = ["Sex","Embarked"] # names of the categorical (string) columns; unused here
# Names of the continuous (numeric) columns: every column except the first and the last.
contlist = list(train.columns[1:-1])
# Column the model is trained to predict.
TARGET = "label"

In [None]:
# Index labels of the rows assigned to fold 0; they are used as the validation set.
validind = train.loc[train["fold"].eq(0)].index
validind[:3]

In [None]:
# Training batch size; the validation DataLoader is configured with twice this value.
BATCH_SIZE = 256

In [None]:
# Number of validation rows.
len(validind)

# DataLoader

In [None]:
# Build the fastai tabular DataLoaders: rows in `validind` form the validation split,
# all remaining rows are training data.
# The original call passed the deprecated `shuffle_train=True` together with a
# contradictory `shuffle=False`; fastai lets `shuffle_train` override `shuffle`, so the
# single explicit `shuffle=True` below keeps the effective behaviour and removes the
# DeprecationWarning.
dls = TabularDataLoaders.from_df(train, # dataframe
                                 path='.',
                                 #procs=[Normalize], # preprocessing (string -> number encoding, median fillna, mean/std normalization); disabled here
                                 cat_names=None, # list of categorical column names; None is fine when there are none
                                 cont_names=contlist, # list of continuous column names; None is fine when there are none
                                 y_names=TARGET, # target
                                 y_block=CategoryBlock, # choose from CategoryBlock, MultiCategoryBlock, RegressionBlock (None is also allowed)
                                 valid_idx=validind, # index of the validation rows
                                 bs=BATCH_SIZE, # batch size
                                 shuffle=True, # shuffle the training data during training
                                 val_shuffle=False, # keep the validation order fixed
                                 n=None,
                                 device="cuda", # use "cuda" when running on a GPU
                                 drop_last=None, # whether to drop the last incomplete batch of the dataloader
                                 val_bs=BATCH_SIZE * 2 # validation batch size
                                 )

In [None]:
# Display a sample batch from the training DataLoader as a sanity check.
dls.train.show_batch()

# Model

In [None]:
# Create the tabular learner for the first experiment: hidden layers of 1000/500/200 units,
# Adam with cross-entropy loss, accuracy as the metric, best-model checkpointing and
# early stopping on accuracy.
learn = tabular_learner(dls, # DataLoaders created above
                        layers=[1000,500,200],  # hidden layer sizes of the neural net
                        emb_szs=None,  
                        config=None, 
                        n_out=None, # number of outputs; detected automatically when None
                        y_range=None, 
                        loss_func=CrossEntropyLossFlat(), # cross-entropy loss for the categorical target
                        opt_func=Adam,  # optimizer
                        lr=0.001, # learning rate
                        splitter=trainable_params, 
                        
                        
                        cbs=[
                            
                            SaveModelCallback(monitor="accuracy",comp=np.greater), # save the best model and load it back after training
                             EarlyStoppingCallback(monitor="accuracy",comp=np.greater, patience=3), # stop when the monitored metric has not improved for `patience` epochs
                          #   GradientClip, # keeps the weight updates from growing too large
                          #   ReduceLROnPlateau(monitor='accuracy',comp=np.greater, patience=10,factor = 10) # add this to adjust the learning rate while monitoring the metric
                           ],


                        metrics=accuracy, # metric 
                        path=None, 
                        model_dir='models', # presumably the save directory; left as-is because saving is handled separately later
                        wd=None, 
                        wd_bn_bias=False,
                        train_bn=True, 
                        moms=(0.95, 0.85, 0.95)
                        )

In [None]:
# Show the model architecture.
learn.model

In [None]:
# Run fastai's learning-rate finder; the training call that follows uses a fixed 1e-3 regardless of its output.
learn.lr_find()

In [None]:
# Train for a single epoch at learning rate 1e-3.
learn.fit(1, 1e-3)

## Visualize embedding

In [None]:
# Replace module [3][0] of the model's `layers` with an identity, so the forward pass
# returns that module's input instead of its output.
# NOTE(review): with layers=[1000,500,200] this is presumably the final output Linear, which
# would make the model emit the 200-d activations of the last hidden block -- confirm
# against the printed `learn.model`.
# The model is mutated in place: `learn` no longer produces class scores afterwards.
learn.model.layers[3][0] = nn.Identity()

In [None]:
# Training portion of the data: every row outside fold 0, re-indexed from 0.
trn = train.loc[train["fold"].ne(0)].reset_index(drop=True)

In [None]:
# Build an inference DataLoader for `trn` from the existing DataLoaders.
trn_dl = dls.test_dl(trn)

In [None]:
# Extract the model outputs for every row of `trn`.
# Learner.get_preds applies the loss function's activation by default (softmax for
# CrossEntropyLossFlat). Part of the model has been swapped for nn.Identity at this point,
# so the outputs are meant to be hidden features rather than class scores; an identity
# activation is passed so they reach UMAP unmodified.
pred, _ = learn.get_preds(dl=trn_dl, act=nn.Identity())

In [None]:
# Convert the model outputs to a NumPy array before passing them to UMAP.
pred = np.array(pred)

In [None]:
# Project the extracted model outputs to 2-D with a freshly seeded GPU UMAP;
# cupy.asnumpy ensures the result is a NumPy array for plotting.
umap = UMAP(random_state=42)
embed_2d = cupy.asnumpy(umap.fit_transform(pred))

In [None]:
# Scatter the 2-D UMAP embedding of the model outputs for the 1-epoch run, coloured by label.
# A discrete colormap plus a colorbar makes the colour -> label mapping readable,
# small markers reduce overplotting, and the title/axis labels let the figure stand alone.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(embed_2d[:, 0], embed_2d[:, 1], c=trn["label"], cmap="tab10", s=2)
fig.colorbar(points, ax=ax, label="label")
ax.set(title="UMAP of learned features (1-epoch run)", xlabel="UMAP 1", ylabel="UMAP 2");

# 少し離れた : The clusters have moved slightly apart

# 10 epochs

In [None]:
# Create a fresh learner with the same architecture and settings for the 10-epoch run;
# the early-stopping callback is commented out here, leaving only best-model checkpointing.
learn = tabular_learner(dls, # DataLoaders created above
                        layers=[1000,500,200],  # hidden layer sizes of the neural net
                        emb_szs=None,  
                        config=None, 
                        n_out=None, # number of outputs; detected automatically when None
                        y_range=None, 
                        loss_func=CrossEntropyLossFlat(), # cross-entropy loss for the categorical target
                        opt_func=Adam,  # optimizer
                        lr=0.001, # learning rate
                        splitter=trainable_params, 
                        
                        
                        cbs=[
                            
                            SaveModelCallback(monitor="accuracy",comp=np.greater), # save the best model and load it back after training
                           #  EarlyStoppingCallback(monitor="accuracy",comp=np.greater, patience=3), # stop when the monitored metric has not improved for `patience` epochs
                          #   GradientClip, # keeps the weight updates from growing too large
                          #   ReduceLROnPlateau(monitor='accuracy',comp=np.greater, patience=10,factor = 10) # add this to adjust the learning rate while monitoring the metric
                           ],


                        metrics=accuracy, # metric 
                        path=None, 
                        model_dir='models', # presumably the save directory; left as-is because saving is handled separately later
                        wd=None, 
                        wd_bn_bias=False,
                        train_bn=True, 
                        moms=(0.95, 0.85, 0.95)
                        )

In [None]:
# Train for 10 epochs at learning rate 1e-3.
learn.fit(10, 1e-3)

In [None]:
# Feature extraction and UMAP visualisation for the 10-epoch model.

# Swap module [3][0] of the model's `layers` for an identity so the forward pass returns
# that module's input.
# NOTE(review): presumably the 200-d activations of the last hidden block -- confirm
# against `learn.model`. The model is mutated in place.
learn.model.layers[3][0] = nn.Identity()
trn = train[train["fold"]!=0].reset_index(drop=True)
trn_dl = dls.test_dl(trn)
# Learner.get_preds applies the loss function's activation by default (softmax for
# CrossEntropyLossFlat); an identity activation keeps the extracted features unmodified.
pred, _ = learn.get_preds(dl=trn_dl, act=nn.Identity())

pred = np.array(pred)

# Project the features to 2-D; cupy.asnumpy ensures a NumPy array for plotting.
umap = UMAP(random_state=42)
embed_2d = umap.fit_transform(pred)
embed_2d = cupy.asnumpy( embed_2d )

# Discrete colormap + colorbar make the colour -> label mapping readable; small markers
# reduce overplotting; title and axis labels let the figure stand alone.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(embed_2d[:, 0], embed_2d[:, 1], c=trn["label"], cmap="tab10", s=2)
fig.colorbar(points, ax=ax, label="label")
ax.set(title="UMAP of learned features (10-epoch run)", xlabel="UMAP 1", ylabel="UMAP 2");

# かなりきれいに分離していっている。
The classes are now separating quite cleanly.