In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# インポート

In [None]:
from fastai.tabular.all import *

# fastai ドキュメント: https://docs.fast.ai/

fastaiには、他にも画像系やテキスト系など、データの種類に合ったモジュールがある。
今回はTitanicのデータが表形式（テーブルデータ）なので、tabularを使う。

In [None]:
# Load the Titanic training table (it includes the Survived label used later) and preview it
train = pd.read_csv(filepath_or_buffer="../input/titanic/train.csv")
train.head(n=5)

In [None]:
# Load the Titanic test table (the rows to predict for the submission) and preview it
test = pd.read_csv(filepath_or_buffer="../input/titanic/test.csv")
test.head(n=5)

In [None]:
# Load the sample submission file; its Survived column is overwritten with predictions later
submission = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/gender_submission.csv")
submission.head(n=5)

# Kfoldの準備

In [None]:
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

In [None]:
# Tag every training row with the index (0-4) of the stratified fold in which it is held out
# for validation. Stratifying on Survived keeps the class ratio similar across folds.
Fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_splits = Fold.split(train, train["Survived"])
for fold_id, (fit_rows, holdout_rows) in enumerate(fold_splits):
    # Positions returned by split() are used as row labels here
    # (assumes train still has its default 0..n-1 index from read_csv).
    train.loc[holdout_rows, 'fold'] = int(fold_id)
# Cast to int once every row has received a fold id
train['fold'] = train['fold'].astype(int)
# Sanity check: class counts per fold
print(train.groupby(['fold', "Survived"]).size())

# seedの固定

In [None]:
# Fix the random seed so that training runs are repeatable.
seed=42
# set_seed is fastai's own seeding helper (brought in by the fastai star import)
set_seed(seed, reproducible=True)

# 数値データ、カテゴリーデータ、ターゲットの定義

In [None]:
# Column roles for the tabular model
TARGET = "Survived"                             # column to predict
contlist = ["Pclass", "Age", "SibSp", "Parch"]  # numeric (continuous) feature columns
catlist = ["Sex", "Embarked"]                   # string-valued (categorical) feature columns

In [None]:
# Row labels of the fold-0 rows; these become the validation split
is_fold0 = train["fold"] == 0
validind = train.loc[is_fold0].index
validind[:3]

In [None]:
# Mini-batch size for training; the validation loader is given twice this value (val_bs)
BATCH_SIZE = 32

In [None]:
# Number of rows in the fold-0 validation split
len(validind)

# DataLoaderの宣言

In [None]:
# Build the training/validation DataLoaders for fold 0.
# NOTE: the original passed the deprecated `shuffle_train=True` together with `shuffle=False`.
# In current fastai `shuffle_train` overrides `shuffle` (and emits a DeprecationWarning), so the
# training set was being shuffled anyway; `shuffle=True` states that directly.
dls = TabularDataLoaders.from_df(
    train,                                      # source dataframe
    path='.',
    # Categorify: encode string categories as integers
    # FillMissing: impute missing continuous values (median by default)
    # Normalize: subtract the mean and divide by the standard deviation
    procs=[Categorify, FillMissing, Normalize],
    cat_names=catlist,                          # categorical columns (None if there are none)
    cont_names=contlist,                        # continuous columns (None if there are none)
    y_names=TARGET,                             # target column
    y_block=CategoryBlock,                      # classification target
    valid_idx=validind,                         # rows held out for validation
    bs=BATCH_SIZE,                              # training batch size
    shuffle=True,                               # shuffle the training data each epoch
    val_shuffle=False,                          # keep validation order fixed
    n=None,
    device=None,                                # None lets fastai choose; pass "cuda" to force the GPU
    drop_last=None,                             # None: follow `shuffle` (drop the incomplete last training batch)
    val_bs=BATCH_SIZE * 2,                      # validation batch size
)

In [None]:
# Display the DataLoaders object that was just built
dls

In [None]:
# Preview one batch from the training DataLoader
dls.train.show_batch()

In [None]:
# Length of the validation DataLoader (i.e. how many batches it yields)
len(dls.valid)

In [None]:
# Preview one batch from the validation DataLoader
dls.valid.show_batch()

# 学習モデルの定義

In [None]:
# Define the fold-0 learner: a fully connected tabular network trained with Adam and
# cross-entropy, evaluated on validation accuracy.
#
# Optional callbacks that can be appended to `cbs` when experimenting:
#   EarlyStoppingCallback(monitor="accuracy", comp=np.greater, patience=30)  # stop after `patience` epochs without improvement
#   GradientClip                                                             # clip gradients so updates do not blow up
#   ReduceLROnPlateau(monitor='accuracy', comp=np.greater, patience=10, factor=10)  # adapt the learning rate while monitoring
learn = tabular_learner(
    dls,                                # DataLoaders built earlier in the notebook
    layers=[1000, 500, 200],            # hidden layer widths
    emb_szs=None,
    config=None,
    n_out=None,                         # None: output size is inferred automatically
    y_range=None,
    loss_func=CrossEntropyLossFlat(),   # see https://docs.fast.ai/
    opt_func=Adam,                      # optimizer
    lr=0.001,                           # default learning rate
    splitter=trainable_params,
    cbs=[
        # Keep the checkpoint with the highest validation accuracy and reload it after training
        SaveModelCallback(monitor="accuracy", comp=np.greater),
    ],
    metrics=accuracy,                   # reported metric
    path=None,
    model_dir='models',                 # presumably where checkpoints are written; models are exported explicitly later
    wd=None,
    wd_bn_bias=False,
    train_bn=True,
    moms=(0.95, 0.85, 0.95),
)

In [None]:
# Display the underlying network architecture
learn.model

In [None]:
# Run fastai's learning-rate finder to help choose a learning rate
learn.lr_find()

In [None]:
# Train for 100 epochs at a fixed learning rate of 1e-4 (this overrides the learner's default lr)
learn.fit(n_epoch=100, lr=1e-4)

In [None]:
# Plot the losses recorded during training
learn.recorder.plot_loss()

In [None]:
# Show some sample predictions from the trained learner
learn.show_results()

In [None]:
# Print the training-loop events and the callbacks attached to each
learn.show_training_loop()

# モデルの保存

In [None]:
# Alternative: learn.save(f"fastai_fold{0}") would write a .pth checkpoint instead.

learn.export(f"fastai_fold{0}.pkl") # Pickle the whole Learner for fold 0. Author's note: this keeps the file small, so it suits e.g. image tasks (not verified here).

# Alternative: torch.save(learn.model, f"fastai_fold{0}.pth") would save only the PyTorch model rather than the Learner.

In [None]:
# Standalone copy of the fold-0 validation rows, re-indexed from 0, then previewed
fold0_rows = train.loc[train["fold"] == 0]
validdf = fold0_rows.reset_index(drop=True)
validdf.head(n=5)

In [None]:
# Wrap the validation dataframe in a test-style DataLoader built from dls
val_dl = dls.test_dl(validdf)

In [None]:
# Preview one batch from the DataLoader created above
val_dl.show_batch()

In [None]:
# Predict on val_dl; the second returned value is discarded
pred, _ = learn.get_preds(dl=val_dl)

In [None]:
# Inspect the raw predictions
pred

In [None]:
# Predict for a single row (the first validation row)
learn.predict(validdf.iloc[0])

In [None]:
# Predict on the learner's own validation set (dataset index 1) and count the predictions
pred, _ = learn.get_preds(ds_idx=1)
len(pred)

In [None]:
# Inspect the validation-set predictions
pred

In [None]:
# Turn each row of predictions into a class label (index of the largest value), kept as a list
preds = list(np.array(pred).argmax(axis=1))
preds[:3]

In [None]:
# Train one model per cross-validation fold and collect each model's predictions on the test set.
# Changes from the original cell:
#   * the deprecated `shuffle_train=True` plus contradictory `shuffle=False` is replaced by
#     `shuffle=True` (what current fastai actually did, minus the DeprecationWarning);
#   * the number of folds is read from the `fold` column instead of a hard-coded 5;
#   * dead commented-out code is removed.
seed = 42
set_seed(seed, reproducible=True)

preds = []  # one prediction tensor per fold (presumably n_test_rows x n_classes; confirm)

# Fold ids are 0..k-1 because they were assigned with enumerate() when the folds were created
for i in range(train["fold"].nunique()):

    # Rows of fold i are held out for validation; all other rows are used for training
    validind = train[train["fold"] == i].index

    dls = TabularDataLoaders.from_df(
        train,                                      # source dataframe
        path='.',
        # Categorify: encode string categories as integers
        # FillMissing: impute missing continuous values (median by default)
        # Normalize: subtract the mean and divide by the standard deviation
        procs=[Categorify, FillMissing, Normalize],
        cat_names=catlist,                          # categorical columns
        cont_names=contlist,                        # continuous columns
        y_names=TARGET,                             # target column
        y_block=CategoryBlock,                      # classification target
        valid_idx=validind,                         # rows held out for validation
        bs=BATCH_SIZE,                              # training batch size
        shuffle=True,                               # shuffle the training data each epoch
        val_shuffle=False,                          # keep validation order fixed
        n=None,
        device=None,                                # None lets fastai choose; pass "cuda" to force the GPU
        drop_last=None,                             # None: follow `shuffle`
        val_bs=BATCH_SIZE * 2,                      # validation batch size
    )

    learn = tabular_learner(
        dls,
        layers=[1000, 500, 200],                    # hidden layer widths
        emb_szs=None,
        config=None,
        n_out=None,                                 # None: output size is inferred automatically
        y_range=None,
        loss_func=CrossEntropyLossFlat(),
        opt_func=Adam,                              # optimizer
        lr=1e-4,                                    # default learning rate
        splitter=trainable_params,
        cbs=[
            # Keep the checkpoint with the highest validation accuracy and reload it after training.
            # Other callbacks to consider: EarlyStoppingCallback, GradientClip, ReduceLROnPlateau.
            SaveModelCallback(monitor="accuracy", comp=np.greater),
        ],
        metrics=accuracy,
        path=None,
        model_dir='models',
        wd=None,
        wd_bn_bias=False,
        train_bn=True,
        moms=(0.95, 0.85, 0.95),
    )

    learn.fit(100, 1e-4)

    # Persist the whole Learner for this fold as a pickle
    learn.export(f"fastai_fold{i}.pkl")

    # Predict on the test set with this fold's model; the second returned value is discarded
    test_dl = dls.test_dl(test)
    pred, _ = learn.get_preds(dl=test_dl)

    preds.append(pred)




In [None]:
# Convert each fold's prediction tensor into a NumPy array
preds2 = list(map(np.array, preds))

In [None]:
# Number of per-fold prediction arrays collected
len(preds2)

In [None]:
# Ensemble: average the per-fold predictions element-wise, then pick the highest-scoring class per row
fold_mean = np.mean(preds2, axis=0)
submission.Survived = fold_mean.argmax(axis=1)

In [None]:
# Write the submission file to the working directory, without the dataframe index column
submission.to_csv("submission.csv", index=False)