# Pytorch Neural Network Starter
## About this notebook
*  This notebook is a starter for building a neural network with PyTorch. It covers NaN handling, K-fold cross-validation, label encoding, one-hot encoding and more.
*  It is deliberately written step by step so that it is easy to follow. A shorter version will be made later.
*  If you find it helpful, please upvote.



*  このノートブックは、NaNの処理、Kfold, ラベルエンコーディング、ワンホットエンコーディングなどの様々な場面で使うコードを含む、pytorchのニューラルネットワークの基礎コードです。
*  理解を深めるために、長く書いています。ショートバージョンも作ります。
*  もし、お役に立ちましたら、upvoteしてくれると嬉しいです。



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset,Dataset

import gc

import random

import transformers
import warnings
warnings.simplefilter('ignore')

# Optional mixed-precision gradient scaler to speed up GPU training (left disabled).
#scaler = torch.cuda.amp.GradScaler()

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

# Fixing the random seed for reproducibility. 再現性確保のためのランダムシード固定

In [None]:
SEED = 508

def random_seed(SEED):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and configures cuDNN for deterministic behaviour so that repeated runs
    give the same results.

    Args:
        SEED: Integer seed applied to all generators.
    """
    random.seed(SEED)
    # Only affects Python processes started after this point; the hash seed of
    # the running interpreter was fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and possibly pick a different
    # algorithm on each run, so switch it off explicitly.
    torch.backends.cudnn.benchmark = False

random_seed(SEED)

![image.png](attachment:a82168a7-f857-41f3-8134-6299c5ff1924.png)

# 0. Confirming the train/test data : データの確認


In [None]:
df = pd.read_csv("/kaggle/input/titanic/train.csv")
df

In [None]:
test = pd.read_csv("../input/titanic/test.csv")
test

## About data
survival	Survival	0 = No, 1 = Yes
pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
sex	Sex	
Age	Age in years	
sibsp	# of siblings / spouses aboard the Titanic	
parch	# of parents / children aboard the Titanic	
ticket	Ticket number	
fare	Passenger fare	
cabin	Cabin number	
embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

#########日本語#################

survival	生死	0 = 死亡, 1 = 生存
pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
sex	性別	
Age	年齢	
sibsp	# of siblings / 親族の数	
parch	# of parents / 子供の数	
ticket	Ticket number　チケットナンバー	
fare	Passenger fare	運賃
cabin	Cabin number	部屋の番号
embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton　乗船した場所


In [None]:
df.info()

#### There are 891 rows. There are NaN data in Age, Cabin, Embarked.
#### 全部で891行あって、Age,Cabin,Embarkedにはnullデータはないが、NaNデータがありそう。

In [None]:
df.columns

In [None]:
# Print "<column>:<number of distinct values>" for every column (NaN counts as a value).
for column_name, n_distinct in df.nunique(dropna=False).items():
    print(f"{column_name}:{n_distinct}")

![image.png](attachment:da5e624e-1e46-4dfd-b29f-6e9a8a74e772.png)

# 1. Handling of NaN values : 欠損値の処理
### 1.1 Filling with the mean value : 平均値で埋める

In [None]:
df["Age"]

In [None]:
# To compute the mean, dropna() removes the NaN entries and keeps only the valid values.
df["Age"].dropna()

In [None]:
dmean = df["Age"].dropna().mean()
dmean

In [None]:
# Fill the missing Age values with the mean age computed above.

df["Age"] = df["Age"].fillna(dmean)

In [None]:
df

In [None]:
test["Age"] = test["Age"].fillna(dmean)

![image.png](attachment:09e034ba-3b5e-4b8d-a3fc-cb86ea40a018.png)

# 2 label encoding
#### Automatically convert strings to numbers. Sex is encoded this way because its two categories map naturally onto 0 and 1.
#### 文字列を数字に自動変換. 0と1など有意差があるため、このやり方で性別を分けています。

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le=LabelEncoder()
le.fit(df["Sex"])
df["Sex"] = le.transform(df["Sex"])
test["Sex"] = le.transform(test["Sex"])

In [None]:
df

![image.png](attachment:ef42746a-7eb2-467e-a91c-5fa76bf8c6cd.png)

# 3. One-hot encoding
#### Automatically convert strings to numbers line by line. 
#### There are four Embarked places including NaN, but since no significant difference can be considered, divide them in parallel.
#### 文字列を行ごとに数字に自動変換。EmbarkedはNaNを入れて4つあるが、有意差が考えられないので、並列に分けます。

In [None]:
# First, stack train and test vertically (axis=0) so both get identical dummy columns.
# ignore_index=True gives every row a unique label; without it the test rows
# reuse the labels already used by the train rows.

dfall = pd.concat([df,test],axis=0,ignore_index=True)
dfall

In [None]:
# One-hot encode Embarked with pd.get_dummies; dummy_na=True adds a separate column for NaN.

dfall2 = pd.get_dummies(dfall["Embarked"],dummy_na=True)
dfall2

In [None]:
# Append the dummy columns to the combined frame horizontally (axis=1).

dfall = pd.concat([dfall,dfall2],axis=1)
dfall

In [None]:
# Split the combined frame back into train and test: the first len(df) rows are the train data.

train = dfall.iloc[:len(df),:]
test = dfall.iloc[len(df):,:]

In [None]:
train

In [None]:
test

![image.png](attachment:86cbcb8d-bab8-493a-b8bf-1672810567ba.png)

# 4. Kfold
#### Prepare training data and verification data in 5 combinations.
#### 訓練データと検証データを5つの組み合わせで準備する。

In [None]:
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

In [None]:
# Assign every training row to one of 5 stratified folds, stratified on "Survived".
folds = train.copy()
folds["fold"] = -1  # integer placeholder; every row is overwritten in the loop below
fold_col = folds.columns.get_loc("fold")
Fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for n, (train_index, val_index) in enumerate(Fold.split(folds, folds["Survived"])):
    # split() yields positional indices, so write by position (iloc) rather than by label (loc).
    folds.iloc[val_index, fold_col] = n
folds['fold'] = folds['fold'].astype(int)
print(folds.groupby(['fold', "Survived"]).size())

In [None]:
folds

## for practice, fold0 is defined as validation, fold1-4 are defined as train
## 練習のために、まず、fold0を検証データ、fold1-4を訓練データとします。

In [None]:
p_train = folds[folds["fold"] != 0]
p_val = folds[folds["fold"] == 0]

In [None]:
p_train

In [None]:
# Filtering by fold leaves gaps in the index, so renumber the rows from 0;
# otherwise an error occurs later on.

p_train = p_train.reset_index(drop=True)
p_val = p_val.reset_index(drop=True)

In [None]:
p_train

![image.png](attachment:ca855327-bec4-4383-8489-a591b4f805ce.png)

# 5.Neural Network using pytorch

![image.png](attachment:fcab6a0d-7d12-421a-923d-cbd5509975e3.png)

# 5.1 Defining features and target
##     特徴量とラベル(ターゲット)を定義します

In [None]:
# Feature columns fed to the network, and the target column.
# "C", "Q", "S" and np.nan are the one-hot columns produced by
# pd.get_dummies(dfall["Embarked"], dummy_na=True); the np.nan entry is the
# column that flags a missing Embarked value.

FEATURES = ["Pclass","Sex","Age","SibSp","Parch","C","Q","S",np.nan]
TARGET = "Survived"

In [None]:
p_train[FEATURES]

![image.png](attachment:cd563819-fd4d-49b5-8098-893b10a186ca.png)

# 5.2 Dataset/DataLoader
## Dataset : Set the combination of features and correct answers.　
#### → 特徴量と正解の組み合わせをセットします。イメージでいうと、1つのビニール袋の中に特徴量1個と正解1個を入れるイメージ（これが1行分)。


## 5.2.1 まずは1つやってみる 

In [None]:
p_train[FEATURES]

In [None]:
p_train[TARGET]

#### 5.2.1.1 DataFrame → Numpy

In [None]:
train_X = np.array(p_train[FEATURES])
train_Y = np.array(p_train[TARGET])

val_X = np.array(p_val[FEATURES])
val_Y = np.array(p_val[TARGET])

In [None]:
train_X[:3]

#### 5.2.1.2 Numpy → Normalization

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every feature column; the fitted scaler is reused for all later transforms.
# NOTE(review): the scaler is fitted on the whole `train` frame, i.e. including the
# rows later used for validation — consider fitting on the training fold only.
Normarizescaler = StandardScaler()
Normarizescaler.fit(np.array(train[FEATURES]))

In [None]:
train_X = Normarizescaler.transform(train_X)
val_X = Normarizescaler.transform(val_X)

In [None]:
train_X[:3]

#### 5.2.1.4 Normalization → torch

In [None]:
train_X = torch.from_numpy(train_X).float()
train_Y = torch.from_numpy(train_Y).long() # long : int64

val_X = torch.from_numpy(val_X).float()
val_Y = torch.from_numpy(val_Y).long() # long : int64

In [None]:
train_X[:3]

#### 5.2.1.5 Tensor Dataset

In [None]:
train_dataset = TensorDataset(train_X,train_Y)
val_dataset = TensorDataset(val_X,val_Y)

In [None]:
train_dataset[0]

![image.png](attachment:ab3252e6-9f52-4389-9adc-c5e19dc1d4d6.png)

# 5.3 Making class : In Pytorch, when creating a Dataset, it is often created as a class as shown below. Therefore, I will explain how to do this.

Pytorchでは、よくDatasetを作るときに以下のように、classにして作成することが多いです。そのため、このやり方を説明します。

In [None]:
class PytorchDataSet(Dataset):
    """Dataset yielding one standardised feature vector and its label per row.

    Each item is a dict ``{"X": float32 feature tensor, "Y": int64 label}``.
    Rows without a known label (test data) receive ``PLACEHOLDER_TARGET`` as
    label so that train, validation and test frames can share this class.
    """

    # Dummy label used wherever the target is missing.
    PLACEHOLDER_TARGET = 9999

    def __init__(self,df):
        """Convert the FEATURES / TARGET columns of ``df`` into tensors.

        Args:
            df: DataFrame containing every column listed in FEATURES and,
                optionally, the TARGET column. It is not modified.
        """
        # Work on a copy so the caller's DataFrame is never altered.
        df = df.copy()

        # Test data has no usable labels: the column is either absent or, after
        # the train/test concat, present but filled with NaN. NaN cannot be
        # converted to int64 reliably, so use the placeholder in both cases.
        if TARGET not in df.columns:
            df[TARGET] = self.PLACEHOLDER_TARGET
        else:
            df[TARGET] = df[TARGET].fillna(self.PLACEHOLDER_TARGET)

        self.df = df

        # Apply the scaler fitted earlier in the notebook.
        features = Normarizescaler.transform(np.array(self.df[FEATURES]))
        targets = np.array(self.df[TARGET])

        self.train_X = torch.from_numpy(features).float()
        self.train_Y = torch.from_numpy(targets).long() # long : int64

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.df)

    def __getitem__(self,idx):
        """Return the feature tensor and label of row ``idx`` as a dict."""
        return {"X":self.train_X[idx],"Y":self.train_Y[idx]}

In [None]:
train_dataset = PytorchDataSet(p_train)
val_dataset = PytorchDataSet(p_val)
test_dataset = PytorchDataSet(test)

In [None]:
train[FEATURES].head(3)

You can see that the features of row 0 and its label have been paired.


0行目の特徴量とlabelの組み合わせができていることがわかる

## Dataloader : Change the Dataset to a batch processing (multiple processing) format.
#### → Datasetをバッチ処理(複数処理)するフォーマットに変えます。Dataloaderは、Datasetで作った1つの袋を大きな紙袋に入れて、紙袋ごと処理するイメージ。この場合、1つの紙袋(Dataloader)には、trainingでは256個のビニール袋を詰め込むイメージ。

In [None]:
train_dataloader = DataLoader(train_dataset,batch_size=256,shuffle = True)
val_dataloader = DataLoader(val_dataset,batch_size=256*2,shuffle = False)
test_dataloader = DataLoader(test_dataset,batch_size=256*2,shuffle = False)

In [None]:
for a in train_dataloader:
    print(a)
    break

![image.png](attachment:7d246e9d-d007-4f9f-8bf7-bc709443a53b.png)

# 5.4 Modeling

In [None]:
class Net(nn.Module):
    """Fully connected classifier: input -> hidden1 -> hidden2 -> n_classes, with ReLU.

    Args:
        n_features: Size of the input vector. Defaults to ``len(FEATURES)``.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
        n_classes: Number of output logits.
    """
    def __init__(self, n_features=None, hidden1=512, hidden2=256, n_classes=2):
        super().__init__()
        if n_features is None:
            # Default keeps a plain `Net()` tied to the notebook's feature list.
            n_features = len(FEATURES)
        self.fc1 = nn.Linear(n_features,hidden1) # input -> first hidden layer
        self.fc2 = nn.Linear(hidden1,hidden2) # first -> second hidden layer
        self.fc3 = nn.Linear(hidden2,n_classes) # second hidden layer -> output logits

    def forward(self,x):
        """Return the raw logits for ``x``; no softmax is applied."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

## 5.4.1 Definition of Criterion, optimizer

In [None]:
model=Net() # model instance

model.to(device) # move the model to the selected device (needed on GPU, harmless on CPU)

criterion = nn.CrossEntropyLoss() # loss function

optimizer = torch.optim.SGD(model.parameters(), lr=0.01) # algorithm that updates the weights

In [None]:
model

## 5.4.2 Training one epoch

In [None]:
# Walk through a single training step on the first batch only (note the `break`).
total_loss = 0 # running sum of the batch losses

model.train() # switch the model to training mode

for a in train_dataloader:

        train_x= a["X"].to(device) # features of this batch
        train_y = a["Y"].to(device) # labels of this batch


        optimizer.zero_grad() # reset the gradients accumulated so far

        output = model(train_x) # forward pass: predictions

        loss = criterion(output,train_y) # loss between predictions and labels

        loss.backward() # back-propagate the loss

        optimizer.step() # update the weights

        total_loss += loss.item() # accumulate the loss as a Python number

        break # stop after the first batch; this cell is only a demonstration

In [None]:
output[:3]

In [None]:
torch.max(output.data,1)

In [None]:
torch.max(output.data,1)[1]

## In numpy case, I use this method. ※ This is different in torch ⇔ numpy

In [None]:
output_numpy = output.detach().cpu().numpy() # torch to numpy

In [None]:
out2 = [s.argmax() for s in output_numpy] 

In [None]:
out2[:20]

In [None]:
total_loss

## 5.4.3 Functionalized for 1epoch

In [None]:
def training(train_dataloader,model):
    """Train ``model`` for one epoch and return it with the mean batch loss.

    Relies on the notebook-level ``optimizer``, ``criterion`` and ``device``.

    Args:
        train_dataloader: Iterable of batches, each a dict with the features
            under "X" and the labels under "Y".
        model: Network to optimise; it is switched to train mode.

    Returns:
        Tuple ``(model, mean_loss)`` where ``mean_loss`` is the sum of the
        per-batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []

    for batch in train_dataloader:
        features = batch["X"].to(device)
        labels = batch["Y"].to(device)

        optimizer.zero_grad()  # clear the gradients of the previous step
        loss = criterion(model(features), labels)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return model, sum(batch_losses) / len(train_dataloader)
        

In [None]:
model,total_loss = training(train_dataloader,model)

In [None]:
total_loss

## 5.4.4 validation

In [None]:
# Walk through a single validation step on the first batch only (note the `break`).
total_loss = 0 # running sum of the batch losses

model.eval() # switch the model to evaluation mode

for a in val_dataloader:

    with torch.no_grad(): # no gradients are needed for validation


        val_x= a["X"].to(device) # features of this batch
        val_y = a["Y"].to(device) # labels of this batch

        output = model(val_x) # forward pass: predictions

        loss = criterion(output,val_y) # loss between predictions and labels


        total_loss += loss.item() # accumulate the loss as a Python number

        break # stop after the first batch; this cell is only a demonstration

In [None]:
output[:3]

## 5.4.5 Functionalized for 1epoch

In [None]:
def valeval(val_dataloader,model):
    """Evaluate ``model`` on a dataloader without updating its weights.

    Relies on the notebook-level ``criterion`` and ``device``.

    Args:
        val_dataloader: Iterable of batches, each a dict with the features
            under "X" and the labels under "Y".
        model: Network to evaluate; it is switched to eval mode.

    Returns:
        Tuple ``(allpreds, mean_loss)``: the raw model outputs of all batches
        concatenated into one NumPy array, and the sum of the per-batch losses
        divided by the number of batches.
    """
    model.eval()
    batch_outputs = []
    batch_losses = []

    with torch.no_grad():
        for batch in val_dataloader:
            features = batch["X"].to(device)
            labels = batch["Y"].to(device)

            logits = model(features)
            batch_outputs.append(logits.detach().cpu().numpy())
            batch_losses.append(criterion(logits, labels).item())

    mean_loss = sum(batch_losses) / len(val_dataloader)
    return np.concatenate(batch_outputs), mean_loss


In [None]:
allpreds,valloss = valeval(val_dataloader,model)

![image.png](attachment:dc37d308-233a-4831-b289-b4bbe1810709.png)

# 5.5     (test) 1000 epoch training and confirm

Initialize model condition

In [None]:
model=Net() # model instance
model.to(device)

criterion = nn.CrossEntropyLoss() # how to calculate loss function
optimizer = torch.optim.SGD(model.parameters(), lr=0.01) # Algo for optimizing weight

In [None]:
all_res = []

for epoch in tqdm(range(1000)):
    
    model,trainloss = training(train_dataloader,model)
    allpreds,valloss = valeval(val_dataloader,model)
    all_res.append([epoch,trainloss,valloss])
 

In [None]:
alldf = pd.DataFrame(all_res)
alldf.columns = ["epoch","trainloss","valloss"]
alldf

In [None]:
alldf.head(10)

In [None]:
plt.plot(alldf["epoch"],alldf["trainloss"])
plt.plot(alldf["epoch"],alldf["valloss"])

## point : early stopping is needed in order to avoid overfitting 

### Accuracy score

In [None]:
train_X

In [None]:
# The model lives on `device`, so the inputs must be moved there as well.
# no_grad() skips gradient tracking, which prediction does not need, and the
# result is brought back to the CPU so scikit-learn can consume it later.
with torch.no_grad():
    train_preds = model(train_X.to(device)).cpu()
train_preds

In [None]:
train_preds2 = torch.max(train_preds.data,1)[1]
train_preds2[:3]

## scikit-learn already provides an accuracy function (`accuracy_score`)

In [None]:
accuracy_score(train_Y,train_preds2)

In [None]:
# functionalize

def calc_accuracy(x,y,model):
    """Return the classification accuracy of ``model`` on features ``x``.

    Args:
        x: Tensor of features, one row per sample (any device).
        y: True class indices, as a tensor or array-like.
        model: Network returning one score per class for every row.

    Returns:
        Fraction of samples whose arg-max prediction equals ``y``.
    """
    # Run on whatever device the model currently lives on, so CPU inputs also
    # work with a model that has been moved to the GPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else x.device

    was_training = model.training
    model.eval()
    with torch.no_grad():
        preds = model(x.to(target_device))
    model.train(was_training)  # restore the mode the caller had set

    # scikit-learn works on NumPy arrays, which require CPU tensors.
    preds2 = torch.max(preds,1)[1].cpu().numpy()
    y_true = y.detach().cpu().numpy() if torch.is_tensor(y) else np.asarray(y)
    return accuracy_score(y_true,preds2)
    
    

In [None]:
calc_accuracy(train_X,train_Y,model)

In [None]:
calc_accuracy(val_X,val_Y,model)

In [None]:
len(train_X)

## Strategy: Save model on best validation score update using calc_accuracy

In [None]:
# Per-epoch history rows: [epoch, trainloss, valloss, trainscore, valscore]
allres=[]

bestscore = 0

model=Net() # model instance
model.to(device) # training()/valeval() move every batch to `device`, so the model must be there too

criterion = nn.CrossEntropyLoss() # loss function

optimizer = torch.optim.SGD(model.parameters(), lr=0.01) # algorithm that updates the weights

for epoch in tqdm(range(1000)):

    model,trainloss = training(train_dataloader,model)

    preds,valloss = valeval(val_dataloader,model)

    trainscore = calc_accuracy(train_X,train_Y,model)

    valscore = calc_accuracy(val_X,val_Y,model)

    allres.append([epoch,trainloss,valloss,trainscore,valscore])

    # Save a checkpoint only when the validation accuracy improves.
    if bestscore <valscore:

        # Store a plain Python float so the checkpoint holds no NumPy objects.
        bestscore = float(valscore)
        state = {
                    'state_dict': model.state_dict(),
                    'optimizer_dict': optimizer.state_dict(),
                    "bestscore":bestscore
                }

        torch.save(state, "model1.pth")
        
    
    

In [None]:
bestscore

In [None]:
resdf = pd.DataFrame(allres)
resdf.columns=["epoch","trainloss","valloss","trainscore","valscore"]

In [None]:
resdf

In [None]:
plt.plot(resdf["epoch"],resdf["trainloss"])
plt.plot(resdf["epoch"],resdf["valloss"])

In [None]:
plt.plot(resdf["epoch"],resdf["trainscore"])
plt.plot(resdf["epoch"],resdf["valscore"])

![image.png](attachment:ad7baa01-f679-481c-bc46-4dd1c8649137.png)

# 6.inference for test data

## 6.1 loading model

In [None]:
# map_location lets a checkpoint saved on one device (e.g. GPU) be loaded on another (e.g. CPU).
state = torch.load("./model1.pth", map_location=device)

In [None]:
model.load_state_dict(state["state_dict"])

In [None]:
# confirming submission file

In [None]:
submission = pd.read_csv("../input/titanic/gender_submission.csv")
submission

In [None]:
for a in test_dataloader:
    print(a)
    break

In [None]:
def inference(test_dataloader,model):
    """Run ``model`` over a dataloader and return the stacked raw outputs.

    Args:
        test_dataloader: Iterable of batches, each a dict holding the feature
            tensor under "X". Any other key (e.g. "Y") is ignored.
        model: Trained network; it is switched to eval mode.

    Returns:
        NumPy array with the model outputs of all batches concatenated along
        axis 0.
    """
    model.eval()

    # Feed the inputs to the device the model's parameters are on.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    allpreds=[]

    with torch.no_grad():
        for a in test_dataloader:
            test_x = a["X"] if target_device is None else a["X"].to(target_device)
            allpreds.append(model(test_x).cpu().numpy())

    return np.concatenate(allpreds)


In [None]:
allpreds = inference(test_dataloader,model)

In [None]:
allpreds[:3]

In [None]:
allpreds2 = [s.argmax() for s in allpreds]

In [None]:
submission.head(3)

In [None]:
submission["Survived"] = allpreds2

In [None]:
submission.to_csv("submission1.csv",index = False)

![image.png](attachment:aea4a051-cd09-454c-8cab-51f01327657b.png)

# 7. Application : Kfold

In [None]:
kall_preds = []
bestscores=[]

# The test set is identical for every fold, so build its loader once.
test_dataset = PytorchDataSet(test)
test_dataloader = DataLoader(test_dataset,batch_size=256*2,shuffle = False)

for fold in range(5):

    print(f"----fold={fold}---start")

    # Rows of the current fold are held out for validation; the rest is used for training.
    # reset_index renumbers the rows from 0 after filtering.
    p_train = folds[folds["fold"] != fold].reset_index(drop=True)
    p_val = folds[folds["fold"] == fold].reset_index(drop=True)

    train_dataset = PytorchDataSet(p_train)
    val_dataset = PytorchDataSet(p_val)

    train_dataloader = DataLoader(train_dataset,batch_size=256,shuffle = True)
    val_dataloader = DataLoader(val_dataset,batch_size=256*2,shuffle = False)

    # Fresh model, loss and optimizer for every fold.
    model=Net() # model instance

    model.to(device) # training()/valeval() move every batch to `device`, so the model must be there too

    criterion = nn.CrossEntropyLoss() # loss function

    optimizer = torch.optim.SGD(model.parameters(), lr=0.01) # algorithm that updates the weights

    # Per-epoch history rows: [epoch, trainloss, valloss, trainscore, valscore]
    allres=[]

    bestscore = 0

    for epoch in tqdm(range(1000)):

        model,trainloss = training(train_dataloader,model)

        preds,valloss = valeval(val_dataloader,model)

        # Score on THIS fold's tensors. The notebook-level train_X / val_X always
        # hold fold 0, whose validation rows are training rows for folds 1-4.
        trainscore = calc_accuracy(train_dataset.train_X,train_dataset.train_Y,model)

        valscore = calc_accuracy(val_dataset.train_X,val_dataset.train_Y,model)

        allres.append([epoch,trainloss,valloss,trainscore,valscore])

        # Save a checkpoint only when the validation accuracy improves.
        if bestscore <valscore:

            # Store a plain Python float so the checkpoint holds no NumPy objects.
            bestscore = float(valscore)
            state = {
                        'state_dict': model.state_dict(),
                        'optimizer_dict': optimizer.state_dict(),
                        "bestscore":bestscore
                    }

            torch.save(state, f"model{fold}.pth")

    bestscores.append(bestscore)

## 7.1 Kfold inference ( this part can be included in #7 )

In [None]:
kall_preds = []

for fold in range(5):

    # map_location makes the checkpoint loadable regardless of the device it was saved from.
    state = torch.load(f"./model{fold}.pth", map_location=device)

    model.load_state_dict(state["state_dict"])
    model.to(device) # keep the model on the device used throughout the notebook

    allpreds = inference(test_dataloader,model)

    kall_preds.append(allpreds)
    
    

In [None]:
bestscores

In [None]:
np.mean(bestscores)

In [None]:
len(kall_preds)

In [None]:
# Average the raw outputs of the fold models element-wise, then take the arg-max class per test row.
kall_preds = np.mean(kall_preds,axis=0)
kall_preds = [s.argmax() for s in kall_preds]

In [None]:
submission["Survived"] = kall_preds

In [None]:
submission.to_csv("submission2.csv",index=False)

In [None]:
submission