In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#Undersampling
def create_undersampling_df(primary_df,ans_df):
    true_len=ans_df.sum()
    split_num=(len(ans_df)//true_len)-1
    out_df=[]
    out_ans=[]
    last_idx=0
    is_use=[0]
    for i in range(split_num):
        is_use=copy.deepcopy(ans_df)
        now_len=0
        for j in range(last_idx,len(ans_df)):
            if now_len==true_len:
                last_idx=j
                break
            if ans_df[j]==0:
                is_use[j]=1
                now_len+=1
        out_df.append(primary_df.loc[is_use==1,:])
        out_ans.append(ans_df.loc[is_use==1])
    return {"df":out_df,"ans":out_ans}

def reshape_df(df):
    #idは意味を持たないため削除
    df.drop('id',axis=1,inplace=True)

    #print(train_df["StandardHours"].value_counts())
    #80    1200
    #Name: StandardHours, dtype: int64
    df.drop('StandardHours',axis=1,inplace=True)

    #print(train_df["Over18"].value_counts())
    #Y    1200
    #Name: Over18, dtype: int64
    df.drop('Over18',axis=1,inplace=True)

    #https://deepage.net/features/pandas-replace.html
    df=df.replace({
        #BusinessTravel
        'Non-Travel':0,
        'Travel_Rarely':1,
        'Travel_Frequently':2,
        #Gender
        'Male':0,
        'Female':1,
        #OverTime
        'No':0,
        'Yes':1,
    })

    #https://note.nkmk.me/python-pandas-get-dummies/
    df=pd.get_dummies(df, columns=["Department", "EducationField","JobRole","MaritalStatus"],sparse=True)
    return df

import scipy.stats
import torch
from torch.utils.data import DataLoader, TensorDataset

def preprocess(x_df,y_df):
    #https://note.nkmk.me/python-list-ndarray-dataframe-normalize-standardize/
    x=scipy.stats.zscore(x_df.values,ddof=1)
    x=torch.from_numpy(x).float()
    y=torch.from_numpy(y_df.values).long()
    return (x,y)

from sklearn.model_selection import train_test_split
def do_all_preprocess(primary_df,ans_df):
    #
    primary_df=reshape_df(primary_df)
    width=len(primary_df.iloc[0,:])
    height=len(primary_df.iloc[:,0])
    #
    train_x,test_x,train_y,test_y = train_test_split(primary_df,ans_df,test_size = 0.1)
    #
    train_x,train_y=preprocess(primary_df,ans_df)
    test_x,test_y=preprocess(test_x,test_y)
    #
    train_dataset=TensorDataset(train_x,train_y)
    train_loader=DataLoader(train_dataset, shuffle=True)
    test_dataset=TensorDataset(test_x,test_y)
    test_loader=DataLoader(test_dataset, shuffle=True)
    return train_loader,test_loader#,width,height

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#https://techtech-sorae.com/pytorch%E3%82%92%E7%94%A8%E3%81%84%E3%81%A6%E3%83%87%E3%82%A3%E3%83%BC%E3%83%97%E3%83%A9%E3%83%BC%E3%83%8B%E3%83%B3%E3%82%B0%E3%81%AB%E3%82%88%E3%82%8B%E3%83%AF%E3%82%A4%E3%83%B3%E5%88%86%E9%A1%9E/
# ネットワークのモジュール化
class Net(nn.Module):
    def __init__(self, input):
        super(Net, self).__init__()
        
        # ネットワークを定義
        self.fc1 = nn.Linear(input, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 128)
        self.fc4 = nn.Linear(128, 128)
        self.fc5 = nn.Linear(128, 128)
        self.fc6 = nn.Linear(128, 2)
        self.relu = nn.ReLU()
        self.softmax=nn.Softmax()

    # 順伝搬を定義
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu( self.fc2(x))
        x = self.relu( self.fc3(x))
        x = self.relu( self.fc4(x))
        x = self.relu( self.fc5(x))
        x = self.fc6(x)
        x=self.softmax(x)
        return x

from numpy.ma.core import count
from datetime import datetime
from pytz import timezone
import matplotlib.pyplot as plt

def train_process(Net,train_loader,test_loader):
    model = Net(47)
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    criterion=nn.NLLLoss()

    epoch_num=250
    loss_list=[]*epoch_num

    model.train()
    for i in range(epoch_num):
        total_loss = 0
        for x,y in train_loader:
            optimizer.zero_grad()
            pred_y = model(x)
            loss = criterion(pred_y, y)
            loss.backward()
            optimizer.step()
            total_loss+=loss.item()
        if i%50==0:
            print("epoch "+str(i)+" "+datetime.now(timezone('Asia/Tokyo')).strftime('%m/%d %H:%M:%S')+" total_loss:"+str(total_loss))
            loss_list.append(total_loss)
    plt.plot(loss_list)
    plt.show()
    #
    model.eval()
    pred_y_list=[]
    y_list=[]
    with torch.no_grad(): 
        for x,y in test_loader:
            pred_y = model(x)
            y_list.append(y.item())
            pred_y_list.append(torch.argmax(pred_y).item())
    
    return model,y_list,pred_y_list

In [4]:
import pandas as pd
import copy

primary_df=pd.read_csv('/content/drive/MyDrive/signate/従業員の離職予測/train.csv')
ans_df=primary_df.loc[:,'Attrition']
primary_df.drop('Attrition',axis=1,inplace=True)
output=create_undersampling_df(primary_df,ans_df)
dfs=output["df"]
ans_dfs=output["ans"]

In [5]:
train_loaders=[]
test_loaders=[]
for i in range(len(dfs)):
    train_loader,test_loader=do_all_preprocess(dfs[i],ans_dfs[i])
    train_loaders.append(train_loader)
    test_loaders.append(test_loader)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
models=[]
true_ys=[]
pred_ys=[]
for i in range(len(train_loaders)):
    model,true_y,pred_y=train_process(Net,train_loaders[i],test_loaders[i])
    models.append(model)
    true_ys.append(true_y)
    pred_ys.append(pred_y)

In [8]:
from sklearn.metrics import confusion_matrix
for i in range(len(true_ys)):
    print(confusion_matrix(true_ys[i],pred_ys[i], labels=[1, 0]))

[[ 0 19]
 [ 0 24]]
[[ 0 21]
 [ 0 22]]
[[19  4]
 [ 0 20]]
[[ 0 22]
 [ 0 21]]


In [20]:
test_df=pd.read_csv('/content/drive/MyDrive/signate/従業員の離職予測/test.csv')
test_df=reshape_df(test_df)
width=len(test_df.iloc[0,:])
height=len(test_df.iloc[:,0])
#
x=scipy.stats.zscore(test_df.values,ddof=1)
x=torch.from_numpy(x).float()
#
model.eval()
test_pred_ys=[]
pred_y=models[2](x)

import numpy as np
from datetime import datetime
from pytz import timezone
test_df=pd.read_csv('/content/drive/MyDrive/signate/従業員の離職予測/test.csv')
data=np.array([test_df["id"].values,torch.argmax(pred_y,dim=1).tolist()],dtype='int16')
data=pd.DataFrame(data.T)
data.to_csv('/content/drive/MyDrive/signate/従業員の離職予測/out'+datetime.now(timezone('Asia/Tokyo')).strftime('%m%d_%H%M')+'.csv',index=False,header=False)



In [None]:
test_df=pd.read_csv('/content/drive/MyDrive/signate/従業員の離職予測/test.csv')
test_df=reshape_df(test_df)
width=len(test_df.iloc[0,:])
height=len(test_df.iloc[:,0])
#
x=scipy.stats.zscore(test_df.values,ddof=1)
x=torch.from_numpy(x).float()
#
model.eval()
test_pred_ys=[]
for i in range(len(models)):
    pred_y=models[i](x)
    test_pred_ys.append(pred_y)

total_pred_y=0
for i in range(len(models)):
    total_pred_y+=test_pred_ys[i]
total_ans=torch.argmax(total_pred_y,dim=1)
print(total_ans)

In [30]:
adjust_pred_y=[]
for p0,p1 in (np.array(total_pred_y.tolist())*100/4).tolist():
    if p0>70:
        adjust_pred_y.append(0)
    else:
        adjust_pred_y.append(1)
import numpy as np
from datetime import datetime
from pytz import timezone
test_df=pd.read_csv('/content/drive/MyDrive/signate/従業員の離職予測/test.csv')
data=np.array([test_df["id"].values,adjust_pred_y],dtype='int16')
data=pd.DataFrame(data.T)
data.to_csv('/content/drive/MyDrive/signate/従業員の離職予測/out'+datetime.now(timezone('Asia/Tokyo')).strftime('%m%d_%H%M')+'.csv',index=False,header=False)

In [12]:
import numpy as np
from datetime import datetime
from pytz import timezone
test_df=pd.read_csv('/content/drive/MyDrive/signate/従業員の離職予測/test.csv')
data=np.array([test_df["id"].values,total_ans.tolist()],dtype='int16')
data=pd.DataFrame(data.T)
data.to_csv('/content/drive/MyDrive/signate/従業員の離職予測/out'+datetime.now(timezone('Asia/Tokyo')).strftime('%m%d_%H%M')+'.csv',index=False,header=False)