# ロジスティック回帰モデルの作成

In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
import pickle

## データの読み込み

In [2]:
# Load the preprocessed absenteeism data (path is relative to the working directory)
data_preprocessed = pd.read_csv("Absenteeism_preprocessed.csv")

In [3]:
# Preview the first rows to sanity-check the load
data_preprocessed.head()

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## ターゲットの作成

In [4]:
# Median absence duration; the next cell uses it as the cut-off for the binary target
data_preprocessed["Absenteeism Time in Hours"].median()

3.0

In [5]:
# Use the computed median (not a hard-coded value) so the cut-off adapts to the data:
# 1 = absence strictly above the median, 0 = otherwise.
targets = (
    data_preprocessed["Absenteeism Time in Hours"]
    .gt(data_preprocessed["Absenteeism Time in Hours"].median())
    .astype(int)
    .to_numpy()
)

# Store the label as the last column of the frame
data_preprocessed["Excessive Absenteeism"] = targets

In [6]:
# Inspect the frame after the "Excessive Absenteeism" column has been added
data_preprocessed

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0,8,1
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2,3,0
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0,8,1
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0,2,0


## データ割合の確認・作成

In [7]:
# Share of rows labelled 1 (the targets are 0/1, so the mean is the positive-class ratio)
targets.mean()

0.45571428571428574

In [8]:
# Remove the raw absence hours (the target was derived from it) and two input
# columns that are left out of the model.
data_with_targets = data_preprocessed.drop(
    columns=["Absenteeism Time in Hours", "Daily Work Load Average", "Distance to Work"]
)

# Every column except the last one ("Excessive Absenteeism", appended when the
# targets were created) is a model input.
unscaled_inputs = data_with_targets.iloc[:, :-1]

## データの標準化・分割

In [9]:
# Standardization (baseline): scales every input column, dummy variables included.
# NOTE: `absenteeism_scaler` and `scaled_inputs` are both reassigned further down
# by the CustomScaler cell, so this result is not the one used for training.
absenteeism_scaler = StandardScaler()
scaled_inputs = absenteeism_scaler.fit(unscaled_inputs).transform(unscaled_inputs)

In [10]:
# Keep the categorical (dummy) variables out of the standardization
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of a DataFrame's columns.

    The listed columns are scaled with a wrapped ``StandardScaler``; all other
    columns are passed through untouched. The output keeps the input's column
    order and row index.

    Parameters
    ----------
    columns : list of column labels
        Columns of the input DataFrame to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded unchanged to ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler, fitted on ``columns`` only.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (ddof=0) of the fitted columns;
        ``None`` until ``fit`` has been called.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Every constructor argument is kept as a same-named attribute so that
        # BaseEstimator.get_params() (and therefore repr() and clone()) can
        # read it back instead of raising AttributeError.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit column-wise reductions: np.mean/np.var on a DataFrame forward
        # axis=None, which newer pandas treats as (or warns it will become) a
        # reduction over the whole frame instead of one value per column.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and ignored.
        """
        init_col_order = X.columns
        # Reuse X's index: the scaler returns a bare array, and a default
        # RangeIndex would make the column-wise concat misalign rows (producing
        # NaNs) whenever X does not carry a 0..n-1 index.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]


In [11]:
# List the input column names to decide which ones should be scaled
unscaled_inputs.columns.values

array(['Reason1', 'Reason2', 'Reason3', 'Reason4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Age',
       'Body Mass Index', 'Education', 'Children', 'Pets'], dtype=object)

In [12]:
# Choose the columns to standardize by exclusion: the Reason dummies are
# categorical and stay unscaled. The last two names are not present in
# unscaled_inputs (they were dropped earlier), so listing them has no effect.
columns_to_omit = ['Reason1', 'Reason2', 'Reason3', 'Reason4', "Daily Work Load Average", "Distance to Work"]
columns_to_scale = [col for col in unscaled_inputs.columns if col not in columns_to_omit]

# Fit on the selected columns only, then scale; this replaces the baseline scaler above
absenteeism_scaler = CustomScaler(columns_to_scale).fit(unscaled_inputs)
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

  return var(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


In [13]:
# Inspect the inputs after scaling (the Reason dummies should still be 0/1)
scaled_inputs

Unnamed: 0,Reason1,Reason2,Reason3,Reason4,Month Value,Day of the Week,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,-0.683704,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487
1,0,0,0,0,0.182726,-0.683704,-1.574681,2.130803,1.002633,-0.447980,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.007725,-0.654143,0.248310,1.002633,-0.447980,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.668253,0.854936,0.405184,-0.643782,-0.447980,0.880469,-0.589690
4,0,0,0,1,0.182726,0.668253,1.005844,-0.536062,0.767431,-0.447980,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.007725,-0.654143,0.562059,-1.114186,2.232242,0.880469,-0.589690
696,1,0,0,0,-0.388293,-0.007725,0.040034,-1.320435,-0.643782,-0.447980,-0.019280,1.126663
697,1,0,0,0,-0.388293,0.668253,1.624567,-1.320435,-0.408580,2.232242,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.668253,0.190942,-0.692937,-0.408580,2.232242,-0.919030,-0.589690


In [14]:
# Split: 80% of the rows for training, the rest for testing.
# The fixed random_state keeps the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)

## モデル訓練

In [15]:
# Logistic regression with default hyperparameters, fitted on the training split
reg = LogisticRegression()
reg.fit(x_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [16]:
# Check the training accuracy, the intercept and the weights
print(reg.score(x_train, y_train), reg.intercept_, reg.coef_)

0.775 [-1.68909209] [[ 2.80062254  0.93512933  3.09812821  0.85396822  0.16636314 -0.08504146
   0.61123775 -0.16417463  0.26880304 -0.08070247  0.36072307 -0.2863671 ]]


In [17]:
# Input column names, used below to label the coefficients
unscaled_inputs.columns.values

array(['Reason1', 'Reason2', 'Reason3', 'Reason4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Age',
       'Body Mass Index', 'Education', 'Children', 'Pets'], dtype=object)

In [18]:
# Build the coefficient summary table
feature_name = unscaled_inputs.columns.values

# One row per input feature together with its fitted weight
summary_table = pd.DataFrame(columns=["Feature name"], data=feature_name)
summary_table["Coefficient"] = np.transpose(reg.coef_)

# Shift the feature rows down by one so that index 0 is free for the intercept
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ["Intercept", reg.intercept_[0]]

# Sort by index so the intercept row comes first. The result is bound back to
# `summary_table` itself, so later cells work with the sorted frame and no
# second, misspelled variable is left in the namespace.
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.689092
1,Reason1,2.800623
2,Reason2,0.935129
3,Reason3,3.098128
4,Reason4,0.853968
5,Month Value,0.166363
6,Day of the Week,-0.085041
7,Transportation Expense,0.611238
8,Age,-0.164175
9,Body Mass Index,0.268803


## オッズ比

In [19]:
# exp(coefficient) is the multiplicative change in the odds for a one-unit
# increase of the corresponding (scaled) input.
# NOTE(review): the label "Odds_ration" looks like a typo for "Odds_ratio";
# kept unchanged here so the displayed column name stays the same.
summary_table["Odds_ration"] = np.exp(summary_table["Coefficient"])
summary_table.sort_values(by="Odds_ration", ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ration
3,Reason3,3.098128,22.15644
1,Reason1,2.800623,16.454887
2,Reason2,0.935129,2.547543
4,Reason4,0.853968,2.34895
7,Transportation Expense,0.611238,1.842711
11,Children,0.360723,1.434366
9,Body Mass Index,0.268803,1.308397
5,Month Value,0.166363,1.181002
10,Education,-0.080702,0.922468
6,Day of the Week,-0.085041,0.918474


## モデルのテスト

In [23]:
# Accuracy on the held-out test split
reg.score(x_test, y_test)

0.7428571428571429

In [25]:
# Per-class probabilities for the test rows; only column 1 is displayed
# (presumably P(excessive absenteeism), since the targets are 0/1 — confirm with reg.classes_)
predicted_proba = reg.predict_proba(x_test)
predicted_proba[:, 1]

array([0.26373731, 0.39160367, 0.58860623, 0.19676771, 0.92764658,
       0.68351633, 0.68714466, 0.86660494, 0.20233182, 0.24943004,
       0.51894341, 0.80186969, 0.92245313, 0.28939733, 0.69397129,
       0.42448726, 0.45966739, 0.42882618, 0.61862957, 0.95159492,
       0.30167924, 0.20601201, 0.60429258, 0.57649737, 0.73397239,
       0.2433699 , 0.48602119, 0.1311796 , 0.80058986, 0.21557405,
       0.37388233, 0.68672223, 0.69157426, 0.54259813, 0.20601201,
       0.50736116, 0.21010621, 0.74391665, 0.43587519, 0.59254326,
       0.22720279, 0.43378738, 0.2163992 , 0.39603995, 0.81547127,
       0.57038357, 0.69134841, 0.2750505 , 0.20235228, 0.18218452,
       0.59170996, 0.34222359, 0.67095173, 0.28613944, 0.85018117,
       0.47112344, 0.89016876, 0.25488056, 0.31794827, 0.31578629,
       0.71918305, 0.66038142, 0.31050759, 0.78748791, 0.19823823,
       0.26763444, 0.08235605, 0.22982263, 0.72858899, 0.32997026,
       0.21010621, 0.2943376 , 0.90845154, 0.43773179, 0.61937

## モデルの保存

In [29]:
# Persist the fitted classifier to the file "model" in the working directory
with open("model", "wb") as file:
    pickle.dump(reg, file)

# Persist the fitted scaler as well so new data can be transformed the same way.
# NOTE: pickle stores classes by reference, so the CustomScaler class definition
# must be available in whatever code later unpickles this file.
with open("scaler", "wb") as file:
    pickle.dump(absenteeism_scaler, file)