# 機能要件

* 済　2値の分類（Classification）タスクを扱える
* 済 カテゴリカル変数を指定するとone-hotエンコードを実行する
* 済 モデル用データマートに施したのと同一データ前処理をスコア用データマートに対しても適用される
* 済 モデル選択の評価指標を選択できる
* 済 複数アルゴリズムから指定の評価指標に従いベストモデルを選択できる
* 済　学習済みモデルを保存できる
* 済　アルゴリズムランキングと性能評価指標が出力される
* 済 学習済みモデル（保存したモデル）を呼び出しスコア用データに対し予測確率を付与できる


# 開発課題認定プロセス（最終日にその場で実行・提出）
* 訓練用データと（正解データの無い）検証用データを配布します
* データ形式は第1カラムがID、第2カラムがクラス変数、第3カラム以降が特徴ベクトルの構成です
  * 検証用データの第2カラムは全て空白
* クラス変数は0/1のバイナリ値で予測確率を知りたいクラスは”1”とする
* ID（第1カラム）と予測確率（第2カラム）の２カラム構成の結果ファイルをCSV形式で提出（ヘッダーあり）してもらいます


## 関数
* モデルのコンストラクタ
 * 引数：カテゴリカル変数指定, 評価指標
 * 戻り値：自動モデル器
* fit
 * 引数：X, y (訓練データ)
 * 戻り値：~~学習済みモデル~~ なし
* predict
 * 引数:X
 * 戻り値:予測結果


In [19]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC # SVCは非線形SVM（Support Vector Classifier）
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# ファイル入出力ライブラリ
import pickle

In [27]:
class AutoML():
    def __init__(self, categorical_columns, eval_kind):
        self.categorical_columns = categorical_columns
        self.eval_kind = eval_kind

        # 2値分類ができるモデルをパイプラインで作成
        self.pipelines = {
            'knn':
                Pipeline([('scl',StandardScaler()), ('est',KNeighborsClassifier())]),
            'logistic':
                Pipeline([('scl',StandardScaler()), ('est',LogisticRegression(random_state=1))]),
            'rsvc':
                Pipeline([('scl',StandardScaler()), ('est',SVC(C=1.0,kernel='rbf',class_weight='balanced',random_state=1, probability=True))]),
#             'lsvc':
#                 Pipeline([('scl',StandardScaler()), ('est',LinearSVC(C=1.0,class_weight='balanced',random_state=1))]),
            'rf':
                Pipeline([('scl',StandardScaler()), ('est',RandomForestClassifier(random_state=1))]),
            'gb':
                Pipeline([('scl',StandardScaler()), ('est',GradientBoostingClassifier(random_state=1))]),
            'mlp':
                Pipeline([('scl',StandardScaler()), ('est',MLPClassifier(hidden_layer_sizes=(5,3), max_iter=500, random_state=1))])
        }
        
    def read_data_file(self, file_path):
        # objectで読み込むのがポイント！ 'Dependents_2'が'Dependents_2.0'となることを避けられる
        dtype = {column: object for column in self.categorical_columns}
        df = pd.read_csv(file_path, header=0, dtype=dtype)

        X  = df.iloc[:,2:]            # 3列目以降を特徴量
        ID = df.iloc[:,[0]]             # 第0列はPK（Loan_ID）なのでIDとしてセット
        y  = df.iloc[:,1]              # 2列目をクラス変数
        
        self.ID_name = ID.columns[0]
        self.y_name = y.name
        
        return X, y, ID
    
    def fit(self, X, y):
        # データ前処理
        X_pre = self.__preprocess_X(X)
        y_pre = self.__preprocess_y(y)
        
        # one-hotエンコーディングヘッダをテストデータにも適用するため保持
        self.X_columns = X_pre.columns.values
#         
#         # デバッグ文
#         print('---X_columns start------------')
#         display(self.X_columns)
#         print('---X_columns end------------')
        
        # 全てのモデルでfitを行う（cross-validationで検証する）
        # fit & evaluation
        scores = {}
        for pipe_name, pipeline in self.pipelines.items():
            pipeline.fit(X_pre, y_pre)
            cv_results = cross_val_score(pipeline,
                             X_pre,
                             y_pre.values,
                             cv=5,
                             scoring=self.eval_kind)

            # cross-validationの平均値-標準偏差をスコアとする
            scores[pipe_name] = cv_results.mean() - cv_results.std()
            sorted_scores = self.__sort_dictionary(scores)

        # アルゴリズムランキングと性能指標評価を出力
        display(pd.Series(sorted_scores))

        # eval_kindを用いて評価する
        # 一番評価の良かったモデルを取っておく
        best_algorithm = [*sorted_scores][0]
#         print('best:{}'.format(best_algorithm))
        self.best_model = self.pipelines[best_algorithm]
    
    # X前処理
    def __preprocess_X(self, X):
        # Xのone-hotエンコーディング
        X_ohe = pd.get_dummies(X, dummy_na=True, columns=self.categorical_columns)

        # 欠損値を平均で置き換える
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(X_ohe)
        X_ohe_columns = X_ohe.columns.values
        X_ohe = pd.DataFrame(imp.transform(X_ohe), columns=X_ohe_columns)
        
        # デバッグ文
        print('preprocess result-----')
        display(X_ohe)
        print('----------------------')
        
        return X_ohe

    # y前処理
    def __preprocess_y(self, y):
        # yは欠損値を最頻値で置き換える
        y_pre = y.fillna(y.mode()[0])
#         # デバッグ文
#         print('preprocess result-----')
#         display(y_pre)
#         print('----------------------')
        return y_pre
    
    def __sort_dictionary(self, dict):
        sorted_dict = {}
        for k, v in sorted(dict.items(), key=lambda x: -x[1]):
            sorted_dict[k] = v
        
        return sorted_dict
    
    def __get_score(self, y, y_predict):
        if self.eval_kind == 'accuracy':
            return accuracy_score(y, y_predict)
        if self.eval_kind == 'precision':
            return precision_score(y, y_predict)
        if self.eval_kind == 'recall':
            return recall_score(y, y_predict)
        if self.eval_kind == 'f1':
            return f1_score(y, y_predict)

    def __preprocess_test(self, X_test):
        # データ前処理
        X_test_pre = self.__preprocess_X(X_test)

        # 訓練データのヘッダから空データフレーム作成
        df_cols_m = pd.DataFrame(None,
                         columns=self.X_columns,
                         dtype=float)
#         # デバッグ文
#         print('---df_cols_m start--------------------')
#         display(df_cols_m)
#         print('---df_cols_m end----------------------')
        
        # テストデータの列を訓練データに合わせる
        X_test_concat = pd.concat([df_cols_m, X_test_pre])

#         # デバッグ文
#         display(X_test_concat)
        
        # 訓練データにない列を削除
        X_test_drop = X_test_concat.drop(list(set(X_test_concat.columns.values)-set(self.X_columns)),axis=1)
        

        # テストデータに登場しなかったデータ項目をゼロ埋め
        X_test_drop.loc[:,list(set(self.X_columns)-set(X_test_pre.columns.values))] = \
            X_test_drop.loc[:,list(set(self.X_columns)-set(X_test_pre.columns.values))].fillna(0, axis=1)

#         # デバッグ文
#         print('---X_test_drop start--------------------')
#         display(X_test_drop)
#         print('---X_test_drop end----------------------')

        # 訓練データと合わせて並び替え
        X_test_drop_reindex = X_test_drop.reindex(self.X_columns, axis=1)

        # デバッグ文
        print('---X_test_drop_reindex start--------------------')
        display(X_test_drop_reindex)
        print('---X_test_drop_reindex end----------------------')

        return X_test_drop_reindex
        
    
    def predict(self, X_test):
        # データ前処理
        X_test_pre_complete = self.__preprocess_test(X_test)
        
        return self.best_model.predict(X_test_pre_complete)
                
    def predict_proba(self, X_test):
        # データ前処理
        X_test_pre_complete = self.__preprocess_test(X_test)

        return self.best_model.predict_proba(X_test_pre_complete)

    def predict_proba_with_id(self, id, X_test):
        proba = self.predict_proba(X_test)
        
        # 2列目が'1'の予測確率
        proba_1 = proba[:, 1]
        
        # IDと予測確率の結合
        ID_array = ID_test.values
        ID_array_1dim = ID_array[:,0]
        result = np.vstack((ID_array_1dim, proba_1))
        result_df = pd.DataFrame(result).T
        
        # ヘッダをつける
        result_df.columns = [self.ID_name, self.y_name]
        
        return result_df

    # file_nameにAutoMLごとベストモデルを保存する
    def save(self, file_name):
        with open(file_name, mode='wb') as f:
            pickle.dump(self, f)

    # file_nameから学習済みモデルを保持したAutoMLを呼び出す
    def load(self, file_name):
        with open(file_name, mode='rb') as f:
            return pickle.load(f)

# Auto MLの生成

In [28]:
# カテゴリカル変数
categorical_columns = ['sales',
                       'salary']

In [29]:
auto_ml = AutoML(categorical_columns, eval_kind='accuracy')
# auto_ml = AutoML(categorical_columns, eval_kind='precision')
# auto_ml = AutoML(categorical_columns, eval_kind='recall')
# auto_ml = AutoML(categorical_columns, eval_kind='f1')

# データ読み込み

In [30]:
# 本番データ
X, y, ID = auto_ml.read_data_file('data/final_hr_analysis_train.csv')

# 欠損値
# X, y, ID = auto_ml.read_data_file('data/final_hr_analysis_train_lack.csv')

# クラス変数欠損
# X, y, ID = auto_ml.read_data_file('data/final_hr_analysis_train_class_lack.csv')

# カテゴリデータがTestと異なる
# X, y, ID = auto_ml.read_data_file('data/final_hr_analysis_train_category.csv')

In [31]:
display(X.head(5))
display(y.head(5))
display(ID.head(5))

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales,salary
0,50,10,4.0,180.0,10.0,4.0,3,0.0,600
1,80,8,6.0,160.0,9.0,5.0,4,0.0,900
2,67,5,7.0,150.0,,1.0,2,,500
3,75,9,5.0,200.0,20.0,3.0,3,0.0,600
4,80,5,2.0,210.0,8.0,,1,1.0,400


0    0
1    0
2    0
3    0
4    0
Name: left, dtype: int64

Unnamed: 0,index
0,A01
1,A02
2,A03
3,A04
4,A05


# 試験時は、0/1で表されるので、この変換は不要

In [32]:
# class_mapping = {'N':1, 'Y':0}
# y = y.map(class_mapping)
# display(y.head(3))

# Model生成（fit）

In [33]:
best_model = auto_ml.fit(X, y)

preprocess result-----




Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales_0,sales_1,sales_2,sales_nan,salary_400,salary_500,salary_600,salary_700,salary_800,salary_900,salary_nan
0,50.0,10.0,4.0,180.0,10.0,4.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,80.0,8.0,6.0,160.0,9.0,5.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,67.0,5.0,7.0,150.0,7.052632,1.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,75.0,9.0,5.0,200.0,20.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,80.0,5.0,2.0,210.0,8.0,4.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,60.0,6.0,4.0,150.0,6.0,3.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,95.0,5.0,5.0,180.0,7.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,100.0,7.0,6.0,160.0,9.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8,88.0,8.0,5.263158,150.0,6.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,56.0,8.0,6.0,180.0,7.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0




----------------------




gb          0.724062
logistic    0.677000
rf          0.677000
rsvc        0.610788
mlp         0.547846
knn         0.494754
dtype: float64

# テストデータ読み込み

In [34]:
# 本番データ
X_test, y_test, ID_test = auto_ml.read_data_file('data/final_hr_analysis_test.csv')
# 欠損値
# X_test, y_test, ID_test = auto_ml.read_data_file('data/final_hr_analysis_test_lack.csv')
# カテゴリデータが訓練データと異なる
# X_test, y_test, ID_test = auto_ml.read_data_file('data/final_hr_analysis_test_category.csv')

In [35]:
display(X_test.head(3))
display(y_test.head(3))
display(ID_test.head(3))

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales,salary
0,48,10,4,191,10,4,3,0,600
1,88,8,6,158,9,5,4,0,500
2,98,5,7,143,12,1,2,1,400


0   NaN
1   NaN
2   NaN
Name: left, dtype: float64

Unnamed: 0,index
0,B01
1,B02
2,B03


# 予測（predict）

In [36]:
auto_ml.predict(X_test)
# auto_ml.predict(X)

preprocess result-----




Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales_-1,sales_0,sales_1,sales_nan,salary_200,salary_300,salary_400,salary_500,salary_590,salary_600,salary_700,salary_nan
0,48.0,10.0,4.0,191.0,10.0,4.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,88.0,8.0,6.0,158.0,9.0,5.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,98.0,5.0,7.0,143.0,12.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,59.0,9.0,5.0,202.0,20.0,3.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,87.0,5.0,2.0,208.0,8.0,4.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,93.0,6.0,4.0,160.0,6.0,3.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6,90.0,5.0,5.0,173.0,7.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,96.0,7.0,6.0,145.0,9.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,84.0,8.0,5.0,150.0,6.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,53.0,8.0,6.0,180.0,7.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


----------------------
---X_test_drop_reindex start--------------------


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales_0,sales_1,sales_2,sales_nan,salary_400,salary_500,salary_600,salary_700,salary_800,salary_900,salary_nan
0,48.0,10.0,4.0,191.0,10.0,4.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,88.0,8.0,6.0,158.0,9.0,5.0,4.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,98.0,5.0,7.0,143.0,12.0,1.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,59.0,9.0,5.0,202.0,20.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,87.0,5.0,2.0,208.0,8.0,4.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,93.0,6.0,4.0,160.0,6.0,3.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,90.0,5.0,5.0,173.0,7.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,96.0,7.0,6.0,145.0,9.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8,84.0,8.0,5.0,150.0,6.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,53.0,8.0,6.0,180.0,7.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


---X_test_drop_reindex end----------------------


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
      dtype=int64)

# 予測確率（predict_proba）

In [37]:
proba = auto_ml.predict_proba_with_id(X_test=X_test, id=ID_test)
# proba_train = auto_ml.predict_proba_with_id(X_test=X, id=ID)

preprocess result-----




Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales_-1,sales_0,sales_1,sales_nan,salary_200,salary_300,salary_400,salary_500,salary_590,salary_600,salary_700,salary_nan
0,48.0,10.0,4.0,191.0,10.0,4.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,88.0,8.0,6.0,158.0,9.0,5.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,98.0,5.0,7.0,143.0,12.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,59.0,9.0,5.0,202.0,20.0,3.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,87.0,5.0,2.0,208.0,8.0,4.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,93.0,6.0,4.0,160.0,6.0,3.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6,90.0,5.0,5.0,173.0,7.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,96.0,7.0,6.0,145.0,9.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,84.0,8.0,5.0,150.0,6.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,53.0,8.0,6.0,180.0,7.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


----------------------
---X_test_drop_reindex start--------------------


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales_0,sales_1,sales_2,sales_nan,salary_400,salary_500,salary_600,salary_700,salary_800,salary_900,salary_nan
0,48.0,10.0,4.0,191.0,10.0,4.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,88.0,8.0,6.0,158.0,9.0,5.0,4.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,98.0,5.0,7.0,143.0,12.0,1.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,59.0,9.0,5.0,202.0,20.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,87.0,5.0,2.0,208.0,8.0,4.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,93.0,6.0,4.0,160.0,6.0,3.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,90.0,5.0,5.0,173.0,7.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,96.0,7.0,6.0,145.0,9.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8,84.0,8.0,5.0,150.0,6.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,53.0,8.0,6.0,180.0,7.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


---X_test_drop_reindex end----------------------


In [38]:
proba
# proba_train.head(3)

Unnamed: 0,index,left
0,B01,0.000227842
1,B02,0.000227842
2,B03,0.000227842
3,B04,0.000227842
4,B05,0.000227842
5,B06,0.000245569
6,B07,0.000227842
7,B08,0.000227842
8,B09,0.000245569
9,B10,0.000227842


In [39]:
proba.to_csv('aijc1303.csv', index=False)

In [40]:
auto_ml.save('best_model')

# 学習済みモデル（保存したモデル）を呼び出し

In [41]:
load_model = auto_ml.load('best_model')

# スコア用データに対し予測確率を付与できる

In [42]:
load_model.predict_proba_with_id(id=ID_test, X_test=X_test)

preprocess result-----




Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales_-1,sales_0,sales_1,sales_nan,salary_200,salary_300,salary_400,salary_500,salary_590,salary_600,salary_700,salary_nan
0,48.0,10.0,4.0,191.0,10.0,4.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,88.0,8.0,6.0,158.0,9.0,5.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,98.0,5.0,7.0,143.0,12.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,59.0,9.0,5.0,202.0,20.0,3.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,87.0,5.0,2.0,208.0,8.0,4.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,93.0,6.0,4.0,160.0,6.0,3.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6,90.0,5.0,5.0,173.0,7.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,96.0,7.0,6.0,145.0,9.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,84.0,8.0,5.0,150.0,6.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,53.0,8.0,6.0,180.0,7.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


----------------------
---X_test_drop_reindex start--------------------


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,sales_0,sales_1,sales_2,sales_nan,salary_400,salary_500,salary_600,salary_700,salary_800,salary_900,salary_nan
0,48.0,10.0,4.0,191.0,10.0,4.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,88.0,8.0,6.0,158.0,9.0,5.0,4.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,98.0,5.0,7.0,143.0,12.0,1.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,59.0,9.0,5.0,202.0,20.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,87.0,5.0,2.0,208.0,8.0,4.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,93.0,6.0,4.0,160.0,6.0,3.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,90.0,5.0,5.0,173.0,7.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,96.0,7.0,6.0,145.0,9.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8,84.0,8.0,5.0,150.0,6.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9,53.0,8.0,6.0,180.0,7.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


---X_test_drop_reindex end----------------------


Unnamed: 0,index,left
0,B01,0.000227842
1,B02,0.000227842
2,B03,0.000227842
3,B04,0.000227842
4,B05,0.000227842
5,B06,0.000245569
6,B07,0.000227842
7,B08,0.000227842
8,B09,0.000245569
9,B10,0.000227842
