In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# 读取数据

In [None]:
# Load the training set keyed by enrollee_id and order the rows by that key.
train_csv_path = '../input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
aug_data = pd.read_csv(train_csv_path, index_col='enrollee_id').sort_index()
aug_data

# 数据缺失情况

In [None]:
# Per-column count of missing values in the raw training frame.
aug_data.isna().sum()

# 填充缺失数据

In [None]:
# List the distinct raw values of every column that fill_null_data imputes,
# so the placeholder labels chosen there can be checked against real values.
columns_to_inspect = (
    'gender',
    'enrolled_university',
    'education_level',
    'major_discipline',
    'experience',
    'company_size',
    'company_type',
    'last_new_job',
)
for column_name in columns_to_inspect:
    print(f"{column_name}：", aug_data[column_name].unique(), '\n')

In [None]:
def fill_null_data(df):
    """Fill the gaps of the categorical columns of ``df`` in place.

    Missing ``gender`` entries become ``'Other'``; missing entries of the
    seven other listed columns become ``'Unknown'``. Columns that are not
    listed are left untouched. The frame is modified directly and nothing
    is returned.
    """
    placeholders = {
        'gender': 'Other',
        'enrolled_university': 'Unknown',
        'education_level': 'Unknown',
        'major_discipline': 'Unknown',
        'experience': 'Unknown',
        'company_size': 'Unknown',
        'company_type': 'Unknown',
        'last_new_job': 'Unknown',
    }
    for column, placeholder in placeholders.items():
        df[column] = df[column].fillna(placeholder)
    
# Impute the training frame; fill_null_data assigns the filled columns back
# onto the frame it receives and returns nothing.
fill_null_data(aug_data)

In [None]:
# Re-count missing values per column now that imputation has run.
aug_data.isna().sum()

# 处理数据

In [None]:
# Separate the target (cast to int) from the feature columns.
y = aug_data.target.astype('int')
X = aug_data.drop('target', axis=1)

# Hold out 10% of the rows for validation.
# random_state pins the shuffle so the split, and every score derived from it,
# is identical after a kernel restart.
# stratify=y keeps the target distribution the same in both parts
# (assumes target is a class label, e.g. 0/1 — confirm against the data).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

print('X_train: ', X_train.shape, 'y_train: ', y_train.shape, '\nX_val: ', X_val.shape,  'y_val: ', y_val.shape)

# 特征编码（CatBoost 目标编码）

In [None]:
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce

# CatBoost target encoder fitted on the training split only.
# NOTE(review): cols=X.columns target-encodes every feature column, including
# any that are already numeric — confirm this is intended.
catboost_encoder = ce.CatBoostEncoder(cols=X.columns)
catboost_encoder.fit(X_train, y_train)


def catboost_encode_x_data(x_data, y_data=None):
    """Encode ``x_data`` with the fitted notebook-level ``catboost_encoder``.

    Args:
        x_data: Feature frame holding the columns the encoder was fitted on.
        y_data: Targets aligned with ``x_data``. Pass them only for the rows
            the encoder was fitted on; the encoder then leaves each row's own
            target out of its encoding. Leave as ``None`` for validation and
            test data, which have no usable targets at transform time.

    Returns:
        The encoded frame, carrying the same index as ``x_data``.
    """
    encoder_x_data = catboost_encoder.transform(x_data, y=y_data)
    encoder_x_data.index = x_data.index
    return encoder_x_data


# Training rows must be transformed together with their targets: a plain
# transform(X_train) would encode each row with statistics that already
# contain its own label, leaking the target into the training features.
encoder_X_train = catboost_encode_x_data(X_train, y_train)
encoder_X_val = catboost_encode_x_data(X_val)

# 训练模型

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn import svm
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

def calc_score(model):
    """Print the mean 5-fold cross-validated MSE of ``model``.

    The score is computed on the notebook-level ``encoder_X_train`` and
    ``y_train``. Nothing is returned.
    """
    fold_neg_mse = cross_val_score(
        model,
        encoder_X_train,
        y_train,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    # The scorer yields negated MSE values; flip the sign before averaging.
    print("MSE score:", (-fold_neg_mse).mean())

# Score each candidate regressor with its default hyper-parameters, in a
# fixed order, printing a banner line before every score.
candidate_models = [
    ("SVR ->", svm.SVR),
    ("XGBRegressor ->", XGBRegressor),
    ("SGDRegressor ->", SGDRegressor),
    ("GradientBoostingRegressor ->", GradientBoostingRegressor),
    ("KNeighborsRegressor ->", KNeighborsRegressor),
    ("MLPRegressor ->", MLPRegressor),
]
for banner, make_model in candidate_models:
    print(banner)
    calc_score(make_model())

In [None]:
# Fit a gradient-boosting regressor on the encoded training split and report
# its MSE on the held-out validation split.
# random_state is fixed so the fitted trees, the validation MSE and the later
# test predictions are the same on every run.
model = GradientBoostingRegressor(random_state=42)
model.fit(encoder_X_train, y_train)
y_val_predict = model.predict(encoder_X_val)
error = mean_squared_error(y_val, y_val_predict)
print("MSE: ", error)

# 对测试集进行预测并生成提交文件

In [None]:
# Load the test set keyed by enrollee_id, then impute it in place with the
# same placeholder rules that were applied to the training frame.
test_csv_path = '../input/hr-analytics-job-change-of-data-scientists/aug_test.csv'
X_test = pd.read_csv(test_csv_path, index_col='enrollee_id')
fill_null_data(X_test)

X_test

In [None]:
# Encode the test features with the fitted encoder and predict with the
# fitted model.
encoder_X_test = catboost_encode_x_data(X_test)
y_test_predict = model.predict(encoder_X_test)

# One 'label' column of predictions, indexed by the test frame's index,
# written to submission.csv.
submit_data = pd.DataFrame(index=X_test.index)
submit_data['label'] = y_test_predict
submit_data.to_csv('submission.csv')

# Shell escape: show the first lines of the submission file just written.
!head submission.csv