In [1]:
import pandas as pd
import numpy as np
import os
import gc
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('DStrain.csv')
test = pd.read_csv('DStest.csv')

In [3]:
train['num_missing'] = train.isnull().sum(axis=1)
test['num_missing'] = test.isnull().sum(axis=1)

In [4]:
missing_columns = ['gender','enrolled_university','education_level','major_discipline','experience','company_size','company_type','last_new_job']
for col in missing_columns:
    train[col].fillna('NA',inplace=True)
    test[col].fillna('NA',inplace=True)

In [5]:
#FEATURE ENGINEERING - based on higher than mean and median of training hours within each category
def th_features(column,train=train,test=test):
    gp_median = train.groupby(column)['training_hours'].median().reset_index().rename(columns={'training_hours':column+'_th_median'})
    gp_mean = train.groupby(column)['training_hours'].mean().reset_index().rename(columns={'training_hours':column+'_th_mean'})
    train = train.merge(gp_median,on=column,how='left')
    train = train.merge(gp_mean,on=column,how='left')
    test = test.merge(gp_median,on=column,how='left')
    test = test.merge(gp_mean,on=column,how='left')
    train[column+'_higher_th_median_ind'] = train.training_hours > train[column+'_th_median']
    train[column+'_higher_th_mean_ind'] = train.training_hours > train[column+'_th_mean']
    test[column+'_higher_th_median_ind'] = test.training_hours > test[column+'_th_median']
    test[column+'_higher_th_mean_ind'] = test.training_hours > test[column+'_th_mean']
    train.drop([column+'_th_mean',column+'_th_median'],axis=1,inplace=True)
    test.drop([column+'_th_mean',column+'_th_median'],axis=1,inplace=True)
    return train,test

def count_features(column,train=train,test=test):
    gp_count = train.groupby(column)['city_development_index'].count().reset_index().rename(columns={'city_development_index':column+'_count'})
    train = train.merge(gp_count,on=column,how='left')
    test = test.merge(gp_count,on=column,how='left')
    return train,test

def cdi_features(column,train=train,test=test):
    gp_median = train.groupby(column)['city_development_index'].median().reset_index().rename(columns={'city_development_index':column+'_cdi_median'})
    gp_mean = train.groupby(column)['city_development_index'].mean().reset_index().rename(columns={'city_development_index':column+'_cdi_mean'})
    gp_sd = train.groupby(column)['city_development_index'].std().reset_index().rename(columns={'city_development_index':column+'_cdi_sd'})
    gp_skew = train.groupby(column)['city_development_index'].skew().reset_index().rename(columns={'city_development_index':column+'_cdi_skew'})
    gp_kurtosis = train.groupby(column)['city_development_index'].apply(lambda x:x.kurtosis()).reset_index().rename(columns={'city_development_index':column+'_cdi_kurtosis'})
    train = train.merge(gp_median,on=column,how='left')
    train = train.merge(gp_mean,on=column,how='left')
    train = train.merge(gp_sd,on=column,how='left')
    train = train.merge(gp_skew,on=column,how='left')
    train = train.merge(gp_kurtosis,on=column,how='left')
    test = test.merge(gp_median,on=column,how='left')
    test = test.merge(gp_mean,on=column,how='left')
    test = test.merge(gp_sd,on=column,how='left')
    test = test.merge(gp_skew,on=column,how='left')
    test = test.merge(gp_kurtosis,on=column,how='left')
    train[column+'_higher_cdi_median_ind'] = train.city_development_index > train[column+'_cdi_median']
    test[column+'_higher_cdi_median_ind'] = test.city_development_index > test[column+'_cdi_median']
    train[column+'_higher_cdi_mean_ind'] = train.city_development_index > train[column+'_cdi_mean']
    test[column+'_higher_cdi_mean_ind'] = test.city_development_index > test[column+'_cdi_mean']
    return train,test

def mean_features(column,train=train,test=test):
    gp_mean = train.groupby(column)['target'].mean().reset_index().rename(columns={'target':column+'_mean'})
    train = train.merge(gp_mean,on=column,how='left')
    test = test.merge(gp_mean,on=column,how='left')
    return train,test

def preferred_features(column,train=train,test=test):
    preferred = train.groupby('city')[column].apply(lambda x: x.mode()).reset_index().rename(columns={column:'preferred_'+column})
    preferred = preferred[preferred.level_1 == 0].drop(['level_1'],axis=1)
    train = train.merge(preferred,on='city',how='left')
    test = test.merge(preferred,on='city',how='left')
    return train,test

categorical_columns = ['city','gender','relevent_experience','enrolled_university','education_level','major_discipline','experience','company_size','company_type','last_new_job']
new_categorical_columns = []
for column in categorical_columns:
    print('Creating features from ' + column)
    train,test = th_features(column,train,test)
    train,test = count_features(column,train,test)
    train,test = mean_features(column,train,test)
    new_categorical_columns.append(column+'_higher_th_median_ind')
    new_categorical_columns.append(column+'_higher_th_mean_ind')
    if column != 'city':
        train,test = cdi_features(column,train,test)
        train,test = preferred_features(column,train,test)
        new_categorical_columns.append(column+'_higher_cdi_median_ind')
        new_categorical_columns.append(column+'_higher_cdi_mean_ind')
        new_categorical_columns.append('preferred_'+column)

Creating features from city
Creating features from gender
Creating features from relevent_experience
Creating features from enrolled_university
Creating features from education_level
Creating features from major_discipline
Creating features from experience
Creating features from company_size
Creating features from company_type
Creating features from last_new_job


In [6]:
train.fillna(0,inplace=True)
test.fillna(0,inplace=True)
categorical_columns = categorical_columns + new_categorical_columns
for col in categorical_columns:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')
    print('Transforming ' + col)
    encoder = LabelEncoder()
    encoder.fit(train[col].append(test[col]).astype(str))
    train[col] = encoder.transform(train[col].astype(str))
    test[col] = encoder.transform(test[col].astype(str))
predictor_columns = list(train.columns.values)
predictor_columns.remove('enrollee_id')
predictor_columns.remove('target')
target_columns = 'target'

Transforming city
Transforming gender
Transforming relevent_experience
Transforming enrolled_university
Transforming education_level
Transforming major_discipline
Transforming experience
Transforming company_size
Transforming company_type
Transforming last_new_job
Transforming city_higher_th_median_ind
Transforming city_higher_th_mean_ind
Transforming gender_higher_th_median_ind
Transforming gender_higher_th_mean_ind
Transforming gender_higher_cdi_median_ind
Transforming gender_higher_cdi_mean_ind
Transforming preferred_gender
Transforming relevent_experience_higher_th_median_ind
Transforming relevent_experience_higher_th_mean_ind
Transforming relevent_experience_higher_cdi_median_ind
Transforming relevent_experience_higher_cdi_mean_ind
Transforming preferred_relevent_experience
Transforming enrolled_university_higher_th_median_ind
Transforming enrolled_university_higher_th_mean_ind
Transforming enrolled_university_higher_cdi_median_ind
Transforming enrolled_university_higher_cdi_mean_

In [17]:
rounds = 5000
early_stop_rounds = 200
lgbm_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric' : 'auc',
    'num_leaves' : 31,
    'max_depth': 6,
    'learning_rate' : 0.01,
    'feature_fraction' : 0.25,
    'bagging_fraction' : 0.6,
    'bagging_freq' : 20,
    'verbosity' : 0,
    'num_threads' : 8,
    'min_data_in_leaf' : 15,
    'lambda_l1' : 1.2,
    'lambda_l2' : 0.9,
    'cat_smooth' : 100,
    'max_bin' : 25,
    'min_gain_to_split' : 0.005,
    'max_cat_to_onehot' : 200,
    'scale_pos_weight' : 6
}

num_folds = 5
kf = KFold(n_splits=num_folds,shuffle=True,random_state=37)
kf.get_n_splits(train)

5

In [18]:
import collections as cl
results_train = []
results_valid = []
feature_importance = dict()
for i, (train_index, test_index) in zip(range(1,num_folds+1),kf.split(train)):
    X_train = train.loc[train_index,predictor_columns].values
    X_test = train.loc[test_index,predictor_columns].values
    y_train = train.loc[train_index,target_columns].values
    y_test = train.loc[test_index,target_columns].values
    X_train = lgb.Dataset(X_train,y_train,feature_name=predictor_columns,categorical_feature = categorical_columns)
    X_test = lgb.Dataset(X_test,y_test,feature_name=predictor_columns,categorical_feature = categorical_columns)
    gc.collect()
    print('Starting training on fold:',i)
    model = lgb.train(lgbm_params,X_train,num_boost_round=rounds,valid_sets=[X_train,X_test],valid_names=['train','valid'],
                    early_stopping_rounds=early_stop_rounds,verbose_eval=0,)
    results_train.append(model.best_score['train']['auc'])
    results_valid.append(model.best_score['valid']['auc'])
    test['target_' + str(i)] = model.predict(test.loc[:,predictor_columns].values)
    total_gain = np.sum(model.feature_importance('gain'))
    if i == 1:
        for feature, importance in zip(model.feature_name(),model.feature_importance('gain')):
            feature_importance.update({feature : importance/total_gain})
    else:
        for feature, importance in zip(model.feature_name(),model.feature_importance('gain')):
            feature_importance[feature] = feature_importance[feature] + (importance/total_gain)
for feature,importance in zip(feature_importance.keys(),feature_importance.values()):
    feature_importance[feature] = 100*importance/num_folds
feature_importance = cl.OrderedDict(sorted(feature_importance.items(), key=lambda t: t[1], reverse=True))
print('Train:' + str(np.mean(results_train)) + '+-' + str(np.std(results_train)), 'Valid:' + str(np.mean(results_valid)) + '+-' + str(np.std(results_valid)))

Starting training on fold: 1




Starting training on fold: 2




Starting training on fold: 3




Starting training on fold: 4




Starting training on fold: 5




Train:0.7363513341943972+-0.012442435205885505 Valid:0.6781720831835331+-0.004280347829272982


In [10]:
test['target'] = 0
for i in range(1,num_folds+1):
    test['target_'+str(i)] = test['target_'+str(i)].rank(pct=True)
    test['target'] = test['target'] + test['target_'+str(i)]
test['target'] = test['target']/float(num_folds)
test[['enrollee_id','target']].to_csv('sub_sam_lgb.csv',index=False)

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
num_folds = 5
kf = KFold(n_splits=num_folds,shuffle=True,random_state=37)
kf.get_n_splits(train)
X = pd.get_dummies(columns=categorical_columns,data=train[predictor_columns],drop_first=True)
X_t = pd.get_dummies(columns=categorical_columns,data=test[predictor_columns],drop_first=True)
results_train = []
results_valid = []
for i, (train_index, test_index) in zip(range(1,num_folds+1),kf.split(train)):
    X_train = X.loc[train_index,:].values
    X_test = X.loc[test_index,:].values
    y_train = train.loc[train_index,target_columns].values
    y_test = train.loc[test_index,target_columns].values
    gc.collect()
    print('Starting training on fold:',i)
    model = RandomForestClassifier(n_estimators=2000, criterion='entropy', max_depth=8, min_samples_split=10, 
                                   min_samples_leaf=5,max_features=25,n_jobs=-1,random_state=37,bootstrap=True)
    model.fit(X=X_train,y=y_train)
    results_train.append(roc_auc_score(y_train,model.predict_proba(X_train)[:,1]))
    results_valid.append(roc_auc_score(y_test,model.predict_proba(X_test)[:,1]))
    test['target_rf_' + str(i)] = model.predict_proba(X_t.values)[:,1]
print('Train:' + str(np.mean(results_train)) + '+-' + str(np.std(results_train)), 'Valid:' + str(np.mean(results_valid)) + '+-' + str(np.std(results_valid)))

Starting training on fold: 1
Starting training on fold: 2
Starting training on fold: 3
Starting training on fold: 4
Starting training on fold: 5
Train:0.7831937275191583+-0.0007555438966608904 Valid:0.6755802231334923+-0.007591761707823492


In [19]:
test['target'] = 0
for i in range(1,num_folds+1):
    test['target_rf'+str(i)] = test['target_rf_'+str(i)].rank(pct=True)
    test['target'] = test['target'] + test['target_rf_'+str(i)]
test['target'] = test['target']/float(num_folds)
test[['enrollee_id','target']].to_csv('sub_sam_rf.csv',index=False)

In [21]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import roc_auc_score
num_folds = 5
kf = KFold(n_splits=num_folds,shuffle=True,random_state=37)
kf.get_n_splits(train)
X = pd.get_dummies(columns=categorical_columns,data=train[predictor_columns],drop_first=True)
X_t = pd.get_dummies(columns=categorical_columns,data=test[predictor_columns],drop_first=True)
results_train = []
results_valid = []
for i, (train_index, test_index) in zip(range(1,num_folds+1),kf.split(train)):
    X_train = X.loc[train_index,:].values
    X_test = X.loc[test_index,:].values
    y_train = train.loc[train_index,target_columns].values
    y_test = train.loc[test_index,target_columns].values
    gc.collect()
    print('Starting training on fold:',i)
    model = ExtraTreesClassifier(n_estimators=3000, criterion='entropy', max_depth=8, min_samples_split=50, 
                                 min_samples_leaf=15,max_features=300,min_impurity_decrease=0.0, 
                                 bootstrap=True, n_jobs=-1, random_state=37)
    model.fit(X=X_train,y=y_train)
    results_train.append(roc_auc_score(y_train,model.predict_proba(X_train)[:,1]))
    results_valid.append(roc_auc_score(y_test,model.predict_proba(X_test)[:,1]))
    test['target_et_' + str(i)] = model.predict_proba(X_t.values)[:,1]
print('Train:' + str(np.mean(results_train)) + '+-' + str(np.std(results_train)), 'Valid:' + str(np.mean(results_valid)) + '+-' + str(np.std(results_valid)))

Starting training on fold: 1
Starting training on fold: 2
Starting training on fold: 3
Starting training on fold: 4
Starting training on fold: 5
Train:0.7581600538234055+-0.0020969330747042402 Valid:0.6746285776006717+-0.00573772755124784


In [22]:
test['target'] = 0
for i in range(1,num_folds+1):
    test['target_et_'+str(i)] = test['target_et_'+str(i)].rank(pct=True)
    test['target'] = test['target'] + test['target_et_'+str(i)]
test['target'] = test['target']/float(num_folds)
test[['enrollee_id','target']].to_csv('sub_sam_et.csv',index=False)

In [23]:
#Ensemble
sub1 = pd.read_csv('sub_sam_lgb.csv') #0.6892
#sub2 = pd.read_csv('sub_sam_xgb.csv') #0.6871
sub3 = pd.read_csv('sub_sam_rf.csv') #0.6871
sub4 = pd.read_csv('sub_sam_et.csv') #0.6878
#sub5 = pd.read_csv('sub_sam_cb.csv') #0.6832
pd.concat([sub1.target,sub3.target,sub4.target],axis=1).corr()

Unnamed: 0,target,target.1,target.2
target,1.0,0.905757,0.974945
target,0.905757,1.0,0.902067
target,0.974945,0.902067,1.0


In [24]:
test['target'] = (0.5*sub1.target + 0.35*sub3.target + 0.15*sub4.target)
test[['enrollee_id','target']].to_csv('sub_sam_lgb_rf_et_ensemble.csv',index=False)