In [None]:
from __future__ import print_function

import datetime
import re
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Preprocessing data and helpers
# Leading run of digits and thousands separators, e.g. '10,000'.
# Raw string so that '\d' is not treated as a (invalid) string escape.
number = re.compile(r'[\d,]+')
def get_first_number(val):
    """Parse the integer that starts the text form of ``val``.

    ``val`` is converted with ``str`` and matched from its first character,
    so only text that *begins* with digits/commas yields a number; commas are
    stripped before conversion ('10,000 or more' -> 10000).

    Returns ``np.nan`` when the text does not start with a digit/comma run,
    or when that run contains no digits at all (e.g. a lone ',').
    """
    matched = number.match(str(val))
    if not matched:
        return np.nan

    digits = matched.group().replace(',', '')
    # A run made only of commas leaves nothing for int() to convert.
    if not digits:
        return np.nan
    return int(digits)

def dict_map(dict_to_use):
    """Build a one-argument lookup function backed by ``dict_to_use``.

    The returned callable gives ``dict_to_use[val]`` for values that are keys
    of the mapping and ``np.nan`` for everything else.
    """
    def lookup(val):
        return dict_to_use[val] if val in dict_to_use else np.nan

    return lookup

def split_list(list_str):
    """Split a '; '-delimited answer string into its parts.

    Goes through the public ``np.char`` namespace rather than the private
    ``np.core.defchararray`` module path. The result is whatever
    ``np.char.split`` returns for the input (a NumPy object array holding the
    list of parts).
    """
    return np.char.split(list_str, sep='; ')

def userlist_to_cols(col):
    """Expand a column of '; '-separated answers into indicator columns.

    Parameters
    ----------
    col : pandas.Series
        One entry per respondent; each entry is a '; '-joined string of
        choices, or missing.

    Returns
    -------
    pandas.DataFrame
        One int8 column per distinct choice, named '<col.name>_<choice>' and
        ordered by sorted choice name, holding 1 where the respondent's list
        contains that choice and 0 otherwise. When the column has no choices
        at all, an empty frame on ``col``'s index is returned.
    """
    # Missing answers become the literal text 'nan' so that split_list always
    # receives a string; converting each value with str() sidesteps pandas'
    # nullable string dtype, which would keep them as missing values.
    as_text = col.astype(object).where(col.notnull(), 'nan').map(str)
    list_elems = as_text.apply(split_list)

    categories = set(chain.from_iterable(list_elems.values))
    categories.discard('nan')

    # pd.concat refuses an empty list, so hand back a column-less frame.
    if not categories:
        return pd.DataFrame(index=col.index)

    def category_to_col(category):
        return list_elems.map(lambda user_resp: category in user_resp)\
                         .rename('%s_%s' % (col.name, category))\
                         .astype('int8')

    # Sorted so the column order does not depend on set iteration order.
    return pd.concat([category_to_col(category) for category in sorted(categories)],
                     axis=1)

def timestr_to_number(timestr):
    """Convert a clock-time label to its hour on a 24-hour clock.

    'Noon' gives 12 and 'Midnight' gives 0; any other value is parsed as a
    12-hour time such as '9:00 AM' and the hour of the parsed time is returned.
    """
    if timestr == 'Noon':
        return 12
    if timestr == 'Midnight':
        return 0

    parsed = datetime.datetime.strptime(timestr, '%I:%M %p')
    return parsed.hour

# Columns that are expanded into indicator columns with userlist_to_cols.
listvals = """
    DeveloperType
    NonDeveloperType
    ImportantBenefits
    JobProfile
    EducationTypes
    SelfTaughtTypes
    CousinEducation
    HaveWorkedLanguage
    WantWorkLanguage
    HaveWorkedFramework
    WantWorkFramework
    HaveWorkedDatabase
    WantWorkDatabase
    HaveWorkedPlatform
    WantWorkPlatform
    IDE
    Methodology
    MetricAssess
    StackOverflowDevices
    Race
""".split()

# Columns reduced to their leading number with get_first_number.
number_parses = 'CompanySize YearsProgram YearsCodedJob YearsCodedJobPast'.split()

# Columns encoded with the agree/disagree scale (agree_strs).
agree_keys = """
    ExCoderReturn ExCoderNotForMe ExCoderBalance ExCoder10Years
    ExCoderBelonged ExCoderSkills ExCoderWillNotCode ExCoderActive

    ProblemSolving BuildingThings LearningNewTech BoringDetails JobSecurity
    DiversityImportant AnnoyingUI FriendsDevelopers RightWrongWay
    UnderstandComputers SeriousWork InvestTimeTools WorkPayCare
    KinshipDevelopers ChallengeMyself CompetePeers ChangeWorld ShipIt
    OtherPeoplesCode ProjectManagement EnjoyDebugging InTheZone
    CollaborateRemote

    StackOverflowAdsRelevant StackOverflowAdsDistracting
    StackOverflowModeration StackOverflowCommunity StackOverflowHelpful
    StackOverflowBetter StackOverflowWhatDo StackOverflowMakeMoney

    SurveyLong QuestionsInteresting QuestionsConfusing InterestedAnswers
""".split()

# Columns encoded with the importance scale (important_strs).
important_keys = """
    AssessJobRole AssessJobExp AssessJobDept AssessJobTech AssessJobProjects
    AssessJobCompensation AssessJobOffice AssessJobCommute AssessJobRemote
    AssessJobLeaders AssessJobProfDevel AssessJobDiversity AssessJobProduct
    AssessJobFinances

    ImportantHiringAlgorithms ImportantHiringTechExp
    ImportantHiringCommunication ImportantHiringOpenSource
    ImportantHiringPMExp ImportantHiringCompanies ImportantHiringTitles
    ImportantHiringEducation ImportantHiringRep
    ImportantHiringGettingThingsDone

    EducationImportant
""".split()

# Columns encoded with the satisfaction scale (satisfied_strs).
satisfied_keys = [
    'EquipmentSatisfiedMonitors',
    'EquipmentSatisfiedCPU',
    'EquipmentSatisfiedRAM',
    'EquipmentSatisfiedStorage',
    'EquipmentSatisfiedRW',
]

# Columns encoded with the influence scale (influence_strs).
# 'InfluenceInternet' belongs here: under the satisfaction scale none of its
# answers would match a label and the whole column would become NaN.
# NOTE(review): assumes InfluenceInternet uses the same answer labels as the
# other Influence* columns -- confirm against the CSV.
influence_keys = [
    'InfluenceInternet',
    'InfluenceWorkstation',
    'InfluenceHardware',
]

# Columns encoded with the Yes/No mapping (yes_no_strs).
yes_no_keys = ['ClickyKeys']

# Columns encoded with the activity-frequency scale (last_three_months_strs).
last_three_months_keys = (
    'StackOverflowCopiedCode StackOverflowJobListing StackOverflowCompanyPage '
    'StackOverflowJobSearch StackOverflowNewQuestion StackOverflowAnswer '
    'StackOverflowMetaChat'
).split()

# Activity frequency, encoded so that a larger number means more often.
# 'Several times' sits below the weekly and daily answers: doing something
# every week or every day is more frequent than a handful of times.
last_three_months_strs = {
    'At least once each day': 5,
    'At least once each week': 4,
    'Several times': 3,
    'Once or twice': 2,
    "Haven't done at all": 1,
}

# Binary answers as 1/0.
yes_no_strs = dict(Yes=1, No=0)

# Influence answers scored from 5 (most influence) down to 1 (none).
influence_strs = dict(zip(
    (
        'I am the final decision maker',
        'A lot of influence',
        'Some influence',
        'Not much influence',
        'No influence at all',
    ),
    range(5, 0, -1),
))

# Agreement answers scored from 5 (strongest agreement) down to 0.
# Every other label in this mapping is written in sentence case, so the
# sentence-case 'Strongly agree' is accepted alongside the original
# 'Strongly Agree'; a label that matches neither would be encoded as NaN.
# NOTE(review): confirm which spelling the survey CSV actually uses.
agree_strs = {
    'Strongly Agree': 5,
    'Strongly agree': 5,
    'Agree': 4,
    'Somewhat agree': 3,
    'Somewhat disagree': 2,
    'Disagree': 1,
    'Strongly disagree': 0,
}

# Satisfaction answers scored from 5 (most satisfied) down to 1.
satisfied_strs = dict(zip(
    (
        'Very satisfied',
        'Satisfied',
        'Somewhat satisfied',
        'Not very satisfied',
        'Not at all satisfied',
    ),
    range(5, 0, -1),
))

# Importance answers scored from 5 (most important) down to 1.
# 'Important' outranks 'Somewhat important' so the codes rise monotonically
# with the stated importance.
important_strs = {
    'Very important': 5,
    'Important': 4,
    'Somewhat important': 3,
    'Not very important': 2,
    'Not at all important': 1,
}

# Pay perception scored from 5 (greatly overpaid) down to 1 (greatly underpaid).
overpaid_strs = dict(zip(
    (
        'Greatly overpaid',
        'Somewhat overpaid',
        'Neither underpaid nor overpaid',
        'Somewhat underpaid',
        'Greatly underpaid',
    ),
    range(5, 0, -1),
))

# Check-in frequency scored from 5 ('Never') down to 0 ('Multiple times a day').
checkin_strs = dict(zip(
    (
        'Never',
        'Just a few times over the year',
        'A few times a month',
        'A few times a week',
        'Once a day',
        'Multiple times a day',
    ),
    range(5, -1, -1),
))

# Raw columns removed from the frame once derived columns have been added:
# every list-valued column plus the two named below.
to_drop = list(listvals)
to_drop += ['Gender', 'JobSeekingStatus']


# (columns, label -> number mapping) pairs; each group of columns is encoded
# with the mapping that shares its position.
replacers = list(zip(
    [
        influence_keys,
        agree_keys,
        satisfied_keys,
        ['CheckInCode'],
        ['Overpaid'],
        last_three_months_keys,
        important_keys,
        yes_no_keys,
    ],
    [
        influence_strs,
        agree_strs,
        satisfied_strs,
        checkin_strs,
        overpaid_strs,
        last_three_months_strs,
        important_strs,
        yes_no_strs,
    ],
))

In [None]:
# 2017 preproc
# Keep only the rows that have a JobSatisfaction value. The explicit copy makes
# the column assignments below act on an independent frame instead of a
# filtered view of the frame that was read.
data = pd.read_csv('data/2017.csv')
data = data[data['JobSatisfaction'].notnull()].copy()

# Exact-match gender flags; any other Gender value leaves both at 0.
data['gender_M'] = (data['Gender'] == 'Male').astype('int8')
data['gender_F'] = (data['Gender'] == 'Female').astype('int8')

# Encode the scale questions as numbers; labels that are not in a mapping
# (including missing answers) become NaN. Column-wise Series.map is used in
# place of the deprecated DataFrame.applymap.
for keys, strs in replacers:
    mapper = dict_map(strs)
    data[keys] = data[keys].apply(lambda column: column.map(mapper)).astype('float')

# Expand every list-valued column into indicator columns and attach them in a
# single concat rather than re-copying the whole frame once per column.
data = pd.concat(
    [data] + [userlist_to_cols(data[index]) for index in listvals],
    axis=1,
)

data[number_parses] = data[number_parses]\
    .apply(lambda column: column.map(get_first_number))\
    .astype('float')

# Drop the raw columns that have been replaced by derived ones, then one-hot
# encode whatever non-numeric columns remain.
data = data.drop(to_drop, axis=1)
data = pd.get_dummies(data)

print('Columns with NaNs:')
for key in data:
    if data[key].isnull().any():
        print(key)
data = data.fillna(data.mean())
print('Filling with mean of column')

In [None]:
# Cross-validated search over XGBoost hyper-parameters. Each list holds the
# value(s) currently searched; the trailing comments record values tried before.
# 'learning_rate' appeared twice in the original literal, where the later
# entry (0.01) silently replaced the earlier one (0.1); only the effective
# value is kept.
# NOTE(review): `iid` was removed from GridSearchCV in newer scikit-learn
# releases -- drop that argument if this raises a TypeError.
searcher = GridSearchCV(xgb.XGBRegressor(), {
    'max_depth': [4], # tested 2,3,4,5,7,9
    'min_child_weight': [7], # tested 1,3,5,6,7,8
    'gamma': [0], # tested 0,0.1,0.2,1,2
    'learning_rate': [0.01],
    'n_estimators': [1000],
    'subsample': [0.8], # tested 0.6,0.7,0.8,0.9
    'colsample_bytree': [0.6], # tested 0.5,0.6,0.7,0.8,0.9
    'scale_pos_weight': [1],
    'reg_alpha': [1], # tested 1e-5, 1e-2, 0.1, 1, 10, 100
    'reg_lambda': [100], # tested 1e-5, 1e-2, 0.1, 1, 10, 100
}, cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=4, iid=False)

searcher.fit(data.drop(['JobSatisfaction'], axis=1), data['JobSatisfaction'])
print(searcher.best_params_)
print(searcher.best_score_)

In [None]:
# Recursive feature elimination down to 100 features, removing 5 per round.
# X and y are built here so this cell runs on a fresh kernel without relying
# on a later cell having been executed first.
X = data.drop(['JobSatisfaction'], axis=1)
y = data['JobSatisfaction']

model = xgb.XGBRegressor()
# n_features_to_select is passed by keyword; newer scikit-learn releases no
# longer accept it positionally.
rfe = RFE(model, n_features_to_select=100, step=5, verbose=2)
rfe = rfe.fit(X, y)
# rfe.transform(X_train)
# rfe.transform(X_test)

# support_ has one entry per column of X, so it must index X.columns;
# data.columns still contains the target and is one entry longer.
# NOTE(review): this lists the features RFE kept. The original masked array
# hid the kept ones instead; use X.columns[~rfe.support_] if the eliminated
# features are what should be listed.
for feat in X.columns[rfe.support_]:
    print(feat)

In [None]:
# Features / target, then a 70/30 split stratified on the target values.
# random_state pins the shuffle so the MSE figures reported in the modelling
# cells can be reproduced from run to run.
X = data.drop(['JobSatisfaction'], axis=1)
y = data['JobSatisfaction']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

In [None]:
# MSE 2.5054
# Baseline: XGBoost regressor with library-default hyper-parameters, scored
# on the held-out split.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)
preds = xgb_model.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order.
mean_squared_error(y_test, preds)

In [None]:
# MSE 2.3870
# Tuned model: hyper-parameters match the values in the grid-search cell,
# except n_estimators, which is 1500 here.
xgb_model = xgb.XGBRegressor(
    max_depth=4,
    min_child_weight=7,
    gamma=0,
    n_estimators=1500,
    subsample=0.8,
    colsample_bytree=0.6,
    scale_pos_weight=1,
    reg_alpha=1,
    reg_lambda=100,
    learning_rate=0.01
)

xgb_model.fit(X_train, y_train)
preds = xgb_model.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order.
print(mean_squared_error(y_test, preds))