# Data Science Job Change Predictions

This notebook builds on the EDA carried out [here](https://www.kaggle.com/stuartday274/job-change-data-exploration-and-predictions).

This is my first attempt at creating a full pipeline that carries out the data preparation, feature engineering and learning. In order to keep track of the column names, I have created my own transformations rather than using the built-in sklearn ones. I would be interested in feedback on better ways to have done this.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)

In [None]:
# Load aug_train.csv from the Kaggle input directory into a DataFrame.
df = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
# Show the first rows as the cell output.
df.head()

# Prepare Data

# Preparing Train Test Data

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Separate the label from the features. The label column is dropped from X so
# it can never reach the model as an input, even if the column selection done
# later in the preprocessing pipeline is changed.
y = df['target']
X = df.drop(columns=['target'])

In [None]:
# Hold out 33% of the rows for evaluation; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.33, random_state=42)

# Using a Pipeline and a grid search to get the most out of an XGBoost classifier.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn import metrics
from catboost import CatBoostRegressor

In [None]:
# Based on the EDA, having 10 or more years of experience is a good indicator.
def experience_processor(X):
    """Replace the ``experience`` column with a binary "10 or more years" flag.

    The raw column holds strings: plain year counts plus the open-ended
    buckets ``'>20'`` and ``'<1'``. Those two buckets are mapped to 21 and 0,
    everything is cast to integers, and the result is thresholded at 10.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a string-valued ``experience`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` whose ``experience`` column is 0 for fewer than 10
        years and 1 otherwise. The frame passed in is left untouched.

    Notes
    -----
    Missing values are treated like the ``'>20'`` bucket and so come out as 1.
    This is what the earlier implementation produced implicitly (NaN is truthy
    inside ``np.where``); it is now spelled out via ``na=True``.
    NOTE(review): confirm that this is the intended treatment of missing
    experience.
    """
    # Work on a copy so callers keep their original frame.
    X = X.copy()
    raw = X['experience']

    # Literal (non-regex) substring tests with explicit handling of NaN.
    over_twenty = raw.str.contains('>20', regex=False, na=True)
    under_one = raw.str.contains('<1', regex=False, na=False)

    years = np.where(
        over_twenty,
        21,
        np.where(under_one, 0, raw)
    ).astype('int')

    X['experience'] = np.where(years < 10, 0, 1)
    return X

In [None]:
# Flag rows that are marked as experienced but have no recorded company size.
def experience_in_unknown(X):
    """Add an ``experience_unknown`` indicator column to a copy of ``X``.

    The indicator is 1 where ``experience`` equals 1 and ``company_size``
    equals the ``'missing_value'`` placeholder, and 0 everywhere else.
    The frame passed in is not modified.
    """
    is_experienced = X['experience'] == 1
    size_is_missing = X['company_size'] == 'missing_value'
    indicator = np.where(is_experienced & size_is_missing, 1, 0)
    return X.assign(experience_unknown=indicator)


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class BaseFeatureEngineer(BaseEstimator, TransformerMixin):
    """Base class for transformers that take and return a DataFrame.

    Parameters
    ----------
    columns : list, optional
        Names of the columns a subclass should operate on. Defaults to an
        empty list.
    """

    def __init__(self, columns=None):
        # Create a fresh list per instance: a literal ``[]`` default is one
        # list object shared by every instance built without arguments.
        self.columns = [] if columns is None else columns

    def fit(self, X, y=None):
        """Learn nothing and return ``self``; subclasses override as needed."""
        return self

    def transform(self, X):
        """Return an unmodified copy of ``X``."""
        return X.copy()

In [None]:
# Transformations for the numeric features
class NumericImputer(BaseFeatureEngineer):
    """Fill missing values in the configured columns with the fitted mean."""

    def fit(self, X, y=None):
        """Record the mean of each configured column of ``X`` in ``self.means``."""
        fitted_means = {name: X[name].mean() for name in self.columns}
        self.means = fitted_means
        return self

    def transform(self, X):
        """Return a copy of ``X`` with gaps in the configured columns filled."""
        filled = X.copy()
        for name in self.columns:
            replacement = self.means[name]
            filled[name] = filled[name].fillna(replacement)
        return filled

class NumericScaler(BaseFeatureEngineer):
    """Rescale the configured columns with a ``StandardScaler``."""

    def fit(self, X, y=None):
        """Fit a new ``StandardScaler`` on the configured columns of ``X``."""
        self.scaler = StandardScaler().fit(X[self.columns])
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose configured columns have been rescaled."""
        scaled = X.copy()
        scaled[self.columns] = self.scaler.transform(scaled[self.columns])
        return scaled

# Columns handled as continuous numeric inputs.
numeric_features = ['city_development_index', 'training_hours']

# Mean-impute the numeric columns first, then rescale the same columns.
numeric_preprocessing = Pipeline(steps=[
    ('imputer', NumericImputer(columns=numeric_features)),
    ('scaler', NumericScaler(columns=numeric_features)),
])

In [None]:
# Transformations for categorical data
class CategoricalImputer(BaseFeatureEngineer):
    """Replace missing values in the configured columns with ``'missing_value'``."""

    def transform(self, X):
        """Return a copy of ``X`` with the placeholder written into every gap."""
        filled = X.copy()
        for column in self.columns:
            filled[column] = filled[column].fillna('missing_value')
        return filled

# Categorical columns that are imputed and later one-hot encoded.
categorical_features = [
    'gender', 'relevent_experience', 'enrolled_university',
    'education_level', 'major_discipline',
    'company_size', 'company_type', 'last_new_job',
]

# Pass the flat list of column names, the same way the numeric transformers
# are configured. Wrapping it in another list made ``columns`` a one-element
# list whose only entry was the whole list of names.
categorical_preprocessing = Pipeline([
    ('imputer', CategoricalImputer(categorical_features))
])

In [None]:
class FeatureEngineering(BaseFeatureEngineer):
    """Apply the hand-written experience features to a copy of the input."""

    def transform(self, X):
        """Run ``experience_processor`` then ``experience_in_unknown`` on a copy of ``X``."""
        working_copy = X.copy()
        return experience_in_unknown(experience_processor(working_copy))

# Single-step pipeline wrapping the feature-engineering transformer.
create_new_features = Pipeline(steps=[
    ('feature_engineering', FeatureEngineering()),
])

In [None]:
class SelectColumns(BaseFeatureEngineer):
    """Keep only the configured columns of the incoming frame."""

    def transform(self, X):
        """Return the ``self.columns`` selection taken from a copy of ``X``."""
        snapshot = X.copy()
        selected = snapshot[self.columns]
        return selected

In [None]:
# Full preprocessing chain: numeric and categorical imputation/scaling, the
# engineered experience feature, column selection, then one-hot encoding of the
# categorical columns (all remaining columns are passed through unchanged).
# NOTE(review): newer scikit-learn releases renamed OneHotEncoder's `sparse`
# argument to `sparse_output` - confirm against the installed version.
preprocessor = Pipeline(steps=[
    ('numeric_preprocessing', numeric_preprocessing),
    ('categorical_preprocessing', categorical_preprocessing),
    ('create_new_features', create_new_features),
    (
        'select_colums',
        SelectColumns(
            columns=numeric_features + categorical_features + ['experience_unknown']
        ),
    ),
    (
        'one_hot_encoding',
        ColumnTransformer(
            transformers=[
                (
                    'one_hot_encoding',
                    OneHotEncoder(drop='first', sparse=False, handle_unknown='error'),
                    categorical_features,
                ),
            ],
            remainder='passthrough',
        ),
    ),
])

In [None]:
# End-to-end model: the preprocessing chain followed by an XGBClassifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgb', XGBClassifier(random_state=42)),
])


In [None]:
# Fit the preprocessing steps and the classifier on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out rows, shown as the cell output.
pipeline.predict(X_test)

In [None]:
# Column index 1 of predict_proba for the held-out rows
# (presumably the probability of target == 1 - confirm the class order).
pipeline.predict_proba(X_test)[:,1]

In [None]:
# Score the held-out rows using predicted probabilities rather than hard
# labels, since ROC AUC ranks the rows by score.
test_prediction = pipeline.predict_proba(X_test)[:,1]
score = roc_auc_score(y_test, test_prediction)

# The fitted estimator in `pipeline` is an XGBClassifier, so the message names
# that model rather than a random forest.
print(f'Area under ROC Score of XGBClassifier On Test Set - {score:,.2%}')


In [None]:
# Display the predicted probabilities as the cell output.
test_prediction

# Tuning Parameters with a GridSearch

In [None]:
# Search space for the classifier step: 40 to 90 trees in steps of 10, and
# tree depths 1 to 5. Candidates are compared by ROC AUC with 5-fold CV.
parameters = {
    'xgb__n_estimators': list(range(40, 100, 10)),
    'xgb__max_depth': list(range(1, 6)),
}
grid = GridSearchCV(estimator=pipeline, param_grid=parameters, scoring='roc_auc', cv=5)


In [None]:
# Run the grid search on the training split.
grid.fit(X_train, y_train)

In [None]:
# Report the search's score on the held-out rows (the search was configured
# with scoring='roc_auc') and the parameter combination it selected.
print(f'score = {grid.score(X_test,y_test):0,.2%}')
print(f'Best parameters: {grid.best_params_}')

In [None]:
# ROC AUC of the tuned model on the held-out rows, computed from column
# index 1 of its predicted probabilities.
grid_search_roc_score = roc_auc_score(y_test, grid.predict_proba(X_test)[:,1])

print(f'Area under ROC Score of XGBClassifier On Test Set - {grid_search_roc_score:,.2%}')


# Preparing a test submission

In [None]:
# Load aug_test.csv and store the tuned model's predicted probability
# (column index 1 of predict_proba) in a new 'target' column.
test = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_test.csv')
test['target'] = grid.predict_proba(test)[:,1]

# Show the two columns that go into the submission file.
test[['enrollee_id', 'target']]

In [None]:
# Write the id and predicted probability to submit.csv without the index column.
test[['enrollee_id', 'target']].to_csv('submit.csv',index= False)