In [1]:
# import the libraries
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from keras.optimizers import SGD
from keras.layers import Dropout

Using Theano backend.


Couldn't import dot_parser, loading of dot files will not be possible.


In [2]:
# 2 hidden layer architecture with Best parameters 
# neurons1 = number of neurons in layer 1 
# neurons2 = number of neurons in layer 2 
def deeper_model(neurons1=25,neurons2=10,dropout=0.0):
	model = Sequential()
	model.add(Dense(neurons1,input_dim=21,kernel_initializer='normal',activation='relu'))
	model.add(Dense(neurons2,kernel_initializer='normal',activation='relu'))
	model.add(Dropout(dropout))
	model.add(Dense(1,kernel_initializer='normal'))

	model.compile(loss='mean_squared_error',optimizer='adam')
	return model



# 1 hidden layer architecture with Best parameters 
# neurons1 = number of neurons in layer 1
def baseline_model(neurons=25):
	model = Sequential()
	model.add(Dense(neurons,input_dim=21,kernel_initializer='normal',activation='relu'))
	model.add(Dense(1,kernel_initializer='normal'))

	model.compile(loss='mean_squared_error',optimizer='adam')
	return model


In [9]:
# Perform Grid Search on the parameters
def do_grid_search(model):
	# Experimenting with the batch_size, epochs, dropout,
	# neurons in layer 1 and neurons in layer 2
	
	batch_size = [32] # best batch size
	epochs = [150] # best number of epochs
	dropout = [0.0]
	neurons1 = [30]
	# neurons1 = [15] # best number of neurons in layer1
	neurons2 = [12] # best is 10 with dropout 0.0
	param_grid_bm = dict(epochs=epochs,batch_size=batch_size,neurons=neurons1)
	# param_grid_dm = dict(epochs=epochs,batch_size=batch_size,neurons1=neurons1,neurons2=neurons2,dropout=dropout)
	grid = GridSearchCV(estimator=model,param_grid=param_grid_bm,verbose=10)
	# grid = GridSearchCV(estimator=model,param_grid=param_grid_dm,verbose=10)
	grid_result = grid.fit(X_train,y_train)

	return (grid,grid_result)



def build_model():
	model = KerasRegressor(build_fn=baseline_model,verbose=0)
	# model = KerasRegressor(build_fn=deeper_model,verbose=0)
	return model

In [4]:
# Splittng the data into training and test
def split_into_training_and_test(X,y):
	X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.50,random_state=42)
	return (X_train,X_test,y_train,y_test)



# Encoding the degree variable
def encode_degree(row):
    if row['degree'] == 'BACHELORS':
        return 0
    elif row['degree'] == 'DOCTORAL':
        return 1
    else:
        return 2

# Encoding the major variable
def encode_major(row):
    if row['major'] == 'NONE':
        return 0
    elif row['major'] == 'BUSINESS':
        return 1
    elif row['major'] == 'ENGINEERING':
        return 2
    else:
        return 3
    
# Encoding the industry variable
def encode_industry(row):
    if row['industry'] == 'HEALTH':
        return 0
    elif row['industry'] == 'WEB':
        return 1
    elif row['industry'] == 'EDUCATION':
        return 2
    elif row['industry'] == 'OIL':
        return 3
    elif row['industry'] == 'FINANCE':
        return 4
    else:
        return 5
    
    
# Encoding the jobType variable
def encode_jobType(row):
    if row['jobType'] == 'JANITOR':
        return 0
    elif row['jobType'] == 'CEO':
        return 1
    elif row['jobType'] == 'CFO':
        return 2
    elif row['jobType'] == 'CTO':
        return 3
    elif row['jobType'] == 'JUNIOR':
        return 4
    else:
        return 5

In [5]:
# Perform pre-procesing of the data.
def preprocessing(df_feat,df_target):
	# Join the features and the target data frames.
	df = df_feat.join(df_target,lsuffix='_feat',rsuffix='_target')
	# Drop redundant column
	df.drop(['jobId_target'],axis=1,inplace=True)
	# Rename the column
	df.columns = [u'jobId', u'companyId', u'jobType', u'degree', u'major',
	       u'industry', u'yearsExperience', u'milesFromMetropolis', u'salary']

	df['degreeEncoded'] = df.apply(encode_degree,axis=1)
	df['majorEncoded'] = df.apply(encode_major,axis=1)
	df['industryEncoded'] = df.apply(encode_industry,axis=1)
	df['jobTypeEncoded'] = df.apply(encode_jobType,axis=1)

	df['degreeEncoded'] = df['degreeEncoded'].astype('category')
	df['majorEncoded'] = df['majorEncoded'].astype('category')
	df['industryEncoded'] = df['industryEncoded'].astype('category')
	df['jobTypeEncoded'] = df['jobTypeEncoded'].astype('category')

	predictor_columns = ['yearsExperience','milesFromMetropolis','degreeEncoded','majorEncoded','industryEncoded',\
                     'jobTypeEncoded']
	target_column = ['salary']
	X_train = df[predictor_columns]
	y_train = df[target_column] 

	X_train = pd.get_dummies(X_train)
	X_train = X_train.values
	y_train = y_train.as_matrix()

	return(X_train,y_train)


# Read the data from the files.
def read_data():
	# read the data into pandas data frames
	df_feat = pd.read_csv('train_features_2013-03-07.csv',sep=',')
	df_target = pd.read_csv('train_salaries_2013-03-07.csv',sep=',')

	return (df_feat,df_target)


In [6]:
seed = 7
np.random.seed(seed)

df_feat,df_target = read_data()
X_train,y_train = preprocessing(df_feat,df_target)
X_train,X_test,y_train,y_test = split_into_training_and_test(X_train,y_train)

In [7]:
model = build_model()
grid,grid_result = do_grid_search(model)

# Predicting and printing the results
y_pred = grid.best_estimator_.predict(X_test)
print "mean_squared_error: ", mean_squared_error(y_test,y_pred)
print "R2 score: ", r2_score(y_test,y_pred)

# Printing the results of the grid search
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] epochs=30, neurons=5, batch_size=32 .............................
[CV]  epochs=30, neurons=5, batch_size=32, score=408.345788, total= 1.4min
[CV] epochs=30, neurons=5, batch_size=32 .............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s


[CV]  epochs=30, neurons=5, batch_size=32, score=410.354842, total= 1.3min
[CV] epochs=30, neurons=5, batch_size=32 .............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.8min remaining:    0.0s


[CV]  epochs=30, neurons=5, batch_size=32, score=412.240803, total= 1.3min
[CV] epochs=30, neurons=10, batch_size=32 ............................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  4.1min remaining:    0.0s


[CV]  epochs=30, neurons=10, batch_size=32, score=407.481593, total= 1.6min
[CV] epochs=30, neurons=10, batch_size=32 ............................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  5.8min remaining:    0.0s


[CV]  epochs=30, neurons=10, batch_size=32, score=406.458152, total= 1.7min
[CV] epochs=30, neurons=10, batch_size=32 ............................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  7.5min remaining:    0.0s


[CV]  epochs=30, neurons=10, batch_size=32, score=411.911304, total= 1.4min
[CV] epochs=30, neurons=15, batch_size=32 ............................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  8.9min remaining:    0.0s


[CV]  epochs=30, neurons=15, batch_size=32, score=395.643517, total= 1.6min
[CV] epochs=30, neurons=15, batch_size=32 ............................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 10.5min remaining:    0.0s


[CV]  epochs=30, neurons=15, batch_size=32, score=397.988823, total= 1.7min
[CV] epochs=30, neurons=15, batch_size=32 ............................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 12.2min remaining:    0.0s


[CV]  epochs=30, neurons=15, batch_size=32, score=407.783456, total= 1.7min
[CV] epochs=30, neurons=20, batch_size=32 ............................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 13.9min remaining:    0.0s


[CV]  epochs=30, neurons=20, batch_size=32, score=406.458368, total= 1.4min
[CV] epochs=30, neurons=20, batch_size=32 ............................
[CV]  epochs=30, neurons=20, batch_size=32, score=401.453792, total= 1.4min
[CV] epochs=30, neurons=20, batch_size=32 ............................
[CV]  epochs=30, neurons=20, batch_size=32, score=403.876710, total= 1.4min
[CV] epochs=30, neurons=25, batch_size=32 ............................
[CV]  epochs=30, neurons=25, batch_size=32, score=397.514073, total= 1.5min
[CV] epochs=30, neurons=25, batch_size=32 ............................
[CV]  epochs=30, neurons=25, batch_size=32, score=403.190828, total= 1.4min
[CV] epochs=30, neurons=25, batch_size=32 ............................
[CV]  epochs=30, neurons=25, batch_size=32, score=398.808921, total= 1.5min
[CV] epochs=30, neurons=30, batch_size=32 ............................
[CV]  epochs=30, neurons=30, batch_size=32, score=394.779080, total= 1.5min
[CV] epochs=30, neurons=30, batch_size=32 

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 63.0min finished


mean_squared_error:  409.847080818
R2 score:  0.726676557836
Best: 414.702965 using {'epochs': 30, 'neurons': 5, 'batch_size': 128}
410.313807 (1.590397) with: {'epochs': 30, 'neurons': 5, 'batch_size': 32}
408.617009 (2.366586) with: {'epochs': 30, 'neurons': 10, 'batch_size': 32}
400.471917 (5.257935) with: {'epochs': 30, 'neurons': 15, 'batch_size': 32}
403.929624 (2.043454) with: {'epochs': 30, 'neurons': 20, 'batch_size': 32}
399.837943 (2.429068) with: {'epochs': 30, 'neurons': 25, 'batch_size': 32}
396.410768 (1.334987) with: {'epochs': 30, 'neurons': 30, 'batch_size': 32}
414.103488 (4.905018) with: {'epochs': 30, 'neurons': 5, 'batch_size': 64}
408.818709 (2.548248) with: {'epochs': 30, 'neurons': 10, 'batch_size': 64}
404.504309 (1.113486) with: {'epochs': 30, 'neurons': 15, 'batch_size': 64}
406.862002 (5.372039) with: {'epochs': 30, 'neurons': 20, 'batch_size': 64}
403.452717 (2.532060) with: {'epochs': 30, 'neurons': 25, 'batch_size': 64}
401.437284 (1.425340) with: {'epoc

In [10]:
model = build_model()
grid,grid_result = do_grid_search(model)

# Predicting and printing the results
y_pred = grid.best_estimator_.predict(X_test)
print "mean_squared_error: ", mean_squared_error(y_test,y_pred)
print "R2 score: ", r2_score(y_test,y_pred)

# Printing the results of the grid search
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] epochs=150, neurons=30, batch_size=32 ...........................
[CV]  epochs=150, neurons=30, batch_size=32, score=394.032588, total= 7.8min
[CV] epochs=150, neurons=30, batch_size=32 ...........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  7.8min remaining:    0.0s


[CV]  epochs=150, neurons=30, batch_size=32, score=395.736921, total= 8.0min
[CV] epochs=150, neurons=30, batch_size=32 ...........................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 15.9min remaining:    0.0s


[CV]  epochs=150, neurons=30, batch_size=32, score=396.787088, total= 8.0min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 23.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 23.9min finished


mean_squared_error:  395.315518792
R2 score:  0.73636752976
Best: 395.518863 using {'epochs': 150, 'neurons': 30, 'batch_size': 32}
395.518863 (1.135041) with: {'epochs': 150, 'neurons': 30, 'batch_size': 32}
