In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.linear_model import Ridge

# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20;
# prefer sklearn.model_selection and fall back only on older installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

In [2]:
# Load the CSV at the project-relative path below into a DataFrame.
salary_csv = './data/mergedSalary2.csv'
sal = pd.read_csv(salary_csv)

In [3]:
# Flag rows whose salary (used as y further down) is NaN, then keep only the
# rows where it is present. The helper column remains on both `sal` and `sal2`.
sal['missingSalary'] = sal['salary'].isnull()
sal2 = sal[~sal['missingSalary']]

In [4]:
# Numeric predictor columns, in the order they appear in the design matrix.
# Related column names are grouped one family per line for readability.
X_numeric_features = [
    'sup1', 'sup2', 'sup3', 'sup4', 'sup5',
    'yearsinposition', 'yearsinprofession', 'age',
    'inst1', 'inst2', 'inst3', 'inst4', 'inst5',
    'instbudget', 'instsize',
    'total_population', 'median_household_income',
    'no_male_hs', 'no_female_hs', 'no_hs',
    'at_least_hs_male', 'at_least_hs_female', 'at_least_hs',
    'hs_some_college_male', 'hs_some_college_female', 'hs_some_college',
    'bachelors_male', 'bachelors_female', 'bachelors',
    'graduate_male', 'graduate_female', 'graduate',
    'hispanic', 'white', 'black',
    'native_american', 'asian_api', 'two_race_or_more',
    'male_unemployment', 'female_unemployment',
    'renter', 'owner', 'median_rent',
    'Sex.by.Age..Male.', 'Sex.by.Age..Female.',
    'full_time', 'part_time',
    'foreign_born', 'US_born',
    'married', 'divorced', 'poverty',
]

# Select those columns (all rows) from the salary-complete frame.
X_numeric = sal2.loc[:, X_numeric_features]

In [5]:
# One-hot encode each categorical feature with pandas.get_dummies.
# DOC: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html

X_categorical_features = ['function', 'gender', 'race',
                          'highestdegree', 'category', 'insttype',
                          'city', 'state']
X_categorical = sal2[X_categorical_features]

# Encode the columns in list order; the unpacking targets below are written
# in the same order as X_categorical_features.
(function_dummies, gender_dummies, race_dummies, highestDegree_dummies,
 category_dummies, instType_dummies, city_dummies, state_dummies) = tuple(
    pd.get_dummies(X_categorical[feature_name])
    for feature_name in X_categorical_features
)

# Join the indicator columns side by side. The result is still a DataFrame;
# it only becomes an ndarray when it is concatenated with the numeric block.
X_dummy_features = pd.concat(
    [function_dummies, gender_dummies, race_dummies, highestDegree_dummies,
     category_dummies, instType_dummies, city_dummies, state_dummies],
    axis=1)

In [6]:
# Impute missing values in the numeric features using the imputer's default
# strategy (column mean).
# DOC: http://scikit-learn.org/stable/modules/preprocessing.html
#
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; its
# replacement is sklearn.impute.SimpleImputer. Import whichever is available
# under the original name so the rest of the cell is version-independent.
# NOTE(review): the imputer is fit on every row before the train/test split
# performed in a later cell, so test rows contribute to the fill values.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

imp = Imputer()
# fit_transform learns the per-column fill values and applies them in one step;
# the result is a NumPy array, not a DataFrame.
X_numeric_imputed = imp.fit_transform(X_numeric)

In [7]:
# Assemble the design matrix: dummy-encoded columns first, imputed numeric
# columns after them, joined column-wise.
dummy_matrix = X_dummy_features.values
X = np.concatenate([dummy_matrix, X_numeric_imputed], axis=1)

In [8]:
# Target variable: salary, kept as a single-column 2-D array (n_rows, 1).
y = sal2[['salary']].values

In [9]:
# Create training and test sets: 30% of the rows are held out, and the fixed
# random_state makes the shuffle reproducible.
#
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. Fall back to
# the old module only when the new one is unavailable.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

In [10]:
# Candidate regularization strengths for the grid search, largest first.
# alpha = 0 is included as the "no penalty" end of the range.
# NOTE(review): the recorded run in this notebook selected alpha = 10.0, the
# largest value listed here, so larger candidates may be worth adding —
# confirm before changing the grid.
candidate_alphas = [10, 1, 0.8, 0.6, 0.1, 0.01, 0]
alphas = np.asarray(candidate_alphas, dtype=float)

In [11]:
# Ridge regression with default settings, tuned over the candidate alphas by
# an exhaustive grid search; every other search option is left at its default.
model = Ridge()
search_space = {'alpha': alphas}
grid = GridSearchCV(estimator=model, param_grid=search_space)

In [12]:
# Run the grid search on the training split.
# X_train is already a 2-D (rows, features) array, so no reshape is needed
# before fitting.
# NOTE(review): X_test / y_test are not scored anywhere in the visible cells.

# Plain alias of X_train, kept in case another cell refers to it; despite the
# name, no intercept column is added or removed anywhere in this notebook.
X_train_no_intercept = X_train
grid.fit(X_train, y_train)

# Summarize the search: its configuration, the best score found, and the
# alpha of the best estimator.
print(grid)
print(grid.best_score_)
print(grid.best_estimator_.alpha)


GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([ 10.  ,   1.  ,   0.8 ,   0.6 ,   0.1 ,   0.01,   0.  ])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)
0.737924469413
10.0
