In [9]:
'''This test script builds and runs three predictive models. The model with the lowest MSE score is
    then used to show the most important features.'''

__author__ = 'skyler bullard'

__copyright__ = 'Copyright 2018, Data Science Dream Job LLC'

In [10]:
#import everything we might need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.linear_model import LinearRegression
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

In [11]:
#directory holding the CSV inputs; the trailing slash is required because the
#file names below are appended by plain string concatenation
path = 'C:/Users/sprtn/jupyter_project_files/salarypredictionportfolio/DSDJ-data-SalaryPredictions/data/'

#pass the file path straight to pandas instead of an open() handle: pandas then
#opens and closes each file itself, so no file handle is left open in the kernel
test_features = pd.read_csv( path + 'test_features.csv' )
train_features = pd.read_csv( path + 'train_features.csv' )
train_salaries = pd.read_csv( path + 'train_salaries.csv' )

In [12]:
def consolidate_data ( df1 , df2 , key = None , left_index = False , right_index = False ):
    '''Inner-join two dataframes, keeping only records present in both.

    df1, df2     -- left and right inputs to the merge
    key          -- column name(s) to join on, passed to pd.merge as `on`
    left_index   -- use the index of df1 as its join key
    right_index  -- use the index of df2 as its join key

    Returns the merged dataframe produced by pd.merge.
    '''
    join_options = {
        'how' : 'inner' ,
        'on' : key ,
        'left_index' : left_index ,
        'right_index' : right_index ,
    }
    return pd.merge( df1 , df2 , **join_options )

def clean_data( raw_df ):
    '''Remove rows with salary <= 0 and rows that repeat an earlier job ID.

    raw_df -- dataframe with a 'jobId' column and a numeric 'salary' column

    Returns a new dataframe; raw_df itself is not modified and the surviving
    rows keep their original index labels.
    '''
    #filter on salary first: de-duplicating first would keep only the first row
    #per jobId, and if that row had an invalid salary the job would vanish
    #entirely even though a later duplicate carried a valid salary
    valid_salary_df = raw_df[ raw_df.salary > 0 ]
    #of the remaining rows, keep the first occurrence of each jobId
    return valid_salary_df.drop_duplicates( subset = 'jobId' )

def one_hot_encode_feature_df( df , cat_vars = None, num_vars = None ):
    '''One-hot encode the categorical columns and combine them with the numeric ones.

    df       -- source dataframe
    cat_vars -- column name(s) passed through pd.get_dummies; None means no
                categorical columns
    num_vars -- column name(s) converted with pd.to_numeric; None means no
                numeric columns

    Returns a dataframe with the encoded categorical columns followed by the
    numeric columns. If both arguments are None, the result has the index of
    df and no columns.
    '''
    #the None defaults used to be passed straight into df[ ... ], which raises;
    #each group is now built only when its column list was actually supplied
    parts = []
    if cat_vars is not None:
        parts.append( pd.get_dummies( df[ cat_vars ] ) )
    if num_vars is not None:
        parts.append( df[ num_vars ].apply( pd.to_numeric ) )

    if not parts:
        return pd.DataFrame( index = df.index )
    return pd.concat( parts , axis = 1 )

def get_target_df( df , target ):
    '''Return the part of df selected by `target` (the result of df[ target ]).'''
    target_values = df[ target ]
    return target_values

In [13]:
#column names: categorical features, numerical features, and the target
categorical_vars = [
    'companyId' ,
    'jobType' ,
    'degree' ,
    'major' ,
    'industry' ,
]
numerical_vars = [
    'yearsExperience' ,
    'milesFromMetropolis' ,
]
target_var = 'salary'

In [14]:
#merge the training features with the training salaries on their shared jobId column
raw_training_df = consolidate_data(
    df1 = train_features ,
    df2 = train_salaries ,
    key = 'jobId' ,
)

In [15]:
#clean, shuffle, re-index to improve cross-validation accuracy
#a fixed random_state makes the shuffled row order identical on every run, so
#results computed from this frame can be reproduced after a kernel restart
#NOTE(review): reset_index() without drop = True keeps the pre-shuffle index as
#an extra 'index' column -- confirm nothing downstream treats it as a feature
clean_training_df = shuffle( clean_data( raw_training_df ) , random_state = 42 ).reset_index()