# Salary Predictions Based on Job Descriptions

## Part 1 - DEFINE

### ---- 1 Define the problem ----

Write the problem in your own words here

In [None]:
#import your libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
% matplotlib inline


## Part 2 - DISCOVER

### ---- 2 Load the data ----

In [None]:
# Load the three CSV inputs: training features, training salaries (the target)
# and test features. Paths are relative to the current working directory.
# Fix: pandas exposes the reader as `pd.read_csv`; `pd.read.csv` raises AttributeError.
train_feature_df = pd.read_csv('data/train_features.csv')
train_target_df = pd.read_csv('data/train_salaries.csv')
test_feature_df = pd.read_csv('data/test_features.csv')

### ---- 3 Examine the Data (EDA) ----

# Examine the first 10 rows of each data frame.
# NOTE(review): if these statements run in a single notebook cell, only the
# last bare expression of the cell is rendered — confirm each frame is
# actually inspected (e.g. one statement per cell).
train_feature_df.head(10)
train_target_df.head(10)
test_feature_df.head(10)


# Check the length and types of the variables (info() prints its report directly).
train_feature_df.info()
train_target_df.info()
test_feature_df.info()

# Count fully duplicated rows in each data frame.
train_feature_df.duplicated().sum()
train_target_df.duplicated().sum()
test_feature_df.duplicated().sum()

### ---- 4 Explore the data (EDA) ----

In [4]:
#summarize each feature variable
#summarize the target variable
#look for correlation between each feature and the target
#look for correlation between features

#Identify numerical and categorical variables
train_feature_df.columns
categorical_cols = ['jobId', 'companyId', 'jobType', 'degree', 'major', 'industry']
# Fix: the column is spelled 'milesFromMetropolis' (capital F) everywhere else
# in this notebook; the lower-case variant would not match the data frame.
numeric_cols = ['yearsExperience', 'milesFromMetropolis']

#Summarize numerical and categorical variables separately
train_feature_df.describe(include=[np.number])
# Fix: 'O' (capital letter O) is the dtype code for object columns; the digit
# '0' is not a valid dtype and makes describe() fail.
train_feature_df.describe(include=['O'])

# Combine the feature table with the target (salary) table into one training
# frame, matching rows on their shared 'jobId' key.
train_df = train_feature_df.merge(train_target_df, on='jobId')
train_df.info()
train_df.head()

#Visualize target variable (salary)
plt.figure(figsize=(14, 6))

# Left panel: box plot of salary. The series is passed as `x=` explicitly
# because the meaning of the first positional argument of sns.boxplot differs
# between seaborn releases; the keyword form is accepted by all of them.
plt.subplot(1, 2, 1)
sns.boxplot(x=train_df.salary)

# Right panel: distribution of salary using 20 bins.
# NOTE(review): distplot is deprecated in newer seaborn releases — consider
# migrating to histplot/displot once the minimum seaborn version is known.
plt.subplot(1, 2, 2)
sns.distplot(train_df.salary, bins=20)
plt.show()

# Flag suspected salary outliers with the 1.5 * IQR rule: anything further than
# 1.5 inter-quartile ranges outside the quartiles is considered suspicious.
stat = train_df.salary.describe()
print(stat)
q1, q3 = stat['25%'], stat['75%']
IQR = q3 - q1
upper = q3 + 1.5 * IQR
lower = q1 - 1.5 * IQR
print('The upper and lower bounds for suspected outliers are {} and {}.'.format(upper, lower))

#Examine Potential Outliers
# The thresholds reuse the `lower` / `upper` bounds computed from the IQR above
# instead of repeating them as numeric literals, so they always agree with the
# bounds that were printed for the current data.

#check potential outlier below lower bound
train_df[train_df.salary < lower]


#check potential outlier above upper bound
train_df.loc[train_df.salary > upper, 'jobType'].value_counts()

# Check most suspicious potential outliers above upper bound
train_df[(train_df.salary > upper) & (train_df.jobType == 'JUNIOR')]

# Keep only the rows strictly above the lower bound. Per the original note this
# is meant to drop the zero-salary records; rows above the upper bound are kept.
train_df = train_df[train_df.salary > lower]

def plot_feature(df, col):
    '''
    Plot one feature in two side-by-side panels.

    left, the distribution of samples on the feature
    right, the dependence of salary on the feature

    Parameters
    ----------
    df : DataFrame containing `col` and a 'salary' column.
    col : name of the feature column to plot.

    Side effect: when `col` is not int64 it is converted, in `df` itself, to
    category dtype with its categories ordered by ascending mean salary.
    '''
    plt.figure(figsize=(14, 6))
    plt.subplot(1, 2, 1)
    if df[col].dtype == 'int64':
        df[col].value_counts().sort_index().plot()
    else:
        #change the categorical variable to category type and order their level by the mean salary
        #in each category
        mean = df.groupby(col)['salary'].mean()
        levels = mean.sort_values().index.tolist()
        # Assign the reordered result back: the `inplace=True` form of
        # reorder_categories is no longer available in current pandas.
        df[col] = df[col].astype('category').cat.reorder_categories(levels)
        df[col].value_counts().plot()
    plt.xticks(rotation=45)
    plt.xlabel(col)
    plt.ylabel('Counts')
    plt.subplot(1, 2, 2)

    if df[col].dtype == 'int64' or col == 'companyId':
        #plot the mean salary for each category and fill between the (mean - std, mean + std)
        mean = df.groupby(col)['salary'].mean()
        std = df.groupby(col)['salary'].std()
        mean.plot()
        plt.fill_between(range(len(std.index)), mean.values - std.values,
                         mean.values + std.values, alpha=0.1)
    else:
        sns.boxplot(x=col, y='salary', data=df)

    plt.xticks(rotation=45)
    plt.ylabel('Salaries')
    plt.show()


# The calls below run at top level. They were previously indented into the
# function body, which made plot_feature call itself without end and never be
# invoked from the notebook.

#Relation between companies and salary
plot_feature(train_df, 'companyId')

#Relation between job type and salary
plot_feature(train_df, 'jobType')

#Relation between degree and salary
#(also converts 'degree' to category dtype like the other categorical features)
plot_feature(train_df, 'degree')

#Relation between major and salary
plot_feature(train_df, 'major')

#Relation between industry and salary
plot_feature(train_df, 'industry')

#Relation between years of experience and salary
plot_feature(train_df, 'yearsExperience')

#Relation between metropolis distance and salary
plot_feature(train_df, 'milesFromMetropolis')
    
    

NameError: name 'train_feature_df' is not defined

### ---- 5 Establish a baseline ----

In [5]:
def encode_label(df, col, ref_df=None):
    """Replace each category of ``df[col]`` with that category's mean salary.

    Parameters
    ----------
    df : DataFrame whose column ``col`` has category dtype; modified in place.
    col : name of the categorical column to encode.
    ref_df : optional DataFrame holding ``col`` and 'salary' from which the
        per-category mean salaries are computed. Defaults to the notebook's
        global ``train_df``, which is what the function always used before.

    The encoded column is cast to float so that it is numeric regardless of
    whether mapping a categorical column yields a categorical result.
    """
    if ref_df is None:
        # Backward-compatible default: means come from the global training frame.
        ref_df = train_df
    # All means are computed before the column is overwritten, so passing the
    # same frame as both `df` and `ref_df` is safe.
    mean_by_category = {
        cat: ref_df.loc[ref_df[col] == cat, 'salary'].mean()
        for cat in df[col].cat.categories
    }
    df[col] = df[col].map(mean_by_category).astype(float)

# Walk over every column of the training frame and mean-salary-encode the ones
# that currently have category dtype; all other columns are skipped.
for col in train_df.columns:
    is_category = train_df[col].dtype.name == "category"
    if not is_category:
        continue
    encode_label(train_df, col)

# Heat map of the pairwise correlations among the selected features and salary.
# jobId is left out because it is unique for each individual row.
fig = plt.figure(figsize=(12, 10))
features = ['companyId', 'jobType', 'degree', 'major', 'industry', 'yearsExperience', 'milesFromMetropolis']
corr_matrix = train_df[features + ['salary']].corr()
sns.heatmap(corr_matrix, cmap='Blues', annot=True)
plt.xticks(rotation=45)
plt.show()

### ---- 6 Hypothesize solution ----

In [None]:
#brainstorm 3 models that you think may improve results over the baseline model based
#on your EDA

Brainstorm 3 models that you think may improve results over the baseline model based on your EDA and explain why they're reasonable solutions here.

Also write down any new features that you think you should try adding to the model based on your EDA, e.g. interaction variables, summary statistics for each group, etc.

## Part 3 - DEVELOP

You will cycle through creating features, tuning models, and training/validating models (steps 7-9) until you've reached your efficacy goal

#### Your metric will be MSE and your goal is:
 - <360 for entry-level data science roles
 - <320 for senior data science roles

### ---- 7 Engineer features  ----

In [None]:
#make sure that data is ready for modeling
#create any new features needed to potentially enhance model

### ---- 8 Create models ----

In [15]:
#create and tune the models that you brainstormed during part 2

### ---- 9 Test models ----

In [1]:
#do 5-fold cross validation on models and measure MSE

### ---- 10 Select best model  ----

In [None]:
#select the model with the lowest error as your "production" model

## Part 4 - DEPLOY

### ---- 11 Automate pipeline ----

In [None]:
#write script that trains model on entire training set, saves model to disk,
#and scores the "test" dataset

### ---- 12 Deploy solution ----

In [16]:
#save your prediction to a csv file or optionally save them as a table in a SQL database
#additionally, you want to save a visualization and summary of your prediction and feature importances
#these visualizations and summaries will be extremely useful to business stakeholders

### ---- 13 Measure efficacy ----

We'll skip this step since we don't have the outcomes for the test data