## Predicting FinTech Bootcamp Graduate Salaries
##### Project 2 - Group C (Andrew, Margee, Rachel, Jinhyeong)
##### Presentation Date: January 15, 2022

#### Data Preparation

In [61]:
# Imports
import json
import pandas as pd
import numpy as np
import warnings
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
#from matplotlib import pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
warnings.filterwarnings("ignore")


In [32]:
# Create File Path
filepath = "/Users/andrewcrawford/Desktop/Project2_GroupC/"
filename = "monster_india_latest_jobs_free_dataset.json"

# Opening JSON file
f = open(filepath + filename)

# returns JSON object as a dictionary
data = json.load(f)

In [33]:
# Create data list
address_country = []
address_locality = []
address_region = []
company = []
description = []
industry = []
postal_code = []
posted_at = []
salary = []  # the target
salary_type = []
skills = []
title = []
url = []

# iterate through each element in the data list
for each in data:

    # append values to list, given each key
    address_country.append(each["address_country"])
    address_locality.append(each["address_locality"])
    address_region.append(each["address_region"])
    company.append(each["company"])
    description.append(each["description"])
    industry.append(each["industry"])
    postal_code.append(each["postal_code"])
    posted_at.append(each["posted_at"])
    salary.append(each["salary"])
    salary_type.append(each["salary_type"])
    skills.append(each["skills"]) 
    title.append(each["title"])
    url.append(each["url"])

In [34]:
# create df, using string header and list as rows
df = pd.DataFrame({"address_country": address_country
                   ,"address_locality": address_locality
                   ,"address_region": address_region
                   ,"company": company
                   ,"description": description
                   ,"industry": industry
                   ,"postal_code": postal_code
                   ,"posted_at": posted_at
                   ,"salary": salary
                   ,"salary_type": salary_type
                   ,"skills": skills
                   ,"title": title
                   ,"url": url
                  })

df["description"] = df["description"].str.lower()

In [35]:
# Preview dataframe
df.head()

Unnamed: 0,address_country,address_locality,address_region,company,description,industry,postal_code,posted_at,salary,salary_type,skills,title,url
0,IN,India,India,Kotak Mahindra Bank Limited,job description : - . client engagement o...,Banking/Accounting/Financial Services,,2021-09-16,Not disclosed,P.A.,Service Relationship Manager-POS-BRANCH BANKIN...,Service Relationship Manager-POS-BRANCH BANKIN...,https://www.monsterindia.com/job/service-relat...
1,IN,Noida,Uttar Pradesh,ANI Calls India Private Limited,job description : . axway b2bi or axway gatew...,IT/Computers - Software,,2021-09-13,1600000,P.A.,Axway B2B EDI Integration consultant,Axway B2B EDI Integration consultant,https://www.monsterindia.com/job/axway-b2b-edi...
2,IN,Chennai,Tamil Nadu,PayPal,job description : job description fueled by a...,Banking/Accounting/Financial Services,,2021-08-18,Not disclosed,P.A.,Data Engineer/MTS 2,"MTS 2, Data Engineer",https://www.monsterindia.com/job/mts-2-data-en...
3,IN,Bengaluru / Bangalore,Karnataka,ANI Calls India Private Limited,job description : at least three years weblog...,IT/Computers - Software,,2021-08-26,1800000,P.A.,Senior Oracle Fusion Middleware developer,Senior Oracle Fusion Middleware developer,https://www.monsterindia.com/job/senior-oracle...
4,IN,India,India,ANI Calls India Private Limited,job description : • linux engineer with ksh s...,IT/Computers - Software,,2021-09-08,1400000,P.A.,Korn Shell (KSH) - Linux System Engineer,Korn Shell (KSH) - Linux System Engineer,https://www.monsterindia.com/job/korn-shell-ks...


In [36]:
# Clean data, drop unnecessary columns
df.drop(['address_country', 'address_locality', 'address_region', 'postal_code', 'salary_type', 'skills', 'url'], axis=1, inplace=True)

In [37]:
df.head()

Unnamed: 0,company,description,industry,posted_at,salary,title
0,Kotak Mahindra Bank Limited,job description : - . client engagement o...,Banking/Accounting/Financial Services,2021-09-16,Not disclosed,Service Relationship Manager-POS-BRANCH BANKIN...
1,ANI Calls India Private Limited,job description : . axway b2bi or axway gatew...,IT/Computers - Software,2021-09-13,1600000,Axway B2B EDI Integration consultant
2,PayPal,job description : job description fueled by a...,Banking/Accounting/Financial Services,2021-08-18,Not disclosed,"MTS 2, Data Engineer"
3,ANI Calls India Private Limited,job description : at least three years weblog...,IT/Computers - Software,2021-08-26,1800000,Senior Oracle Fusion Middleware developer
4,ANI Calls India Private Limited,job description : • linux engineer with ksh s...,IT/Computers - Software,2021-09-08,1400000,Korn Shell (KSH) - Linux System Engineer


In [38]:
# Drop any salaries that are Not disclosed
df = df[df["salary"] != "Not disclosed"]
df.head()

Unnamed: 0,company,description,industry,posted_at,salary,title
1,ANI Calls India Private Limited,job description : . axway b2bi or axway gatew...,IT/Computers - Software,2021-09-13,1600000,Axway B2B EDI Integration consultant
3,ANI Calls India Private Limited,job description : at least three years weblog...,IT/Computers - Software,2021-08-26,1800000,Senior Oracle Fusion Middleware developer
4,ANI Calls India Private Limited,job description : • linux engineer with ksh s...,IT/Computers - Software,2021-09-08,1400000,Korn Shell (KSH) - Linux System Engineer
5,ANI Calls India Private Limited,job description : . oracle ebs apps dba imple...,IT/Computers - Software,2021-09-16,1400000,Oracle Cloud Infrastructure
7,Live Connections,requirement mulesoft developer,Recruitment/Staffing/RPO,2021-08-30,2500000,Mulesoft Developer


In [39]:
# Check if there are any NAs/nulls
df.isnull().sum()

company        0
description    0
industry       0
posted_at      0
salary         0
title          0
dtype: int64

In [40]:
# Creating Keywords for Searching
keywords_sql = "sql|oracle|database|db|dba|plsql|tsql|postgres|datawarehouse|datawarehousing|mssql|mysql|db2|mongodb"
keywords_python = "python|scripting"
keywords_api = "api"
keywords_algorithm = "algorithm|algorithmic|quant"
keywords_aws = "aws|amazon web services"
keywords_forecast = "forecast|predict"
keywords_bigdata = "bigdata|big data"

In [41]:
# Searching for keyword skills and updating columns from 0s to 1s as applicable
df["sql"] = np.where(df["description"].str.contains(keywords_sql), 1, 0)
df["python"] = np.where(df["description"].str.contains(keywords_python), 1, 0)
df["api"] = np.where(df["description"].str.contains(keywords_api), 1, 0)
df["algorithm"] = np.where(df["description"].str.contains(keywords_algorithm), 1, 0)
df["aws"] = np.where(df["description"].str.contains(keywords_aws), 1, 0)
df["forecast"] = np.where(df["description"].str.contains(keywords_forecast), 1, 0)
df["bigdata"] = np.where(df["description"].str.contains(keywords_bigdata), 1, 0)

In [42]:
# Show updated dataframe
df.head()

Unnamed: 0,company,description,industry,posted_at,salary,title,sql,python,api,algorithm,aws,forecast,bigdata
1,ANI Calls India Private Limited,job description : . axway b2bi or axway gatew...,IT/Computers - Software,2021-09-13,1600000,Axway B2B EDI Integration consultant,0,0,0,0,0,0,0
3,ANI Calls India Private Limited,job description : at least three years weblog...,IT/Computers - Software,2021-08-26,1800000,Senior Oracle Fusion Middleware developer,1,0,0,0,0,0,0
4,ANI Calls India Private Limited,job description : • linux engineer with ksh s...,IT/Computers - Software,2021-09-08,1400000,Korn Shell (KSH) - Linux System Engineer,0,1,0,0,0,0,0
5,ANI Calls India Private Limited,job description : . oracle ebs apps dba imple...,IT/Computers - Software,2021-09-16,1400000,Oracle Cloud Infrastructure,1,0,0,0,1,0,0
7,Live Connections,requirement mulesoft developer,Recruitment/Staffing/RPO,2021-08-30,2500000,Mulesoft Developer,0,0,0,0,0,0,0


In [43]:
# Drop all rows where all keywords equal 0
df_updated = df[~(df[['sql','python','api','algorithm','aws','forecast','bigdata']] == 0).all(axis=1)]
df_updated.reset_index(drop=True, inplace=True)
df_updated.head()

Unnamed: 0,company,description,industry,posted_at,salary,title,sql,python,api,algorithm,aws,forecast,bigdata
0,ANI Calls India Private Limited,job description : at least three years weblog...,IT/Computers - Software,2021-08-26,1800000,Senior Oracle Fusion Middleware developer,1,0,0,0,0,0,0
1,ANI Calls India Private Limited,job description : • linux engineer with ksh s...,IT/Computers - Software,2021-09-08,1400000,Korn Shell (KSH) - Linux System Engineer,0,1,0,0,0,0,0
2,ANI Calls India Private Limited,job description : . oracle ebs apps dba imple...,IT/Computers - Software,2021-09-16,1400000,Oracle Cloud Infrastructure,1,0,0,0,1,0,0
3,ANI Calls India Private Limited,job description : . principal engineer cloud ...,IT/Computers - Software,2021-08-23,1400000,Infrastructure as a Service (IaaS),1,0,0,0,0,0,0
4,ANI Calls India Private Limited,job description : . extensive enterprise cons...,IT/Computers - Software,2021-08-30,1800000,SFDC Architect consultant,0,0,1,0,0,0,0


In [44]:
# Convert salary data type before currency conversion
df_updated['salary'] = df_updated['salary'].astype('int')
df_updated.dtypes

company        object
description    object
industry       object
posted_at      object
salary          int64
title          object
sql             int64
python          int64
api             int64
algorithm       int64
aws             int64
forecast        int64
bigdata         int64
dtype: object

In [45]:
# Convert Indian rupees to US dollars (conversion rate is as of 1/8/2022)
df_updated['salary'] = round(df_updated['salary']/74.28, 2)
df_updated.drop(columns=['company', 'description', 'industry', 'posted_at', 'title'], inplace=True)
df_updated.head()

Unnamed: 0,salary,sql,python,api,algorithm,aws,forecast,bigdata
0,24232.63,1,0,0,0,0,0,0
1,18847.6,0,1,0,0,0,0,0
2,18847.6,1,0,0,0,1,0,0
3,18847.6,1,0,0,0,0,0,0
4,24232.63,0,0,1,0,0,0,0


In [46]:
# Determine data count
df_updated.shape

(177, 8)

In [47]:
# Review new data set
df_updated.describe()

Unnamed: 0,salary,sql,python,api,algorithm,aws,forecast,bigdata
count,177.0,177.0,177.0,177.0,177.0,177.0,177.0,177.0
mean,19003.448927,0.59887,0.259887,0.276836,0.101695,0.135593,0.050847,0.056497
std,8038.691075,0.491518,0.439817,0.448704,0.303104,0.343327,0.220309,0.231534
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,16155.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,18847.6,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,21540.12,1.0,1.0,1.0,0.0,0.0,0.0,0.0
max,46445.88,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [49]:
# TUESDAY CLASS - MY EDITS BEGIN HERE

# Now we need to create a column at the end that totals up the number of 1's in each row
# This will help us determine if higher salaries correspond with more skills
# New dataframe will simply be two columns - salary and total_skillls

df_updated['total_skills'] = df_updated[['sql','python','api','algorithm','aws','forecast','bigdata']].sum(axis=1)

In [50]:
# Preview updated dataframe with total_skills column
df_updated

Unnamed: 0,salary,sql,python,api,algorithm,aws,forecast,bigdata,total_skills
0,24232.63,1,0,0,0,0,0,0,1
1,18847.60,0,1,0,0,0,0,0,1
2,18847.60,1,0,0,0,1,0,0,2
3,18847.60,1,0,0,0,0,0,0,1
4,24232.63,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
172,24232.63,0,1,0,0,0,0,0,1
173,24232.63,1,1,0,0,0,0,0,2
174,33656.44,1,0,0,0,0,0,0,1
175,201.94,0,0,1,0,0,0,0,1


In [51]:
# Now we drop the other columns
# df_updated.drop(columns=['sql','python','api','algorithm','aws','forecast','bigdata'], inplace=True)

# No need to drop - change the rolling window to be a standard 70/30 split. 

In [None]:
# Preview new dataframe
df_updated.describe()

#### Training and Testing

In [None]:
# Next step is to train/test the data!
# salary = y, dummy columns (SQL, etc) are x, split data, train on first 70% of observations with number regression
# Run model on test, see if it performs
# Regression - Predict salary based on skills learned in class OR Classification - cutoff point for salary
# Optional Chatbot of what skills do you possess = salary range in rupees

In [52]:
# Set the random seed
from numpy.random import seed
seed(1)
from tensorflow import random
random.set_seed(2)

In [57]:
# Assign X
X = df_updated[['sql','python','api', 'algorithm','aws', 'forecast','bigdata','total_skills']]
y = df_updated['salary']
X.head(10)

Unnamed: 0,sql,python,api,algorithm,aws,forecast,bigdata,total_skills
0,1,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,1
2,1,0,0,0,1,0,0,2
3,1,0,0,0,0,0,0,1
4,0,0,1,0,0,0,0,1
5,0,0,1,0,0,0,0,1
6,1,0,1,0,0,0,0,2
7,0,1,0,0,0,0,0,1
8,1,0,0,0,0,0,0,1
9,1,0,1,0,0,0,0,2


In [69]:
x_train, x_test,y_train,y_test = train_test_split(X,y,train_size=0.7, test_size =0.3, random_state = 100)

# print the data
x_train

Unnamed: 0,sql,python,api,algorithm,aws,forecast,bigdata,total_skills
156,0,0,1,1,0,1,0,3
12,0,0,0,0,0,1,0,1
16,0,1,0,0,0,0,0,1
77,0,0,1,0,0,0,0,1
162,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...
87,1,0,0,0,0,0,0,1
103,0,0,1,0,0,0,0,1
67,1,1,0,1,1,0,1,5
24,1,0,0,0,0,0,0,1


In [63]:
clf = LinearRegression()

In [64]:
clf.fit(x_train,y_train)

LinearRegression()

In [65]:
clf.predict(x_test)

array([18133.53937019, 18520.58895691, 18565.94032005, 24795.7632602 ,
       18565.94032005, 18565.94032005, 18565.94032005, 24795.7632602 ,
       18133.53937019, 17721.91409378, 18133.53937019, 14768.84323682,
       18565.94032005, 17721.91409378, 18565.94032005, 18565.94032005,
       14357.21796042, 18749.07727668, 18565.94032005, 20071.04912865,
       25411.3011667 , 24567.27494043, 18565.94032005, 18133.53937019,
       18337.45200028, 17721.91409378, 18565.94032005, 19455.51122216,
       20071.04912865, 18565.94032005, 18133.53937019, 18565.94032005,
       18565.94032005, 18749.07727668, 20071.04912865, 25182.81284693,
       18565.94032005, 18337.45200028, 20025.69776552, 17721.91409378,
       18749.07727668, 19455.51122216, 17721.91409378, 19447.81724306,
       19842.56080888, 18565.94032005, 13970.16837369, 18133.53937019,
       24567.27494043, 24795.7632602 , 17721.91409378, 18565.94032005,
       18565.94032005, 24795.7632602 ])

In [66]:
clf.score(x_test,y_test)

-0.35260466277981917

In [None]:
# Use 70% of the data for training and the remainder for testing

split = int(0.7 * len(X))

X_train = X[: split]
X_test = X[split:]

y_train = y[: split]
y_test = y[split:]
split

In [None]:
# Use the MinMaxScaler to scale data between 0 and 1.

# Create a MinMaxScaler object
scaler = MinMaxScaler()

# Fit the MinMaxScaler object with the features data X
scaler.fit(X_train)

# Scale the features training and testing sets
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit the MinMaxScaler object with the target data Y
scaler.fit(y_train)

# Scale the target training and testing sets
y_train = scaler.transform(y_train)
y_test = scaler.transform(y_test)

In [None]:
# Reshape the features for the model
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Print some sample data after reshaping the datasets
print (f"X_train sample values:\n{X_train[:3]} \n")
print (f"X_test sample values:\n{X_test[:3]}")

### Build and Train the Model

In [None]:
# Compile the model
model.compile(optimizer="adam", loss="mean_squared_error")

In [None]:
# Summarize the model
model.summary()

In [None]:
# Train the model
# Use at least 10 epochs
# Do not shuffle the data
# Experiment with the batch size, but a smaller batch size is recommended

model.fit(X_train, y_train, epochs=10, shuffle=False, batch_size=90, verbose=1)

#### Model Performance

In [68]:
# Evaluate the model
model.evaluate(X_test, y_test, verbose=0)

AttributeError: 'LinearRegression' object has no attribute 'evaluate'

In [None]:
# Make some predictions
predicted = model.predict(X_test)

In [None]:
# Recover the original prices instead of the scaled version
predicted_salary = scaler.inverse_transform(predicted)
real_salary = scaler.inverse_transform(y_test.reshape(-1, 1))

In [None]:
# Create a DataFrame of Real and Predicted values
salaries = pd.DataFrame({
    "Actual": real_salary.ravel(),
    "Predicted": predicted_salary.ravel()
}, index = df_updated.index[-len(real_salary): ]) 

# Show the DataFrame's head
salaries.head()

In [None]:
# Plot the real vs predicted values as a line chart
salaries.plot(title="Actual Vs. Predicted Salaries")

### DASHBOARD

#### We created a dashboard for the presentation. The dashboard was saved as an html.

##### First Step: Create Markdowns for Overview Tabs

In [None]:
markdown_titlepage = """

# Predicting FinTech Bootcamp Graduate Salaries  
  
Presentation Date: January 15, 2022 
    
Prepared by Andrew, Rachel, Margee, and Jinhyeong  
  
![image](./images/XXXX.png)

"""

In [None]:
markdown_intro = """
# Project Objective  
  
Our project is to research and learn what salaries are possible with skills obtained in this FinTech Bootcamp.  
We will be using the Monster India API which includes salaries in rupees since the US-based API required a  
significant fee to use.  We will use the API to retrieve data regarding job descriptions, skills, and salaries  
to determine the following:  
- What jobs require skills from this bootcamp?  
- What are the potential jobs and salaries for graduates with these skills?  
- Can we predict an accurate salary range based on these skills?  

  
![image](./images/XXXX.png)
"""

In [None]:
markdown_background1 = """
# What Skills Have We Gained as FinTech Bootcamp Students?
  
As part of the U of MN FinTech Bootcamp, we have obtained the necessary skills to automate  
and improve financial services using cutting-edge technology. Skills gained inclde the following:
  
![image](./images/Skills.png)

"""

In [None]:
markdown_background2 = """
# Background of Key Skills Used During This Project
  
SQL  
Python  
API  
Algorithm  
AWS  
Forecast  
Big data  
  
  
![image](./XX.png)

"""

In [None]:
markdown_end = """

# Conclusion  
  
Based on our analysis, FinTech Bootcamp graduates can potentially earn a salary between X and X.
  

  
  
![image](./images/XX.png)
"""

In [None]:
dataphases = """

# Data Phases  
  
Our process consisted of a data exploration, cleanup, and analysis phases.  
  
>- Exploration: Google seaches, API searches, Monster India API, FinTech Bootcamp Curriculum  
>- Cleanup: Searching through variables in Spyder, selecting key data, dropping unnecessary information, narrowing down jobs
>- Analysis: Looking through the data to find jobs that include at least one key skill
>- Training/Testing:
>- Create Model to Predict Salary based on Skills:
>- Fun ChatBot:
  
"""

##### Second Step: Fix any dashboard issues

In [None]:
#Some of our plots did not translate well in the dashboard, so we converted them to pngs and used the image
plot_1 ="""
![image](./images/plot_1.png)
"""

##### Third Step: Create the Dashboard

In [None]:
#Create Dashboard
pn.Tabs(
    ("XX",markdown_titlepage),
    ("XX",markdown_background1),
    ("XX",markdown_background2),
    ("XX",markdown_background3),
    ("XX",markdown_background4),
    ("XX",visualoverview),
    ("XX",plot_1),
    ("XX",plot_2),
    ("XX",plot_3),
    ("XX",plot_4),
    ("XX",plot_5),
    ("XX",plot_6),
    ("XX",plot_7),
    ("XX",markdown_end)
       )

##### Final Step: Save to html for presentation

In [None]:
#Create an html of the dashboard for presentation purposes once finalized
pn.Tabs(
    ("XX",markdown_titlepage),
    ("XX",markdown_background1),
    ("XX",markdown_background2),
    ("XX",markdown_background3),
    ("XX",markdown_background4),
    ("XX",visualoverview),
    ("XX",plot_1),
    ("XX",plot_2),
    ("XX",plot_3),
    ("XX",plot_4),
    ("XX",plot_5),
    ("XX",plot_6),
    ("XX",plot_7),
    ("XX",markdown_end)
       ).save("Dashboard_Presentation.html", embed=True)