In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib
from wordcloud import WordCloud
from pathlib import Path


In [2]:
#Step One - Load Data and Evaualte quality for use

In [3]:
# Load the full set of data
glassdoor_df=pd.read_csv('glassdoor.csv')
glassdoor_df

# Preview the salary data
glassdoor_df.head()

In [31]:
glassdoor_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165290 entries, 0 to 165289
Columns: 163 entries, benefits.benefitRatingDecimal to wwfu
dtypes: bool(11), float64(35), int64(23), object(94)
memory usage: 193.4+ MB


In [4]:
 # Retrieve DataFrame data types
glassdoor_df.dtypes

benefits.benefitRatingDecimal       float64
benefits.comments                   float64
benefits.highlights                 float64
benefits.numRatings                   int64
benefits.employerSummary             object
                                     ...   
salary.currency.positiveTemplate     object
salary.currency.symbol               object
salary.lastSalaryDate                object
salary.salaries                     float64
wwfu                                float64
Length: 163, dtype: object

In [9]:
# Identify Series count
glassdoor_df.count()

benefits.benefitRatingDecimal       165290
benefits.comments                   165289
benefits.highlights                 165289
benefits.numRatings                 165290
benefits.employerSummary              5749
                                     ...  
salary.currency.positiveTemplate     76777
salary.currency.symbol               76777
salary.lastSalaryDate                76777
salary.salaries                     147149
wwfu                                 18741
Length: 163, dtype: int64

In [10]:
# Identify frequency values - employer name
glassdoor_df["gaTrackerData.empName"].value_counts()

Amazon                2494
Oracle                1064
Dell Technologies      875
Hays                   817
Citi                   620
                      ... 
Adarma                   1
Sanoma Corporation       1
Eurostep                 1
ExamSoft Worldwide       1
StepStone Group          1
Name: gaTrackerData.empName, Length: 23459, dtype: int64

In [11]:
# Identify frequency values - employer name
glassdoor_df["gaTrackerData.empSize"].value_counts()

10000--1      48801
1001-5000     20852
51-200        18202
1-50          17905
201-500       12135
-1-0          10532
501-1000       9371
5001-10000     9283
Name: gaTrackerData.empSize, dtype: int64

In [17]:
# Identify frequency values - data category?
glassdoor_df["gaTrackerData.category"].value_counts()

-1        129088
 0          7047
 10007      3487
 10132      3431
 10111      2369
           ...  
 20094         1
 20102         1
 20080         1
 20061         1
 10061         1
Name: gaTrackerData.category, Length: 266, dtype: int64

In [18]:
# Identify frequency values - data category?
glassdoor_df["gaTrackerData.industry"].value_counts()

Computer Hardware & Software               13971
IT Services                                11544
Internet                                    9292
Staffing & Outsourcing                      8596
Enterprise Software & Network Solutions     8242
                                           ...  
Timber Operations                              2
Talent & Modeling Agencies                     2
Veterinary Services                            2
Parking Lots & Garages                         2
Auctions & Galleries                           1
Name: gaTrackerData.industry, Length: 137, dtype: int64

In [19]:
# Identify frequency values - data category?
glassdoor_df["gaTrackerData.jobTitle"].value_counts()

Project Manager                                               4340
Software Engineer                                             3250
Business Analyst                                              2429
Product Manager                                               2125
Data Scientist                                                1955
                                                              ... 
Lead Infrastructure Engineer                                     1
SYSTEM ENGINEER UNIX - APPLICATION and INTERNET Management       1
IT Asset Management Analyst                                      1
Data Analytic Consultant                                         1
(Senior) Consultant Data Engineering Big Data (m/w/d)            1
Name: gaTrackerData.jobTitle, Length: 66214, dtype: int64

In [12]:
 # Check for null values
glassdoor_df.isnull()

Unnamed: 0,benefits.benefitRatingDecimal,benefits.comments,benefits.highlights,benefits.numRatings,benefits.employerSummary,breadCrumbs,gaTrackerData.category,gaTrackerData.empId,gaTrackerData.empName,gaTrackerData.empSize,...,salary.currency.displayName,salary.currency.id,salary.currency.name,salary.currency.negativeTemplate,salary.currency.new,salary.currency.positiveTemplate,salary.currency.symbol,salary.lastSalaryDate,salary.salaries,wwfu
0,False,False,False,False,True,False,False,False,False,False,...,True,True,True,True,True,True,True,True,False,True
1,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,False,False,False,False,True,False,False,False,False,False,...,True,True,True,True,True,True,True,True,False,True
3,False,False,False,False,True,False,False,False,False,False,...,True,True,True,True,True,True,True,True,False,True
4,False,False,False,False,True,False,False,False,False,False,...,True,True,True,True,True,True,True,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165285,False,False,False,False,True,False,False,False,False,False,...,True,True,True,True,True,True,True,True,False,True
165286,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
165287,False,False,False,False,True,False,False,False,False,False,...,True,True,True,True,True,True,True,True,False,True
165288,False,False,False,False,True,False,False,False,False,False,...,True,True,True,True,True,True,True,True,False,True


In [13]:
 # Check duplicates
glassdoor_df.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
165285    False
165286    False
165287    False
165288    False
165289    False
Length: 165290, dtype: bool

In [16]:
glassdoor_df.describe()

Unnamed: 0,benefits.benefitRatingDecimal,benefits.comments,benefits.highlights,benefits.numRatings,breadCrumbs,gaTrackerData.category,gaTrackerData.empId,gaTrackerData.industryId,gaTrackerData.jobId.long,gaTrackerData.jobId.int,...,reviews,salary.country.continent.id,salary.country.currency.defaultFractionDigits,salary.country.currency.id,salary.country.id,salary.country.population,salary.currency.defaultFractionDigits,salary.currency.id,salary.salaries,wwfu
count,165290.0,165289.0,165289.0,165290.0,165290.0,165290.0,165290.0,165290.0,165044.0,246.0,...,165290.0,76777.0,76777.0,76777.0,76777.0,76777.0,76777.0,76777.0,147149.0,18741.0
mean,1.38769,82645.0,82645.0,17.650342,82645.5,2476.188614,694581.2,142681.566731,3329226000.0,1754135000.0,...,82645.5,4.0,2.0,2.0,2.0,60363602.0,2.0,2.037238,73575.0,9371.0
std,1.826381,47714.968659,47714.968659,49.901598,47715.257335,6098.395811,886207.5,90479.876651,116765700.0,347324600.0,...,47715.257335,0.0,0.0,0.0,0.0,0.0,0.0,1.292106,42478.401718,5410.205033
min,0.0,1.0,1.0,0.0,1.0,-1.0,0.0,0.0,2208549000.0,327694900.0,...,1.0,4.0,2.0,2.0,2.0,60363602.0,2.0,1.0,1.0,1.0
25%,0.0,41323.0,41323.0,0.0,41323.25,-1.0,5632.0,0.0,3303018000.0,1756631000.0,...,41323.25,4.0,2.0,2.0,2.0,60363602.0,2.0,2.0,36788.0,4686.0
50%,0.0,82645.0,82645.0,0.0,82645.5,-1.0,304392.0,200048.0,3370946000.0,1904030000.0,...,82645.5,4.0,2.0,2.0,2.0,60363602.0,2.0,2.0,73575.0,9371.0
75%,3.504717,123967.0,123967.0,6.0,123967.75,-1.0,1107358.0,200063.0,3399873000.0,1957060000.0,...,123967.75,4.0,2.0,2.0,2.0,60363602.0,2.0,2.0,110362.0,14056.0
max,5.0,165289.0,165289.0,596.0,165290.0,30165.0,3102014.0,200147.0,3412782000.0,2022056000.0,...,165290.0,4.0,2.0,2.0,2.0,60363602.0,2.0,47.0,147149.0,18741.0


In [14]:
# got to https://www.kaggle.com/andresionek/data-jobs-listings-glassdoor?select=glassdoor.csv to see full description for each column

In [15]:
 # Drop unnecessary columns and rows to shrink data set
    #columns to drop and joins
    #drops
    #breadCrumbs - don't seem to need table
    #gaTrackerData.expired - don't seem to need table

    #joins
    #benefits.comments - ID to glassdoor_benefits_comments.csv table
    


In [21]:
# Explore job titles - column exploration
glassdoor_df["gaTrackerData.jobTitle"].value_counts()

Project Manager                                               4340
Software Engineer                                             3250
Business Analyst                                              2429
Product Manager                                               2125
Data Scientist                                                1955
                                                              ... 
Lead Infrastructure Engineer                                     1
SYSTEM ENGINEER UNIX - APPLICATION and INTERNET Management       1
IT Asset Management Analyst                                      1
Data Analytic Consultant                                         1
(Senior) Consultant Data Engineering Big Data (m/w/d)            1
Name: gaTrackerData.jobTitle, Length: 66214, dtype: int64

In [22]:
# Explore job titles - column exploration
glassdoor_df["header.jobTitle"].value_counts()

Project Manager                                               4340
Software Engineer                                             3250
Business Analyst                                              2429
Product Manager                                               2125
Data Scientist                                                1955
                                                              ... 
Lead Infrastructure Engineer                                     1
SYSTEM ENGINEER UNIX - APPLICATION and INTERNET Management       1
IT Asset Management Analyst                                      1
Data Analytic Consultant                                         1
(Senior) Consultant Data Engineering Big Data (m/w/d)            1
Name: header.jobTitle, Length: 66214, dtype: int64

In [24]:
# Explore job titles - column exploration
glassdoor_df["header.normalizedJobTitle"].value_counts()

data analyst                              1864
project manager                           1856
business analyst                          1752
data scientist                            1625
software engineer                         1584
                                          ... 
Senior propriétaire du produit Manager       1
content analyst                              1
energie-ingenieur                            1
junior software developer                    1
junior sales analyst                         1
Name: header.normalizedJobTitle, Length: 2079, dtype: int64

In [26]:
# Explore countries - column exploration
glassdoor_df["map.country"].value_counts()

BE                      4587
IN                      4147
NL                      4134
DE                      3829
FR                      3640
                        ... 
France-Saint-Germain       1
HONG KONG                  1
MY - Malaysia              1
ZM                         1
CM                         1
Name: map.country, Length: 503, dtype: int64

In [27]:
# Explore countries - column exploration
glassdoor_df["rating.starRating"].value_counts()

-0.10    15184
 3.80    14606
 4.00    11077
 3.60     9971
 3.70     9575
         ...  
 1.85        1
 4.64        1
 3.87        1
 2.69        1
 3.09        1
Name: rating.starRating, Length: 189, dtype: int64

In [28]:
# Load the review data
reviews_df=pd.read_csv('glassdoor_reviews.csv')
reviews_df

# Preview the data
reviews_df.head()
#to analyze and model sentiment features of job reviews that are predicitive of job score

Unnamed: 0,id,index,reviews.val.cons,reviews.val.date,reviews.val.featured,reviews.val.helpfulCount,reviews.val.id,reviews.val.pros,reviews.val.publishedOn,reviews.val.publisher,...,reviews.val.reviewerJobTitle,reviews.val.reviewerLocation,reviews.val.reviewerStatus,reviews.val.summaryPoints.ceoApproval,reviews.val.summaryPoints.outlook,reviews.val.summaryPoints.recommend,reviews.val.title,reviews.val.adviceToManagement,reviews.val.companyResponse,reviews.val.reviewResponses
0,44001,0.0,Still not big enough in market place,"Oct 20, 2010",False,0.0,689142.0,"Great brand , Good leadership , Clear business...","Oct 20, 2010",Director,...,Director,Tokyo (Japan),Former Employee,1.0,0.0,1.0,American Express Japan - Excellent place to work,,,
1,44002,,,,,,,,,,...,,,,,,,,,,
2,44003,,,,,,,,,,...,,,,,,,,,,
3,44004,0.0,Nothing important on my point of view.,4 weeks ago,False,0.0,29659857.0,"Learn new technologies, helpful people, good m...",4 weeks ago,Software Engineer(Internship),...,Software Engineer(Internship),Alexandria (Egypt),Former Employee,1.0,1.0,1.0,one of the best places to be intern in,,,
4,44004,1.0,Alot of friends working together which isn't v...,"May 29, 2019",False,0.0,26381884.0,Very good opportunities to learn technologies,"May 29, 2019",Network Engineer,...,Network Engineer,Alexandria (Egypt),Current Employee,1.0,1.0,1.0,Very Good Company,,,


In [30]:
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422180 entries, 0 to 422179
Data columns (total 28 columns):
 #   Column                                         Non-Null Count   Dtype  
---  ------                                         --------------   -----  
 0   id                                             422180 non-null  int64  
 1   index                                          343221 non-null  float64
 2   reviews.val.cons                               343214 non-null  object 
 3   reviews.val.date                               343221 non-null  object 
 4   reviews.val.featured                           343221 non-null  object 
 5   reviews.val.helpfulCount                       343221 non-null  float64
 6   reviews.val.id                                 343221 non-null  float64
 7   reviews.val.pros                               343221 non-null  object 
 8   reviews.val.publishedOn                        343221 non-null  object 
 9   reviews.val.publisher                

In [29]:
# Load the salary data
salary_df=pd.read_csv('glassdoor_salary_salaries.csv')
salary_df

# Preview the salary data
salary_df.head()
#to analyze job score of job title by salary (normalized by currency or US/CAD only)

Unnamed: 0,id,index,salary.salaries.val.basePayCount,salary.salaries.val.jobTitle,salary.salaries.val.payPeriod,salary.salaries.val.salaryPercentileMap.payPercentile10,salary.salaries.val.salaryPercentileMap.payPercentile90,salary.salaries.val.salaryPercentileMap.payPercentile50,salary.salaries.val.salaryType
0,1,,,,,,,,
1,2,0.0,9.0,Advanced Consultant,ANNUAL,32257.68,41971.34,38713.25,employer
2,2,1.0,7.0,Consultant,ANNUAL,28855.3,74022.0,38602.2,employer
3,2,2.0,7.0,Software Engineer,ANNUAL,30377.86,38750.0,35032.53,employer
4,2,3.0,4.0,Project Manager,ANNUAL,38254.7,48675.67,45636.77,employer
