# Exploratory Data Analysis

## Import Libraries

In [2]:
import pandas as pd

## Load Dataset

In [72]:
post_df = pd.read_csv('data/job_postings.csv')
key_info_post_df = post_df[['job_link', 'job_title', 'company', 'job_location', 'first_seen', 'search_city', 'search_country', 'job_type']].copy()
key_info_post_df.head()

Unnamed: 0,job_link,job_title,company,job_location,first_seen,search_city,search_country,job_type
0,https://www.linkedin.com/jobs/view/senior-mach...,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",2024-01-14,East Haven,United States,Onsite
1,https://www.linkedin.com/jobs/view/principal-s...,"Principal Software Engineer, ML Accelerators",Aurora,"San Francisco, CA",2024-01-14,El Cerrito,United States,Onsite
2,https://www.linkedin.com/jobs/view/senior-etl-...,Senior ETL Data Warehouse Specialist,Adame Services LLC,"New York, NY",2024-01-14,Middletown,United States,Onsite
3,https://www.linkedin.com/jobs/view/senior-data...,Senior Data Warehouse Developer / Architect,Morph Enterprise,"Harrisburg, PA",2024-01-12,Lebanon,United States,Onsite
4,https://www.linkedin.com/jobs/view/lead-data-e...,Lead Data Engineer,Dice,"Plano, TX",2024-01-14,McKinney,United States,Onsite


In [73]:
skill_df = pd.read_csv('data/job_skills.csv')
skill_df.head()

Unnamed: 0,job_link,job_skills
0,https://www.linkedin.com/jobs/view/senior-mach...,"Machine Learning, Programming, Python, Scala, ..."
1,https://www.linkedin.com/jobs/view/principal-s...,"C++, Python, PyTorch, TensorFlow, MXNet, CUDA,..."
2,https://www.linkedin.com/jobs/view/senior-etl-...,"ETL, Data Integration, Data Transformation, Da..."
3,https://www.linkedin.com/jobs/view/senior-data...,"Data Lakes, Data Bricks, Azure Data Factory Pi..."
4,https://www.linkedin.com/jobs/view/lead-data-e...,"Java, Scala, Python, RDBMS, NoSQL, Redshift, S..."


In [74]:
merged_df = key_info_post_df.merge(skill_df, on='job_link')
merged_df.head()

Unnamed: 0,job_link,job_title,company,job_location,first_seen,search_city,search_country,job_type,job_skills
0,https://www.linkedin.com/jobs/view/senior-mach...,Senior Machine Learning Engineer,Jobs for Humanity,"New Haven, CT",2024-01-14,East Haven,United States,Onsite,"Machine Learning, Programming, Python, Scala, ..."
1,https://www.linkedin.com/jobs/view/principal-s...,"Principal Software Engineer, ML Accelerators",Aurora,"San Francisco, CA",2024-01-14,El Cerrito,United States,Onsite,"C++, Python, PyTorch, TensorFlow, MXNet, CUDA,..."
2,https://www.linkedin.com/jobs/view/senior-etl-...,Senior ETL Data Warehouse Specialist,Adame Services LLC,"New York, NY",2024-01-14,Middletown,United States,Onsite,"ETL, Data Integration, Data Transformation, Da..."
3,https://www.linkedin.com/jobs/view/senior-data...,Senior Data Warehouse Developer / Architect,Morph Enterprise,"Harrisburg, PA",2024-01-12,Lebanon,United States,Onsite,"Data Lakes, Data Bricks, Azure Data Factory Pi..."
4,https://www.linkedin.com/jobs/view/lead-data-e...,Lead Data Engineer,Dice,"Plano, TX",2024-01-14,McKinney,United States,Onsite,"Java, Scala, Python, RDBMS, NoSQL, Redshift, S..."


In [75]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12217 entries, 0 to 12216
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   job_link        12217 non-null  object
 1   job_title       12217 non-null  object
 2   company         12217 non-null  object
 3   job_location    12216 non-null  object
 4   first_seen      12217 non-null  object
 5   search_city     12217 non-null  object
 6   search_country  12217 non-null  object
 7   job_type        12217 non-null  object
 8   job_skills      12212 non-null  object
dtypes: object(9)
memory usage: 954.5+ KB


## Filter Jobs with Data Science/Scientist Title

In [76]:
titles = ['Data Sci']
filter_jobs = merged_df[(merged_df['job_title'].str.contains('|'.join(titles))) & (merged_df['search_country'] == 'United States')].copy()
len(filter_jobs)

880

In [65]:
filter_jobs['job_title'].value_counts()

Data Scientist                             106
Senior Data Scientist                       98
Lead Data Scientist                         37
Sr. Data Scientist                          21
Sr Data Scientist                           18
                                          ... 
Data Scientist III (Applied Research)        1
Data Scientist - Back End Data Engineer      1
Machine Learning Data Scientist              1
Data Science Developer                       1
Senior Data Scientist - Statistics           1
Name: job_title, Length: 448, dtype: int64

### Group by Job Title

In [115]:
pr_he_ma_df = filter_jobs[filter_jobs['job_title'].str.contains('|'.join(['Principal', 'Head', 'Manager']))].copy()
len(pr_he_ma_df), pr_he_ma_df['job_title'].value_counts()

(106,
 Data Science Manager, Growth                               9
 Data Science Manager                                       8
 Principal Data Scientist                                   8
 Senior Manager, Data Science                               4
 Data Scientist, Senior Manager                             4
                                                           ..
 Principal Statistician/Data Scientist                      1
 Principal Clinical Data Scientist - Focus on Statistics    1
 Senior Manager, Data Science, Imaging AI                   1
 Senior Manager Data Science (multiple openings) - IHM      1
 Principal Data Scientist, Community                        1
 Name: job_title, Length: 68, dtype: int64)

In [116]:
ds_df = filter_jobs[filter_jobs['job_title'].str.contains('|'.join(['^Data Sci', 'Staff Data Sci']))].copy()
ds_df = ds_df[~ds_df.isin(pr_he_ma_df)].dropna(how='all')
len(ds_df), ds_df['job_title'].value_counts()

(251,
 Data Scientist                                                  106
 Data Scientist/Senior Data Scientist                             12
 Staff Data Scientist                                              8
 Data Scientist – Fraud Specialist                                 5
 Data Scientist Lead - Property & Casualty Loss/Risk Modeling      5
                                                                ... 
 Data Scientist - Back End Data Engineer                           1
 Data Science Developer                                            1
 Data Science and Analyst                                          1
 Data Scientist, Supply and Operation Technology                   1
 Staff Data Scientist - LLM                                        1
 Name: job_title, Length: 97, dtype: int64)

In [117]:
sr_ds_df = filter_jobs[filter_jobs['job_title'].str.contains('|'.join(['^Se', '^Sr']))].copy()
sr_ds_df = sr_ds_df[~sr_ds_df.isin(pr_he_ma_df)].dropna(how='all')
len(sr_ds_df), sr_ds_df['job_title'].value_counts()

(307,
 Senior Data Scientist                                                             98
 Sr. Data Scientist                                                                21
 Sr Data Scientist                                                                 18
 Senior Data Scientist with Security Clearance                                      8
 Senior Data Scientist, Product Growth                                              5
                                                                                   ..
 Sr. Data Scientist, Amazon Robotics (AR)                                           1
 Sr. Data Scientist (1020250)                                                       1
 Sr. Human Resource Business Partner (Technology, Engineering and Data Science)     1
 Senior Associate Data Scientist - Machine Learning, Financial Services             1
 Senior Data Scientist - Statistics                                                 1
 Name: job_title, Length: 136, dtype: int64)

In [118]:
le_ds_df = filter_jobs[filter_jobs['job_title'].str.contains('|'.join(['^Lead']))].copy()
le_ds_df = le_ds_df[~le_ds_df.isin(pr_he_ma_df)].dropna(how='all')
len(le_ds_df), le_ds_df['job_title'].value_counts()

(47,
 Lead Data Scientist                                            37
 Lead Data Science Analyst (multiple openings) - IHM             2
 Lead Data Scientist with Security Clearance                     1
 Lead Data Scientist - Clinical                                  1
 Lead Data Scientist (Hybrid)                                    1
 Lead Data Science Analyst (Multiple openings) - IHM             1
 Lead Data Scientist- AD Tech                                    1
 Lead Data Scientist Engineer                                    1
 Lead Data Scientist - Top Regional Bank                         1
 Lead Data Scientist - Property & Casualty Insurance Pricing     1
 Name: job_title, dtype: int64)

In [120]:
ot_ds_df = filter_jobs[(~filter_jobs.isin(pr_he_ma_df)) & (~filter_jobs.isin(ds_df)) & (~filter_jobs.isin(sr_ds_df)) & (~filter_jobs.isin(le_ds_df))].dropna().copy()
len(ot_ds_df), ot_ds_df['job_title'].value_counts()

(175,
 (Global Oil Gas) Senior Data Scientist Expert                     13
 (USA) Senior, Data Scientist                                       6
 Cleared Data Scientist - Workforce Analytics                       3
 Assistant Professor of Data Science                                3
 Assistant Professor of Statistics & Data Science                   2
                                                                   ..
 Environmental Data Scientist                                       1
 Statistician/Data Scientist Fellowship                             1
 Asst Prof Computer Science in Data Science &                       1
 Assistant/Associate Professor - Biostatistics and Data Science     1
 Assistant Professor, Statistics and Data Science                   1
 Name: job_title, Length: 141, dtype: int64)

In [124]:
filter_jobs[(filter_jobs.isin(pr_he_ma_df)) & (filter_jobs.isin(ds_df)) & (filter_jobs.isin(sr_ds_df)) & (filter_jobs.isin(le_ds_df)))].dropna()

Unnamed: 0,job_link,job_title,company,job_location,first_seen,search_city,search_country,job_type,job_skills


In [121]:
len(ds_df) + len(sr_ds_df) + len(le_ds_df) + len(ot_ds_df) + len(pr_he_ma_df)

886

In [122]:
len(filter_jobs)

880