## Import necessary libraries 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import spacy
from collections import Counter
import warnings

# Suppress every warning for the rest of the session and switch seaborn
# to its "darkgrid" theme for all subsequent figures.
warnings.simplefilter("ignore")
sns.set_style(style="darkgrid")

## Exploratory Data Analysis

Let's load the data and look at a few entries.

In [None]:
# Read the job-postings CSV into a dataframe and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/data-analyst-jobs/DataAnalyst.csv"
)
df.head()

In [None]:
# Remove the 'Unnamed: 0' column in place.
df.drop(columns=["Unnamed: 0"], inplace=True)

In [None]:
# Report the (rows, columns) dimensions of the dataframe.
print(f"The shape of our dataframe is : {df.shape}")

What are the types of different features present in the dataset?

In [None]:
# Show the dtype pandas inferred for every column.
df.dtypes

Except for 'Rating' and 'Founded', all features are of type 'object'. Let's find out how many features have null values.

In [None]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

Only the column 'Company Name' has one missing value. But when we look at the data closely, we find that there are some entries like '-1',-1, 'Unknown' in the dataframe. These values are equivalent to NaNs. An easy way to find this out is to look at the unique elements in each column. 

In [None]:
# Number of distinct (non-null) values in each column.
df.nunique()

Let's look at unique values in some of these columns.

In [None]:
# Distinct values found in 'Rating' (inspect for placeholder entries).
df['Rating'].unique()

In [None]:
# Distinct values found in 'Size' (inspect for placeholder entries).
df['Size'].unique()

So, we replace all these above mentioned entries by NaNs.

In [None]:
# Placeholder entries that stand in for a missing value in this dataset;
# turn all of them into real NaNs, in place.
missing_markers = [-1., -1, '-1', 'Unknown', 'Unknown / Non-Applicable']
df.replace(to_replace=missing_markers, value=np.nan, inplace=True)

In [None]:
# Preview the first rows again to confirm the placeholders are now NaN.
df.head()

After this replacement, we check again the number of null values in each of the features.

In [None]:
# Percentage of missing entries per column, largest first.
missing_per_column = df.isna().sum()
null_percentage = missing_per_column.sort_values(ascending=False) / len(df) * 100

# Bar chart of the columns where more than 0.1% of the entries are missing.
columns_with_gaps = null_percentage[null_percentage > 0.1]
columns_with_gaps.plot(kind='bar', figsize=(10, 8))
plt.xlabel("Features")
plt.ylabel("% of null values");

As we can see from the above plot, most of the entries in 'Easy Apply' and 'Competitors' are missing. Therefore, we will drop these two columns. 

In [None]:
# Discard the two mostly-empty columns in place, then report the new shape.
df.drop(columns=['Easy Apply', 'Competitors'], inplace=True)
print(f"The shape of the dataframe after dropping features : {df.shape}")

Let's now calculate null value percentage in each row of the dataframe.

In [None]:
# Percentage of missing entries in each row, largest first.
# A row holds one entry per column, so the per-row null count is
# normalised by the number of columns (df.shape[1]), not by the number of
# rows; dividing by len(df) would make every row look almost complete.
null_count_per_row = df.isnull().sum(axis=1)
row_null_percentage = (
    null_count_per_row.sort_values(ascending=False) / df.shape[1] * 100
)
row_null_percentage

None of the rows has a significant share of null values, so no row needs to be dropped on the basis of its null value percentage. 

The first question one can ask is: **which industries are actively hiring data analysts at present?** 

In [None]:
# Bar chart of the 30 industries with the most job posts.
posts_per_industry = df['Industry'].value_counts()
posts_per_industry.head(30).plot(kind='bar', figsize=(14, 8))
plt.xlabel("Industry")
plt.ylabel("Number of job posts");

Most of the data analyst job posts are in the IT and Staffing & Outsourcing sectors. Current data analyst post vacancy in the Health Care sector is almost half of these two.

The next interesting question could be **What is the distribution of salary across all the industries?** Let's look at the column 'Salary Estimate'.

In [None]:
# Peek at the first 10 raw 'Salary Estimate' values.
df['Salary Estimate'].head(10)

We need to clean this. We create three new features from this column - minimum, maximum and average salaries offered in different industries and then drop the original feature.

In [None]:
def _salary_bound(estimate, position):
    """Return one end of a salary range as a float; nulls pass through.

    The first whitespace-separated token of *estimate* is split on "-",
    the piece at *position* (0 = lower end, 1 = upper end) is stripped of
    any leading/trailing "$", "," and "K" characters and converted to float.
    """
    if pd.isnull(estimate):
        return estimate
    salary_range = estimate.split()[0]
    return float(salary_range.split("-")[position].strip("$,K"))


# lower and upper ends of the offered salary range
df['min_salary'] = df['Salary Estimate'].apply(_salary_bound, position=0)
df['max_salary'] = df['Salary Estimate'].apply(_salary_bound, position=1)

# midpoint of the offered range
df['avg_salary'] = (df['min_salary'] + df['max_salary'])/2.

# the raw text column is no longer needed
df.drop(columns='Salary Estimate', inplace=True)

Now we can look at the distribution of average salary in different industries.

In [None]:
# Mean of 'avg_salary' per industry; chart the 30 highest.
salary_by_industry = df.groupby('Industry')['avg_salary'].mean()
best_paying_industries = salary_by_industry.sort_values(ascending=False).head(30)
best_paying_industries.plot(kind='bar', figsize=(14, 10))
plt.xlabel('Industry')
plt.ylabel('Average salary');

This is interesting! Although IT sector has maximum number of job postings, the salary offered does not fall into top 30. The healthcare industry ('Drug & Health Stores', 'Health Care Products Manufacturing', 'Biotech & Pharmaceuticals', 'Health Care Services & Hospitals') generally has a high average salary. The average salary offered in education industry is almost comparable to that of healthcare industry. This plot tells us that even if some industries have fewer job postings, they offer better salaries.

**Is there any particular location with a high demand for data analysts?**

In [None]:
# Display the raw 'Location' column.
df['Location']

Let's extract only the state names from the 'Location' column.

In [None]:
# Keep only the text after the last comma of 'Location', stripped of
# surrounding whitespace. The vectorised .str accessor propagates NaN, so
# a missing location yields a missing state instead of raising
# AttributeError (matching the null handling used for 'Company Name').
df['Job_state'] = df['Location'].str.split(",").str[-1].str.strip()
df['Job_state']

In [None]:
# Number of distinct (non-null) values in 'Job_state'.
df['Job_state'].nunique()

In [None]:
# Bar chart of how many job posts fall in each 'Job_state' value.
posts_per_state = df['Job_state'].value_counts()
posts_per_state.plot(kind="bar", figsize=(14, 8))
plt.xlabel("Job Location")
plt.ylabel("Number of job posts");

Most of the vacancies are in CA followed by TX and NY which have almost equal number of vacancies in the data analyst post.

**Which companies are hiring data analysts actively?** Let's look at the column 'Company Name'.

In [None]:
# Peek at the first 20 distinct raw 'Company Name' values.
df['Company Name'].unique()[:20]

In [None]:
def _first_line(raw_name):
    """Return the stripped text before the first newline; nulls pass through."""
    # NOTE(review): presumably the text after the newline is extra data
    # appended to the name in the raw export -- confirm against the CSV.
    if pd.isnull(raw_name):
        return raw_name
    return raw_name.split("\n")[0].strip()


# keep only the company name itself
df['Company Name'] = df['Company Name'].apply(_first_line)

In [None]:
# Bar chart of the 30 companies with the most job posts.
posts_per_company = df['Company Name'].value_counts()
posts_per_company.head(30).plot(kind='bar', figsize=(14, 10))
plt.xlabel('Company')
plt.ylabel('Number of job posts');

As we can see from the above plot, 'Staffigo Technical Services' has a large number of openings for data analysts.

**Which companies are offering high salaries?**

In [None]:
# Mean of 'avg_salary' per company; chart the 30 highest.
salary_by_company = df.groupby('Company Name')['avg_salary'].mean()
best_paying_companies = salary_by_company.sort_values(ascending=False).head(30)
best_paying_companies.plot(kind='bar', figsize=(14, 10))
plt.xlabel('Company')
plt.ylabel('Average salary');

Let's study these companies closely.

In [None]:
# Names of the 30 companies with the highest mean 'avg_salary',
# ordered from highest to lowest.
mean_salary_per_company = df.groupby('Company Name')['avg_salary'].mean()
ranked_companies = mean_salary_per_company.sort_values(ascending=False)
top_30_comps = ranked_companies.head(30).index.tolist()

**How are the ratings of these companies?**

In [None]:
# Report the smallest and largest value present in 'Rating'.
lowest_rating = df['Rating'].min()
highest_rating = df['Rating'].max()
print(f"The rating is given on a scale {lowest_rating}-{highest_rating}.")

In [None]:
# Bar plot of 'Rating' for the rows belonging to the top-30 companies,
# with bars laid out in the same order as top_30_comps.
plt.figure(figsize=(14, 8))
top_paying_rows = df[df['Company Name'].isin(top_30_comps)]
sns.barplot(
    x=top_paying_rows['Company Name'],
    y=top_paying_rows['Rating'],
    order=top_30_comps,
)
plt.xlabel('Company')
plt.xticks(rotation=90);

All the companies (except three) have a minimum rating value 3. 'Xcutives.com Inc', 'Applicantz, Inc' and 'Parsoft LLC' - although these three companies are offering high salaries to candidates, no rating is available for them. One reason might be that these are comparatively new companies.

**What about the number of staff currently employed in these companies?** The relevant feature for this question is 'Size'.

In [None]:
# Distinct values present in the feature 'Size'.
df['Size'].unique()

In [None]:
# Count of rows per 'Size' category among the top-30 high-paying companies.
plt.figure(figsize=(10, 8))
top_paying_rows = df[df['Company Name'].isin(top_30_comps)]
# Pass the Series as `x=` explicitly: in current seaborn the only positional
# parameter of countplot is `data`, so a bare positional Series is no
# longer interpreted as the variable to count.
sns.countplot(x=top_paying_rows['Size'])
plt.xlabel('Employee Size')
plt.xticks(rotation=90);

Surprisingly, most of the high paying companies are small in size. These are likely to be start-ups. So the data is telling us that start-ups are likely to pay more to data analysts as compared to big companies.

Another important question to explore could be **what is the financial status of these companies?** To find an answer to this, we look at the available feature 'Revenue'.

In [None]:
# Distinct values present in the feature 'Revenue'.
df['Revenue'].unique()

In [None]:
# Count of rows per 'Revenue' category among the top-30 high-paying companies.
plt.figure(figsize=(10, 8))
top_paying_rows = df[df['Company Name'].isin(top_30_comps)]
# Pass the Series as `x=` explicitly: in current seaborn the only positional
# parameter of countplot is `data`, so a bare positional Series is no
# longer interpreted as the variable to count.
sns.countplot(x=top_paying_rows['Revenue'])
plt.xticks(rotation=90);

The plot reiterates the same thing - most of the data analyst job positions are available in small to medium sized companies.

One more question in this chain - **where are these companies located?**

In [None]:
# Count of rows per 'Location' among the top-30 high-paying companies.
plt.figure(figsize=(10, 8))
top_paying_rows = df[df['Company Name'].isin(top_30_comps)]
# Pass the Series as `x=` explicitly: in current seaborn the only positional
# parameter of countplot is `data`, so a bare positional Series is no
# longer interpreted as the variable to count.
sns.countplot(x=top_paying_rows['Location'])
plt.xlabel("Company Location")
plt.xticks(rotation=90);

All these companies are situated in CA (mainly in the city San Francisco)! CA not only has a large number of openings for data analysts, it also offers high salaries as compared to other states. All the information that we gathered from this exercise can be summarized as follows:

**Companies offering high salaries to data analysts are small in size with average total revenue, generally have decent ratings and are mostly situated in CA.** 

So, if you are looking for a career in data analytics, CA might be your next destination!

To get a job, one must have the required skill sets. **What skills are companies looking for in a data analyst?** The feature of interest in this case is 'Job Description'.

Let's find out average lengths of available job descriptions in the dataset.

In [None]:
# Character length of every available job description. Missing
# descriptions are skipped because len() is undefined for NaN.
desc_len = [len(desc) for desc in df['Job Description'].dropna()]

# Histogram of the lengths; anything longer than 4000 characters falls
# outside the plotted range.
plt.figure(figsize=(14,8))
plt.xlabel('Job description length')
plt.hist(desc_len, bins=80, range=(0,4000));

Therefore, in general, job descriptions are quite long. But our interest is in keywords such as 'C++', 'python', 'sql' etc. that are closely related to the skills required of an applicant for a data analyst post. We use the library 'spaCy' to extract these keywords from this large corpus. The steps are as follows:

- We look for all the named entities in the corpus.
- A quick check shows that most of our keywords (skills) of interest are labeled as 'ORG'.

In [None]:
# Load spaCy's 'en_core_web_sm' pipeline; the resulting `nlp` callable is
# what turns raw text into a processed document further down.
nlp = spacy.load('en_core_web_sm')

In [None]:
# list to store extracted skill keywords
skill_list = []

# Number of job descriptions concatenated and sent through spaCy at once.
batch_size = 100

# Missing descriptions cannot be joined as text, so leave them out.
descriptions = df['Job Description'].dropna()

# Walk over the corpus batch by batch. Slicing past the end of a Series is
# safe, so the final (shorter) batch needs no special case and no
# hard-coded dataset size.
for start in range(0, len(descriptions), batch_size):
    # combine the job descriptions of this batch into a single string
    text = " ".join(descriptions.iloc[start:start + batch_size])

    # run the pipeline; the returned doc exposes the named entities
    doc = nlp(text)

    # keep the text of every entity labelled 'ORG'
    # NOTE(review): set() is kept from the original; whether it removes
    # anything depends on how spaCy hashes spans -- confirm before relying
    # on it for de-duplication.
    skill_list.extend(
        entity.text for entity in set(doc.ents) if entity.label_ == 'ORG'
    )

In [None]:
# Tally the occurrences of each extracted entity text ...
word_count = Counter()
word_count.update(skill_list)
# ... and show the 100 most frequent ones.
word_count.most_common(100)

We have been successful in extracting skills like SQL, Python, ETL etc. from the corpus. There are also some unrelated words/phrases like healthcare, data governance etc., but overall spaCy has done a good job of extracting relevant skills. The most preferred skill is SQL!

In [None]:
# hand-picked entity texts that are genuine skills
skill_set = ['SQL', 'Python', 'ETL', 'SAS', 'SAP', 'Oracle', 'PowerPoint', 'AWS', 'Microsoft Office',
             'XML', 'PL/SQL', 'AI', 'Spark', 'MS Office', 'ERP', 'Big Data',  'Tableau', 'Hadoop',
             'JavaScript', 'Azure', 'Perl']

# Of the 100 most frequent extracted entities, keep those listed above,
# mapping skill -> count.
skill_count_dict = {skill: count
                    for skill, count in word_count.most_common(100)
                    if skill in skill_set}

# Fold the 'PL/SQL' count into 'SQL' and drop the separate key.
# pop()/get() with defaults keep this from raising KeyError when either
# entry did not make it into the top 100.
plsql_count = skill_count_dict.pop('PL/SQL', None)
if plsql_count is not None:
    skill_count_dict['SQL'] = skill_count_dict.get('SQL', 0) + plsql_count

In [None]:
# Two-column table: one row per skill with its total count.
skill_rows = list(skill_count_dict.items())
skill_count_df = pd.DataFrame(skill_rows, columns=['Skill', 'Total Count'])
skill_count_df

In [None]:
# Bar chart of the count recorded for each skill (bars ordered by skill name).
count_per_skill = skill_count_df.groupby('Skill')['Total Count'].max()
count_per_skill.plot(kind='bar', figsize=(14, 8))
plt.xlabel("Required skills")
plt.ylabel("Total count");