In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Load the data

In [None]:
# Read the cleaned Glassdoor data-science job postings into the working frame
df = pd.read_csv(
    "../input/data-science-job-posting-on-glassdoor/Cleaned_DS_Jobs.csv"
)
# Preview the first rows as the cell output
df.head()

In [None]:
# Function for getting unique col counts
def get_unique_col_count(col, data=None):
    """Return a one-sentence summary of how many distinct values a column has.

    Parameters
    ----------
    col : str
        Name of the column to inspect.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df`` is used, so existing one-argument calls behave as before.

    Returns
    -------
    str
        ``"There are <n> unique values for <col>"``. The count comes from
        ``Series.nunique()``, which ignores missing values by default.
    """
    # Fall back to the global frame only when no explicit frame is supplied.
    frame = df if data is None else data
    return f"There are {frame[col].nunique()} unique values for {col}"

## Overview

In [None]:
# Print the frame summary (row count, column names, non-null counts, dtypes)
df.info()

- **There are 660 rows and 27 columns of data. There aren't any nulls.**
- **There is 1 float column, plus 12 int and 14 object columns.**

In [None]:
# Get an idea of the number of unique values per column
uniques = df.nunique().reset_index()
uniques.columns = ['Column', 'Unique Values']

# Set the theme BEFORE creating the figure: the style is applied when axes are
# created, so setting it afterwards left this first chart unstyled on a fresh
# kernel (and styled differently on a re-run).
sns.set_theme(style="darkgrid")
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 5))
# Draw on the axes created above rather than relying on the implicit current axes
sns.barplot(y='Column', x='Unique Values', data=uniques, ax=ax)
ax.set(title='Unique values per column')
plt.xticks(rotation=45);

## Univariate analysis

Let's take a look at each variable and get an idea of their distributions.

In [None]:
# Job Title: report how many distinct titles appear in the listings
print(get_unique_col_count(col='Job Title'))

In [None]:
# Salary Estimate: distinct-value count, then how often each estimate occurs
print(get_unique_col_count(col='Salary Estimate'))

fig, ax = plt.subplots(figsize=(15, 5), nrows=1, ncols=1)
sns.histplot(
    data=df['Salary Estimate'],
    ax=ax,
);
# Tilt the tick labels so the salary-range strings do not overlap
plt.xticks(rotation=45);

In [None]:
# Job Description: report how many distinct descriptions exist
print(get_unique_col_count(col='Job Description'))

In [None]:
# Rating: distinct-value count, then a histogram with a density curve overlaid
print(get_unique_col_count(col='Rating'))
sns.histplot(data=df['Rating'], kde=True);

In [None]:
# Company Name: report how many distinct companies posted listings
print(get_unique_col_count(col='Company Name'))

In [None]:
# Location
print(get_unique_col_count('Location'))

# Split the state from the rest of the text, cutting on the LAST ", " only so
# any earlier comma stays inside the city part.
# `n` is passed by keyword: pandas 2.0 made every argument after `pat`
# keyword-only, so the positional form `rsplit(", ", 1)` raises TypeError there.
# expand=True returns the pieces as columns directly; reindex guarantees two
# columns even when no location contains ", " (the state is then missing).
locations = (
    df["Location"]
    .str.rsplit(", ", n=1, expand=True)
    .reindex(columns=[0, 1])
)
locations.columns = ["city", "state"]
print(str(locations['state'].nunique()) + " states and " + str(locations['city'].nunique()) + " cities are represented")

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 5))
# Most frequent state first; draw explicitly on the axes created above
sns.countplot(x='state', data=locations, order=locations['state'].value_counts().index, ax=ax)
ax.set(title='Job listings by state');

In [None]:
# Count listings per (city, state) pair and keep only pairs that occur more than once
dupe_rows = (
    locations
    .groupby(list(locations.columns), as_index=False)
    .size()
    .loc[lambda counts: counts['size'] > 1]
)

# The median below is taken over the repeated pairs only
print(f"{len(dupe_rows)} cities have more than one listing, with the median count per city being {dupe_rows['size'].median()}")

In [None]:
# Headquarters: report how many distinct headquarters locations appear
print(get_unique_col_count(col='Headquarters'))

In [None]:
# Size: distinct-value count, then listings per size value (most common first)
print(get_unique_col_count(col='Size'))

sns.countplot(
    y='Size',
    data=df,
    order=df['Size'].value_counts().index,
);

In [None]:
# Type of ownership: distinct-value count, then listings per ownership type
print(get_unique_col_count(col='Type of ownership'))
sns.countplot(
    y='Type of ownership',
    data=df,
    order=df['Type of ownership'].value_counts().index,  # most common first
);

- **Most of the companies are private companies.**

In [None]:
# Industry: distinct-value count, then listings per industry on a tall figure
print(get_unique_col_count(col='Industry'))
fig, ax = plt.subplots(figsize=(5, 15), nrows=1, ncols=1)
sns.countplot(
    y='Industry',
    data=df,
    order=df['Industry'].value_counts().index,  # most common first
    ax=ax,  # the axes just created, which is also the current axes
);

- **The majority of listings are for technology companies. No surprise here.**

In [None]:
# Sector: distinct-value count, then listings per sector (most common first)
print(get_unique_col_count(col='Sector'))
sns.countplot(
    y='Sector',
    data=df,
    order=df['Sector'].value_counts().index,
);

- **Again, the technology sector dominates.**

In [None]:
# Revenue: distinct-value count, then listings per revenue value (most common first)
print(get_unique_col_count(col='Revenue'))
sns.countplot(
    y='Revenue',
    data=df,
    order=df['Revenue'].value_counts().index,
);

- **A large percentage of the Revenue values is unknown; otherwise, the values look well distributed.**

In [None]:
# min_salary, avg_salary and max_salary
salary_titles = {
    'min_salary': 'Min Salary',
    'avg_salary': 'Avg Salary',
    'max_salary': 'Max Salary',
}
for salary_col in salary_titles:
    print(get_unique_col_count(salary_col))

fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

# Plot every listing's salary rather than only the distinct values: calling
# .unique() first throws away repeated salaries, so the quartiles and the
# points flagged as outliers would no longer describe the actual listings.
for panel, (salary_col, title) in zip(ax, salary_titles.items()):
    sns.boxplot(data=df[salary_col].to_numpy(), ax=panel, orient="v")
    panel.set(title=title)

- **There are a few outliers in the salary columns.**

In [None]:
# job_state: distinct-value count, then listings per state (most common first)
print(get_unique_col_count(col='job_state'))
fig, ax = plt.subplots(figsize=(15, 5), nrows=1, ncols=1)
sns.countplot(
    x='job_state',
    data=df,
    order=df['job_state'].value_counts().index,
    ax=ax,  # the axes just created, which is also the current axes
);
ax.set(title='Jobs by state');

In [None]:
# same_state: distinct-value count, then the distribution of the column
print(get_unique_col_count(col='same_state'))
sns.displot(
    data=df['same_state'],
);

In [None]:
# company_age: distinct-value count, then a histogram with a density curve overlaid
print(get_unique_col_count(col='company_age'))
sns.displot(
    data=df['company_age'],
    kde=True,
);

In [None]:
# Skill columns: excel, hadoop, spark, aws, tableau, big_data

skill_columns = ['excel', 'hadoop', 'spark', 'aws', 'tableau', 'big_data']
fig, ax = plt.subplots(nrows=1, ncols=6, figsize=(15, 3))
# One histogram per skill column, left to right in the order listed above
for panel, skill in zip(ax, skill_columns):
    sns.histplot(data=df[skill], ax=panel)
fig.tight_layout()

In [None]:
# job_simp
print(get_unique_col_count('job_simp'))
# No KDE overlay here. NOTE(review): job_simp appears to hold category labels
# (hence the rotated tick labels below) -- confirm against df.info(). A kernel
# density estimate over category positions has no meaningful interpretation,
# so only the per-category counts are drawn.
sns.displot(data=df['job_simp']);
plt.xticks(rotation=45);

In [None]:
# seniority: distribution of the seniority column
sns.displot(data=df['seniority']);

## Multivariate analysis

In [None]:
# Look at the correlations between features.
# Restrict to numeric columns explicitly: since pandas 2.0, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on text columns.
# select_dtypes works the same on older pandas versions too.
df.select_dtypes(include="number").corr()

In [None]:
# Put the correlations into a heatmap to better visualize
fig, ax = plt.subplots(figsize=(10, 10))
# Correlate numeric columns only: since pandas 2.0, DataFrame.corr() raises on
# text columns instead of silently skipping them.
numeric_corr = df.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, ax=ax, annot=True, linewidths=0.05, fmt='.2f', cmap="magma")
ax.set(title='Correlation between numeric features');

In [None]:
# How skewed is each numeric column.
# Select numeric columns explicitly: since pandas 2.0, DataFrame.skew() no
# longer skips non-numeric columns by default and fails on text columns.
df.select_dtypes(include="number").skew()

In [None]:
# Draw pairwise relationships across the frame and look for anything that stands out
sns.pairplot(df);

- **There doesn't seem to be much correlation between most of the features, with the exception of the salaries and skills.**