In [None]:
import re
import numpy as np 
import pandas as pd 
import plotly_express as px

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the Data Analyst job listings CSV from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/data-analyst-jobs/DataAnalyst.csv')

# Preview five random rows. The fixed random_state makes the preview identical
# on every Restart & Run All instead of changing with each execution.
data.sample(5, random_state=42)

# Basic Exploration and Wrangling

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Missing values per column (one missing value in the 'Company Name' column).
data.isna().sum()

In [None]:
# Display the row(s) without a 'Company Name'; this data point is to be dropped.
data.loc[data['Company Name'].isna()]

In [None]:
# Drop rows whose 'Company Name' is missing. inplace=True modifies `data` itself
# rather than returning a new frame.
data.dropna(subset=['Company Name'], inplace=True) 

In [None]:
# Derive numeric salary bands and their midpoint from the free-text
# 'Salary Estimate' column.

# Raw string: '\d' inside a normal string literal is an invalid escape sequence
# and triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
pattern = re.compile(r'\d+')


def _salary_numbers(raw_estimate):
    """Return every run of digits found in one 'Salary Estimate' value.

    Args:
        raw_estimate: A single cell of the 'Salary Estimate' column. Any type
            is accepted; it is converted with ``str`` before matching.

    Returns:
        A list of digit strings in order of appearance, or an empty list when
        the value is the '-1' placeholder or contains no digits.
    """
    text = str(raw_estimate).strip()
    # NOTE(review): '-1' is the marker this notebook filters out of 'Industry'
    # and 'Rating'; it is assumed to mean "missing" here as well, so it is not
    # parsed as a salary of 1. Confirm against the raw data.
    if text == '-1':
        return []
    return pattern.findall(text)


data['salary_estimates_extract'] = data['Salary Estimate'].apply(_salary_numbers)

# First number = lower band, last number = upper band. `.str[i]` on a column of
# lists yields NaN for an empty list, so rows without a parsable estimate become
# missing values instead of raising IndexError.
lower_band = pd.to_numeric(data['salary_estimates_extract'].str[0], errors='coerce')
upper_band = pd.to_numeric(data['salary_estimates_extract'].str[-1], errors='coerce')

# Nullable Int32 keeps the bands as whole numbers while still allowing missing values.
data['salary_lower_band'] = lower_band.astype('Int32')
data['salary_upper_band'] = upper_band.astype('Int32')

# Midpoint of the two bands; float64, NaN where no estimate could be parsed.
data['est_median_salary'] = (upper_band + lower_band) / 2

In [None]:
# Keep only the text before the first newline in 'Company Name'.
data['Company Name'] = data['Company Name'].str.split('\n').str[0]

# Company age relative to 2020. A non-positive 'Founded' value cannot be a real
# year (NOTE(review): presumably the same -1 "unknown" marker that is filtered
# out of 'Rating'), so it is masked to NaN rather than yielding an age such as 2021.
founded_year = data['Founded'].where(data['Founded'] > 0)
data['company_years_of_exp'] = 2020 - founded_year

# State is taken as the last two characters of 'Location'.
data['State'] = data['Location'].str[-2:]

In [None]:
# Announce and preview the prepared frame; head() defaults to the first five rows.
print('This is the prep dataset to be used for analysis and visualization')
data.head()

# Data Insights

In [None]:
# Salary statistics per industry, leaving out rows whose 'Industry' is the '-1' placeholder.
data_ = data[data['Industry'] != '-1']

# Aggregations are given by name ('mean'/'min'/'max'). Passing np.mean or the
# builtins min/max to agg makes recent pandas emit a FutureWarning and is slated
# to change behaviour; the named forms compute the same values.
x = (
    data_.groupby('Industry')
    .agg(**{
        'Avg. Median Salary': ('est_median_salary', 'mean'),
        'Lowest Salary': ('salary_lower_band', 'min'),
        'Highest Salary': ('salary_upper_band', 'max'),
    })
    .reset_index()
    .round(1)
)

print('Top 10 Average Estimated Median Salary by Industry')
print()
print(x.sort_values(by='Avg. Median Salary', ascending=False).head(10).to_string(index=False))

In [None]:
# Salary statistics per industry, leaving out rows whose 'Industry' is the '-1' placeholder.
data_ = data[data['Industry'] != '-1']

# Aggregations are given by name ('mean'/'min'/'max'). Passing np.mean or the
# builtins min/max to agg makes recent pandas emit a FutureWarning and is slated
# to change behaviour; the named forms compute the same values.
x = (
    data_.groupby('Industry')
    .agg(**{
        'Avg. Median Salary': ('est_median_salary', 'mean'),
        'Lowest Salary': ('salary_lower_band', 'min'),
        'Highest Salary': ('salary_upper_band', 'max'),
    })
    .reset_index()
    .round(1)
)

# Rank industries by the single highest upper salary band observed in each.
print('Top 10 Highest Salary by Industry')
print()
print(x.sort_values(by='Highest Salary', ascending=False).head(10).to_string(index=False))

In [None]:
# Same row filter as the Industry summaries, applied here so this cell does not
# depend on whichever cell last assigned `data_`.
# NOTE(review): confirm that rows with an unknown Industry ('-1') should really
# be excluded from a by-State ranking.
data_ = data[data['Industry'] != '-1']

# Aggregations are given by name ('mean'/'min'/'max'). Passing np.mean or the
# builtins min/max to agg makes recent pandas emit a FutureWarning and is slated
# to change behaviour; the named forms compute the same values.
x = (
    data_.groupby('State')
    .agg(**{
        'Avg. Median Salary': ('est_median_salary', 'mean'),
        'Lowest Salary': ('salary_lower_band', 'min'),
        'Highest Salary': ('salary_upper_band', 'max'),
    })
    .reset_index()
    .round(1)
)

print('Top 10 Average Estimated Median Salary by State')
print()
print(x.sort_values(by='Avg. Median Salary', ascending=False).head(10).to_string(index=False))

In [None]:
# Same row filter as the Industry summaries, applied here so this cell does not
# depend on whichever cell last assigned `data_`.
# NOTE(review): confirm that rows with an unknown Industry ('-1') should really
# be excluded from a by-Location ranking.
data_ = data[data['Industry'] != '-1']

# Aggregations are given by name ('mean'/'min'/'max'). Passing np.mean or the
# builtins min/max to agg makes recent pandas emit a FutureWarning and is slated
# to change behaviour; the named forms compute the same values.
x = (
    data_.groupby('Location')
    .agg(**{
        'Avg. Median Salary': ('est_median_salary', 'mean'),
        'Lowest Salary': ('salary_lower_band', 'min'),
        'Highest Salary': ('salary_upper_band', 'max'),
    })
    .reset_index()
    .round(1)
)

print('Top 10 Average Estimated Median Salary by Location')
print()
print(x.sort_values(by='Avg. Median Salary', ascending=False).head(10).to_string(index=False))

# Relationship between Ratings and Salary

In [None]:
# Scatter of company rating against estimated median salary, skipping rows
# whose 'Rating' is -1.
data_ = data[data['Rating'] != -1]

fig = px.scatter(
    data_,
    x='Rating',
    y='est_median_salary',
    trendline='ols',
    marginal_x='histogram',
    hover_data=['Job Title', 'Location', 'Industry', 'Founded'],
)

# Title the figure and hide grid lines on every x and y axis.
fig.update_layout(title='Relationship between Rating and Salary')
fig.update_xaxes(showgrid=False).update_yaxes(showgrid=False)

# Written observations are printed before the figure is rendered.
observations = (
    'Observations:',
    '#1. Ratings does not seems to have an impact on Salary',
    '#2. Most ratings are between 3.5 to 4',
)
print(*observations, sep='\n')
fig.show()