In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Insights Sought

In this notebook, I explore and clean the dataset while trying to gain insight into the following questions:
1. In which states are the companies of each sector mainly located, i.e. where does each industry have a major presence?
2. Is there a relationship between company revenue, company size and salary?
3. Is there a relationship between company rating and salary?
4. Is there a relationship between type of ownership and salary?

In [None]:
# Load the CSV; its first column is used as the row index.
dataset = pd.read_csv(
    '/kaggle/input/data-analyst-jobs/DataAnalyst.csv',
    index_col=0,
)
# Preview the first three rows.
dataset.head(3)

In [None]:
# (rows, columns) of the frame as loaded.
dataset.shape

In [None]:
# Print column names, non-null counts and dtypes.
dataset.info()

Dropping Null Values

In [None]:
# Count the missing values in each column.
dataset.isna().sum()

In [None]:
# Discard every row that has at least one missing value, then confirm none remain.
dataset = dataset.dropna()
dataset.isna().sum()

Deleting columns I will not be working with

In [None]:
# Columns that none of the four questions use.
unused_columns = [
    'Job Title',
    'Job Description',
    'Founded',
    'Headquarters',
    'Industry',
    'Competitors',
    'Easy Apply',
]
dataset = dataset.drop(columns=unused_columns)
dataset.head(1)

# Data Wrangling for Q1

In [None]:
# Keep only the two columns Q1 needs. The original first copied the whole frame and
# then immediately overwrote that copy with a column selection, so the copy was dead
# code. Copying the selection instead makes dataset_q1 an independent frame, so the
# column assignments in the following cells modify it directly rather than writing
# into a slice of `dataset` (which raises SettingWithCopyWarning).
dataset_q1 = dataset[['Sector', 'Location']].copy()

In [None]:
# Inspect the distinct location strings before they are shortened in the next cell.
dataset_q1['Location'].unique()

In [None]:
# Keep only the last two characters of every location string
# (presumably the two-letter state code; the Q1 plot labels this axis 'States').
dataset_q1['Location'] = dataset_q1['Location'].str.slice(start=-2)
dataset_q1['Location'].unique()

In [None]:
# Inspect the distinct sector labels.
dataset_q1['Sector'].unique()

In [None]:
# Relabel the '-1' sector value as 'Unknown'; all other labels are left untouched.
sector_labels = dataset_q1['Sector'].replace('-1', 'Unknown')
dataset_q1 = dataset_q1.assign(Sector=sector_labels)
dataset_q1['Sector'].unique()

In [None]:
# Replace dataset_q1 with one row per (Location, Sector) pair plus a 'size' column
# holding the number of rows in that group.
dataset_q1 = (
    dataset_q1
    .groupby(['Location', 'Sector'], as_index=False)
    .size()
)
dataset_q1

In [None]:
# Pivot the counts to one row per location and one column per sector, then draw a
# stacked horizontal bar chart.
# NOTE(review): 'size' counts dataset rows per (Location, Sector); confirm that one
# row corresponds to one company, as the x-axis label states.
data_q1 = dataset_q1.set_index(['Location', 'Sector'])['size']
location_by_sector = data_q1.unstack()
ax = location_by_sector.plot.barh(
    stacked=True,
    legend=True,
    figsize=(10, 15),
    cmap='tab20b',
)
ax.set_ylabel('States')
ax.set_xlabel('Number of companies')
ax.set_title('Graph showing presence of companies according to sector in each state')

From the above plot, we can see that Business Services and IT companies abound in Texas. Business Services companies also have a strong presence in New York and California, and IT companies are present in California as well.

# Data Wrangling for Q2

In [None]:
# Columns needed to relate revenue and company size to salary (Sector is used for filtering).
dataset_q2 = dataset.loc[:, ['Revenue', 'Size', 'Salary Estimate', 'Sector']]

In [None]:
# Drop rows whose sector is the '-1' placeholder, then report how many rows remain.
has_sector = dataset_q2['Sector'].ne('-1')
dataset_q2 = dataset_q2[has_sector]
dataset_q2.shape[0]

In [None]:
# Evaluated but not displayed: only a cell's last expression is shown.
dataset_q2['Revenue'].unique()
# Drop rows with no usable revenue figure, then report how many rows remain.
revenue_known = dataset_q2['Revenue'].ne('Unknown / Non-Applicable')
dataset_q2 = dataset_q2[revenue_known]
dataset_q2.shape[0]

In [None]:
# Evaluated but not displayed: only a cell's last expression is shown.
dataset_q2['Size'].unique()
# Drop rows whose company size is 'Unknown', then report how many rows remain.
size_known = dataset_q2['Size'].ne('Unknown')
dataset_q2 = dataset_q2[size_known]
dataset_q2.shape[0]

In [None]:
# Evaluated but not displayed: only a cell's last expression is shown.
dataset_q2['Sector'].unique()
# Drop rows whose sector is literally 'Unknown', then report how many rows remain.
# NOTE(review): the '-1' -> 'Unknown' relabelling was applied to dataset_q1 only, and
# '-1' sectors were already removed from dataset_q2, so this filter may remove
# nothing - confirm.
sector_known = dataset_q2['Sector'].ne('Unknown')
dataset_q2 = dataset_q2[sector_known]
dataset_q2.shape[0]

In [None]:
# Midpoint of each advertised salary range (mean of the lower and upper bound).
# The bounds are parsed from the untouched `dataset` column rather than from
# dataset_q2, so the cell can be re-run after 'Salary Estimate' has become numeric;
# .assign() aligns the result to dataset_q2's rows by index.
salary_split = dataset['Salary Estimate'].str.split('-', expand=True)
# Raw-string patterns: '\d' in a normal string literal is an invalid escape sequence.
salary_low = pd.to_numeric(salary_split[0].str.extract(r'(\d+)', expand=False))
salary_high = pd.to_numeric(salary_split[1].str.extract(r'(\d+)', expand=False))
# .assign() returns a new frame instead of writing into the filtered slice, which
# avoids SettingWithCopyWarning. Rows where either bound is missing become NaN.
dataset_q2 = dataset_q2.assign(**{'Salary Estimate': (salary_low + salary_high) / 2})
dataset_q2['Salary Estimate']

In [None]:
# Collapse every size label to a single token: labels with more than two
# space-separated words keep their third word (presumably the upper bound of an
# "A to B employees" range - confirm against the data); shorter labels become '10000+'.
#
# This replaces a row-by-row loop that wrote through chained indexing
# (dataset_q2['Size'].iloc[i] = ...). That pattern raises SettingWithCopyWarning and
# has no effect when pandas Copy-on-Write is enabled.
# The labels are taken from the untouched `dataset` column, so re-running the cell no
# longer turns every value into '10000+'; .assign() aligns them to dataset_q2 by index.
size_words = dataset['Size'].str.split(' ')
size_token = size_words.str[2].where(size_words.str.len() > 2, '10000+')
dataset_q2 = dataset_q2.assign(Size=size_token)
dataset_q2['Size'], dataset_q2['Size'].unique()

In [None]:
# Strip the currency suffix from the revenue labels.
# The original sliced off the last five characters (.str[:-5]). Assuming the labels
# end in ' (USD)' (confirm against the data), that left a trailing space, and it
# chopped five more characters every time the cell was re-run. Removing the literal
# suffix is idempotent, and .assign() avoids writing into a filtered slice.
revenue_labels = dataset_q2['Revenue'].str.replace(' (USD)', '', regex=False)
dataset_q2 = dataset_q2.assign(Revenue=revenue_labels)
dataset_q2['Revenue']

In [None]:
# Bucket the numeric salary midpoint into three bands:
#   <= 50 -> 'Less than 50k', (50, 100] -> '50k - 100k', > 100 -> 'Greater than 100k'.
# pd.cut replaces a row loop that assigned through chained indexing
# (df['Salary Estimate'].iloc[i] = ...), which raises SettingWithCopyWarning, writes
# strings into a float column, and has no effect under pandas Copy-on-Write.
# Missing salaries now stay NaN (and are left out by groupby) instead of falling
# through to 'Greater than 100k'.
salary_band = pd.cut(
    dataset_q2['Salary Estimate'],
    bins=[float('-inf'), 50, 100, float('inf')],
    labels=['Less than 50k', '50k - 100k', 'Greater than 100k'],
).astype(object)  # plain strings, so grouping and sorting behave as they did before

dataset_q2_1 = dataset_q2[['Revenue']].assign(**{'Salary Estimate': salary_band})

dataset_q2_1.groupby(by=['Revenue', 'Salary Estimate'], as_index=False).size().head(5)

In [None]:
# Count rows per (Revenue, Salary Estimate) pair, pivot the salary bands into columns
# and draw one bar per band for every revenue label.
data_q2_1 = (
    dataset_q2_1
    .groupby(['Revenue', 'Salary Estimate'], as_index=False)
    .size()
    .set_index(['Revenue', 'Salary Estimate'])['size']
)
band_by_revenue = data_q2_1.unstack()
ax = band_by_revenue.plot.bar(legend=True, figsize=(5, 5), cmap='tab20b')
ax.set_ylabel('Number of companies')
ax.set_title('Salary estimates according to company revenue')

From the above plot, we can see that company revenue is not much of a deciding factor in the salaries offered for data analyst jobs, as the salary distribution looks more or less the same at every revenue level.

In [None]:
# Bucket the numeric salary midpoint into three bands:
#   <= 50 -> 'Less than 50k', (50, 100] -> '50k - 100k', > 100 -> 'Greater than 100k'.
# pd.cut replaces a row loop that assigned through chained indexing
# (df['Salary Estimate'].iloc[i] = ...), which raises SettingWithCopyWarning, writes
# strings into a float column, and has no effect under pandas Copy-on-Write.
# Missing salaries now stay NaN (and are left out by groupby) instead of falling
# through to 'Greater than 100k'.
salary_band = pd.cut(
    dataset_q2['Salary Estimate'],
    bins=[float('-inf'), 50, 100, float('inf')],
    labels=['Less than 50k', '50k - 100k', 'Greater than 100k'],
).astype(object)  # plain strings, so grouping and sorting behave as they did before

dataset_q2_2 = dataset_q2[['Size']].assign(**{'Salary Estimate': salary_band})

dataset_q2_2.groupby(by=['Size', 'Salary Estimate'], as_index=False).size().head(5)

In [None]:
# Count rows per (Size, Salary Estimate) pair, pivot the salary bands into columns
# and draw one bar per band for every size label.
data_q2_2 = (
    dataset_q2_2
    .groupby(['Size', 'Salary Estimate'], as_index=False)
    .size()
    .set_index(['Size', 'Salary Estimate'])['size']
)
band_by_size = data_q2_2.unstack()
ax = band_by_size.plot.bar(legend=True, figsize=(5, 5), cmap='tab20b')
ax.set_xlabel('Size of company')
ax.set_ylabel('Number of companies')
ax.set_title('Salary estimates related to size of the company')

Again, we can see that salary is not decided by company size

# Data Wrangling for Q3

In [None]:
# Q3 working frame: rows that have a rating (-1 marks a missing rating) and a known revenue.
dataset_q3 = dataset[['Rating', 'Salary Estimate', 'Revenue']]
has_rating = dataset_q3['Rating'] != -1
revenue_known = dataset_q3['Revenue'] != 'Unknown / Non-Applicable'
# .copy() makes the filtered frame independent, so the column assignments below do
# not write into a slice of `dataset` (SettingWithCopyWarning).
dataset_q3 = dataset_q3[has_rating & revenue_known].copy()

# Midpoint of the advertised salary range. Raw-string patterns are used because '\d'
# in a normal string literal is an invalid escape sequence.
salary_split = dataset_q3['Salary Estimate'].str.split('-', expand=True)
salary_low = pd.to_numeric(salary_split[0].str.extract(r'(\d+)', expand=False))
salary_high = pd.to_numeric(salary_split[1].str.extract(r'(\d+)', expand=False))
salary_mid = (salary_low + salary_high) / 2

# Bucket the midpoint: <= 50, (50, 100], > 100.
# pd.cut replaces a row loop that assigned through chained indexing
# (df['Salary Estimate'].iloc[i] = ...), which has no effect under pandas
# Copy-on-Write and put missing salaries into the 'Greater than 100k' band.
# Missing salaries now stay NaN and are left out by groupby.
dataset_q3['Salary Estimate'] = pd.cut(
    salary_mid,
    bins=[float('-inf'), 50, 100, float('inf')],
    labels=['Less than 50k', '50k - 100k', 'Greater than 100k'],
).astype(object)  # plain strings, so grouping and sorting behave as they did before

# Strip the currency suffix. The original dropped the last five characters, which
# (assuming the labels end in ' (USD)' - confirm) left a trailing space.
dataset_q3['Revenue'] = dataset_q3['Revenue'].str.replace(' (USD)', '', regex=False)

dataset_q3.head(3), len(dataset_q3)

In [None]:
# Round ratings to zero decimal places so they can be used as group keys.
dataset_q3['Rating'] = dataset_q3['Rating'].round()

In [None]:
# Count rows per (Rating, Salary Estimate) pair, pivot the salary bands into columns
# and draw horizontal bars; the x-axis is fixed to the 0-400 range.
data_q3 = (
    dataset_q3
    .groupby(['Rating', 'Salary Estimate'], as_index=False)
    .size()
    .set_index(['Rating', 'Salary Estimate'])['size']
)
band_by_rating = data_q3.unstack()
ax = band_by_rating.plot.barh(legend=True, figsize=(5, 5), cmap='tab20b')
ax.set_xlim(0, 400)
ax.set_ylabel('Ratings (rounded)')
ax.set_xlabel('Number of companies')
ax.set_title('Salary estimates for differently rated companies')

We can see that companies rated around 3 and 4 still offer a large number of higher-paying jobs, while companies rated around 5 mostly offer average-paying jobs. We therefore cannot say for sure that ratings are driven by salary.

# Data Wrangling for Q4

In [None]:
# Q4 working frame: rows whose ownership type is known ('Unknown' and '-1' both mark missing).
dataset_q4 = dataset[['Type of ownership', 'Salary Estimate']]
ownership_known = ~dataset_q4['Type of ownership'].isin(['Unknown', '-1'])
# .copy() makes the filtered frame independent, so the column assignment below does
# not write into a slice of `dataset` (SettingWithCopyWarning).
dataset_q4 = dataset_q4[ownership_known].copy()

# Midpoint of the advertised salary range. Raw-string patterns are used because '\d'
# in a normal string literal is an invalid escape sequence.
salary_split = dataset_q4['Salary Estimate'].str.split('-', expand=True)
salary_low = pd.to_numeric(salary_split[0].str.extract(r'(\d+)', expand=False))
salary_high = pd.to_numeric(salary_split[1].str.extract(r'(\d+)', expand=False))
# Bug fix: the original omitted the '/ 2' here, so the SUM of the two bounds was
# compared against the 50 / 100 thresholds, unlike the Q2 and Q3 cells.
salary_mid = (salary_low + salary_high) / 2

# Bucket the midpoint: <= 50, (50, 100], > 100.
# pd.cut replaces a row loop that assigned through chained indexing
# (df['Salary Estimate'].iloc[i] = ...), which has no effect under pandas
# Copy-on-Write and put missing salaries into the 'Greater than 100k' band.
# Missing salaries now stay NaN and are left out by groupby.
dataset_q4['Salary Estimate'] = pd.cut(
    salary_mid,
    bins=[float('-inf'), 50, 100, float('inf')],
    labels=['Less than 50k', '50k - 100k', 'Greater than 100k'],
).astype(object)  # plain strings, so grouping and sorting behave as they did before
dataset_q4.head(3)

In [None]:
# Count rows per (Type of ownership, Salary Estimate) pair, pivot the salary bands
# into columns and draw horizontal bars; the x-axis is fixed to the 0-400 range.
data_q4 = (
    dataset_q4
    .groupby(['Type of ownership', 'Salary Estimate'], as_index=False)
    .size()
    .set_index(['Type of ownership', 'Salary Estimate'])['size']
)
band_by_ownership = data_q4.unstack()
ax = band_by_ownership.plot.barh(legend=True, figsize=(5, 5), cmap='tab20b')
ax.set_xlim(0, 400)
ax.set_xlabel('Number of companies')
ax.set_title('Salary estimates for differently owned companies')

We can see that smaller firms and entities not traditionally considered commercial (such as schools, colleges, franchises and NGOs) have fewer offers in total, as well as fewer offers with higher pay.