## Exploratory Data Analysis

First import all the libraries we need and read the dataset:

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import shapiro
from scipy.stats import anderson
from scipy.stats import normaltest
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import re
import warnings
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
warnings.filterwarnings('ignore')
%matplotlib inline

# Read the job postings from the CSV file and preview the first rows.
data = pd.read_csv('../input/data-scientist-jobs/DataScientist.csv')
data.head()

It looks like we don't need columns: "Unnamed: 0" and "index"

In [None]:
# Remove the two columns that are not needed for the analysis.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas deprecated in 1.3 and rejects from 2.0 onwards.
data = data.drop(columns=['Unnamed: 0', 'index'])

print(data.shape)
print(data.columns)

Some columns contain the value "-1", which can be interpreted as a null value, so let's check how many such "null" values we have in every column:

In [None]:
def count_missing_values(df=None):
    """Print, for every column, how many cells hold the "-1" placeholder.

    Numeric columns are compared with the number ``-1`` and all other columns
    with the string ``"-1"``. Cells that are already NaN are not counted.
    One line is printed per column: the name padded to 20 characters, the
    placeholder count and its share of the rows in percent.

    Args:
        df: DataFrame to inspect. Defaults to the notebook-level ``data``
            frame, which keeps the original zero-argument call working.
    """
    frame = data if df is None else df
    for column in frame:
        values = frame[column]
        placeholder = -1 if is_numeric_dtype(values) else "-1"
        null_amount = int((values == placeholder).sum())
        total = values.shape[0]
        # An empty frame has no rows to take a share of; report 0% instead of
        # dividing by zero.
        share = null_amount * 100 / total if total else 0.0
        print('{}{},  \t{:2.1f}%'.format(column.ljust(20), null_amount, share))
    
# Print the "-1" placeholder count and percentage for every column of `data`.
count_missing_values()

We can see that the columns "Competitors" and "Easy Apply" have 70.6% and 95.8% null values respectively, so we can simply delete these columns.

In [None]:
# Remove the two columns that consist mostly of "-1" placeholders.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas deprecated in 1.3 and rejects from 2.0 onwards.
data = data.drop(columns=['Competitors', 'Easy Apply'])

We can fill the missing values in the "Rating" column by interpolating between the values we already have. For columns like "Headquarters", "Size", "Type of ownership" and "Revenue", I think we can simply remove the rows with missing values. Let's do that and take one more look at how many missing values we still have:

In [None]:
# Turn the numeric -1 placeholder into NaN so that it can be interpolated.
data = data.replace(-1, np.nan)
# `interpolate` has no `direction` keyword (the option is `limit_direction`),
# and an in-place call on a column selection is not guaranteed to write back
# into `data`, so the result is assigned explicitly.
data["Rating"] = data["Rating"].interpolate(method='linear', limit_direction='forward')

# Remove the rows whose company details still carry the "-1" placeholder.
data.drop(data[data['Headquarters'] == "-1"].index, inplace=True)
data.drop(data[data['Size'].str.contains("-1")].index, inplace=True)
data.drop(data[data['Type of ownership'].str.contains("-1")].index, inplace=True)
data.drop(data[data['Revenue'].str.contains("-1")].index, inplace=True)
print(data.shape)
count_missing_values()

Now only the columns "Industry" and "Sector" still have missing values (8.4%). That is a relatively small share, so I think we can simply remove these rows as well.

In [None]:
# Remove, column by column, the rows that still contain the "-1" placeholder,
# then report the remaining shape and placeholder counts.
for column in ('Sector', 'Industry'):
    has_placeholder = data[column].str.contains("-1")
    data.drop(data[has_placeholder].index, inplace=True)
print(data.shape)
count_missing_values()

So now we have filled dataframe with 13 columns and 3356 rows. Let's take a look at what we have there, let's do the EDA.

First let's see which job positions we have:

In [None]:
# Count how often each distinct job title occurs.
data['Job Title'].value_counts()

Let's restrict the analysis to the "Data Scientist" and "Data Analyst" jobs:

In [None]:
# Keep only the postings whose title mentions one of the two roles.
is_scientist = data['Job Title'].str.contains("Data Scientist")
is_analyst = data['Job Title'].str.contains("Data Analyst")
data = data[is_scientist | is_analyst]
print(data.shape)

Next let's parse 'Salary Estimate' column to 'SalaryAverage' one:

In [None]:
# Factors used to convert an hourly rate into a yearly amount.
HOURS_PER_WEEK = 40
WEEKS_PER_YEAR = 52
# Multiplier for amounts written with a "K" suffix.
THOUSAND = 1000

def return_digits(x):
    """Return the first run of digits in ``x`` as an int, or 0 if there is none."""
    match = re.search(r'\d+', str(x))
    return int(match.group()) if match else 0

def return_salary(string, isFrom):
    """Extract one bound of a salary range as a yearly amount.

    Two formats are recognised: ``"$<from>K-$<to>K ..."`` (thousands per year)
    and ``"$<from>-$<to> ... Per Hour ..."`` (hourly rate, converted with
    ``HOURS_PER_WEEK * WEEKS_PER_YEAR``).

    Args:
        string: The salary estimate. It is converted with ``str()`` first, so
            non-string values such as NaN yield 0 instead of raising.
        isFrom: True for the lower bound (at the start of the text), False for
            the upper bound (after the dash).

    Returns:
        The bound as an int, or 0 when the expected pattern is not found.
    """
    text = str(string)
    if 'Per Hour' in text:
        pattern = r'^\$\d+' if isFrom else r'-\$\d+'
        multiplier = HOURS_PER_WEEK * WEEKS_PER_YEAR
    else:
        pattern = r'^\$\d+K' if isFrom else r'-\$\d+K'
        multiplier = THOUSAND

    found = re.findall(pattern, text)
    amount = return_digits(found[0]) if found else 0
    return amount * multiplier

def return_average_salary(x):
    """Return the mean of the lower and upper salary bound parsed from ``x``."""
    from_salary = return_salary(x, True)
    to_salary = return_salary(x, False)
    return (from_salary + to_salary) / 2

# Add a column holding the mean of the lower and upper salary bound of each posting.
data['SalaryAverage'] =  data['Salary Estimate'].apply(return_average_salary)
print(data['SalaryAverage'].describe())
# NOTE(review): `distplot` is deprecated in recent seaborn releases — consider `histplot`/`displot`.
print(sns.distplot(data['SalaryAverage']))

The average salary distribution has two peaks; let's try to see what can explain these peaks:

In [None]:
# SalaryAverage/Rating plot
# `height` is the current name of the facet-size option; `size` is its deprecated alias.
print(sns.pairplot(x_vars=["Rating"], y_vars=["SalaryAverage"], data=data, height=5))

In [None]:
# SalaryAverage/Sector plot
# `height` is the current name of the facet-size option; `size` is its deprecated alias.
print(sns.pairplot(x_vars=["SalaryAverage"], y_vars=["Sector"], data=data, height=5))

In [None]:
# SalaryAverage/Location plot
# `height` is the current name of the facet-size option; `size` is its deprecated alias.
print(sns.pairplot(x_vars=["Location"], y_vars=["SalaryAverage"], data=data, height=5))

The plots above show that the shape of the average salary distribution can be explained by location. Let's check it. First, let's extract the state or country where each job is located:

In [None]:
def return_state(string):
    """Return the state (or country) part of a location such as "Austin, TX".

    The first two-capital-letter code that follows a comma and a whitespace
    character is returned. If there is no such code, the text is split on
    ", " and the second part is returned (e.g. "London, United Kingdom" gives
    "United Kingdom").

    Args:
        string: The location. It is converted with ``str()`` first, so
            non-string values do not raise.

    Returns:
        The extracted part, or the whole text when it contains no ", "
        separator (previously this case raised an IndexError).
    """
    text = str(string)
    state = re.search(r',\s([A-Z]{2})', text)
    if state:
        return state.group(1)
    parts = text.split(', ')
    # Without a separator there is nothing to split off; fall back to the text itself.
    return parts[1] if len(parts) > 1 else text

# Add a column with the state (or country) parsed from each location and inspect it.
data['State'] = data['Location'].apply(return_state)
print(data['State'].head())
print(data['State'].value_counts())
# `height` is the current name of the facet-size option; `size` is its deprecated alias.
print(sns.pairplot(x_vars=["SalaryAverage"], y_vars=["State"], data=data, height=5))

The plot above shows that job positions in states like NY, NJ and CA are probably spread around a higher salary than in other places. Let's split the data by state and see how the distribution changes. For example, let's take a look at the average salary distribution in the states mentioned above:

In [None]:
# Postings located in NY, NJ or CA. An explicit copy is taken because this
# subset is modified later on; without it the frame is only a slice of `data`.
dataBiggerSalary = data[data['State'].isin(['NY', 'NJ', 'CA'])].copy()
print(sns.distplot(dataBiggerSalary['SalaryAverage'], fit=norm))
print(dataBiggerSalary.shape)

In [None]:
# NOTE(review): `expon` and `cauchy` are not used in the code shown here.
from scipy.stats import norm, expon, cauchy
# Postings outside the listed states.
# NOTE(review): 'TX' is excluded here as well, although the surrounding text
# only talks about NY, NJ and CA — confirm this is intended.
dataSmallerSalary = data[~data['State'].isin(['TX', 'NY', 'NJ', 'CA'])] 
print(dataSmallerSalary.shape)
print(sns.distplot(dataSmallerSalary['SalaryAverage']))

The average salary distribution in states other than NY, NJ and CA looks different from a normal distribution. The average salary in NY, NJ and CA, on the other hand, has an approximately bell shape and could be normally distributed, but it could also contain outliers. Let's check for outliers in the average salary in NY, NJ and CA:

In [None]:
# Scatter and box plots of the average salary per state, followed by summary statistics.
# `height` is the current name of the facet-size option; `size` is its deprecated alias.
print(sns.pairplot(x_vars=["SalaryAverage"], y_vars=["State"], data=dataBiggerSalary, height=5))
print(dataBiggerSalary.boxplot(by='State', column=['SalaryAverage']))
print(dataBiggerSalary["SalaryAverage"].describe())

We can see that salaries below 75000 may be outliers. Let's remove them:

In [None]:
# Remove the postings with an average salary below 75000. Filtering into a new
# frame avoids dropping rows in place from what is itself a slice of `data`.
below_threshold = dataBiggerSalary['SalaryAverage'] < 75000
dataBiggerSalary = dataBiggerSalary[~below_threshold]
print(dataBiggerSalary.shape)
print(sns.distplot(dataBiggerSalary['SalaryAverage'], fit=norm))

Let's check normality with Shapiro Wilk test (as it's the most powerful test when testing for a normal distribution):

In [None]:
def testNormality(data):
    stat, p = shapiro(data)
    print('Statistics=%.3f, p=%.3f' % (stat, p))
    alpha = 0.05
    if p > alpha:
        print('Sample looks Gaussian (fail to reject H0)')
    else:
        print('Sample does not look Gaussian (reject H0)')
        
# Run the Shapiro-Wilk check on the average salaries of the NY/NJ/CA subset.
testNormality(dataBiggerSalary['SalaryAverage'])

So the results show that the average salary in these states (NY, NJ and CA) is not normally distributed.

Let's see whether we can find some interesting and useful information in our dataset other than salary:

In [None]:
print(data.columns)
# Bar chart of postings per sector, most frequent sector first.
sector_order = data['Sector'].value_counts().index
print(sns.countplot(y='Sector', data=data, order=sector_order))

In [None]:
# Bar chart of postings per state, most frequent state first.
state_order = data['State'].value_counts().index
print(sns.countplot(y='State', data=data, order=state_order))

In [None]:
# Bar chart of postings per company size, most frequent size first,
# followed by the number of postings per company.
size_order = data['Size'].value_counts().index
print(sns.countplot(y='Size', data=data, order=size_order))
company_counts = data["Company Name"].value_counts()
print(company_counts)

In [None]:
# Bar chart of postings per rating value on a larger canvas, most frequent rating first.
plt.figure(figsize=(15, 16))
rating_order = data['Rating'].value_counts().index
print(sns.countplot(x='Rating', data=data, order=rating_order))

From the plots above we can see that most of the job positions are in the IT and Business Services sectors, come from companies like IBM, Amazon, Apple and Facebook, and are located in states like CA and TX.