# Name : Jay Shah
# Date : 05-08-2021
## San Francisco Salary Analysis

In [None]:
!pip install probscale

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from pylab import *
from scipy import stats
import probscale
import plotly.offline as pyo
pyo.init_notebook_mode()

In [None]:
# Load the salary records from the Kaggle input directory and display them.
salaries_csv = '/kaggle/input/sf-salaries/Salaries.csv'
df = pd.read_csv(salaries_csv)
df

In [None]:
# Preview the first 15 rows of the dataset.
df.head(15)

In [None]:
# Preview the last 15 rows of the dataset.
df.tail(15)

#### Printing the names of the columns

In [None]:
# Show the column labels of the DataFrame.
df.columns

In [None]:
# df.shape is a (rows, columns) pair; unpack it once and report both numbers.
n_rows, n_cols = df.shape
print("Total number of rows in dataset: ", n_rows)
print("Total number of columns in dataset: ", n_cols)

#### Checking the datatypes

In [None]:
# Show the data type pandas inferred for each column.
df.dtypes

#### Checking the null values in the dataset

In [None]:
# Count the missing values in each column (isna is the same check as isnull;
# sum() adds down the rows by default).
df.isna().sum()

#### From the above output it is clearly apparent that all the values of the Notes column are empty, so we can drop that column for further analysis. The majority of the values in the Status column are also empty, but that column is kept for now because it is inspected below. The Id column is not required either, so we remove it as well.

In [None]:
# Remove, in place, the two columns that are not used in the rest of the analysis.
unused_columns = ['Notes', 'Id']
df.drop(unused_columns, axis='columns', inplace=True)
df

#### From the above dataframe, it is visible that some rows contain the placeholder text 'Not provided' instead of a real value, which is equivalent to a missing (NA) value. Hence we need to remove those rows.

In [None]:
# Drop every row whose employee name or job title is the placeholder text
# 'Not provided'.
#
# Both conditions are combined into a single boolean mask, and reset_index()
# is used in its returning (non-inplace) form. `df` is therefore bound to a
# fresh, independent DataFrame rather than a filtered slice of the original,
# which avoids pandas' chained-assignment warning when columns of `df` are
# reassigned later in the notebook.
has_name = df['EmployeeName'] != 'Not provided'
has_title = df['JobTitle'] != 'Not provided'
df = df[has_name & has_title].reset_index(drop=True)
df

#### Carrying out the statistical analysis on the numeric columns

In [None]:
# Summary statistics for the DataFrame's columns.
df.describe()

#### Checking the unique values in dataset

In [None]:
# Report how many distinct values each of the selected columns holds.
# Each pair is (label shown in the message, column name in df).
for label, column in (
    ('Job-Title', 'JobTitle'),
    ('Status', 'Status'),
    ('Agency', 'Agency'),
    ('Year', 'Year'),
):
    print(f'Total number of unique values in {label} column are:', df[column].nunique())

In [None]:
# List the distinct values of the same four columns.
# Each pair is (label shown in the output, column name in df).
for label, column in (
    ("Job-Title", 'JobTitle'),
    ("Status", 'Status'),
    ("Agency", 'Agency'),
    ("Year", 'Year'),
):
    print(f"{label}:", df[column].unique())

#### Top-10 jobs with the largest number of people

In [None]:
# Lower-case the titles so that spellings differing only in case are counted
# together (this overwrites the JobTitle column of df).
df['JobTitle'] = df['JobTitle'].str.lower()

# Ten most frequent job titles, most common first.
value_counts = df['JobTitle'].value_counts().head(10)

# Turn the counts into a two-column table: the titles come from the index,
# the counts from the values.
df_value_counts = (
    value_counts
    .rename_axis('Job Title')
    .reset_index(name='Number of people in that job')
)

# Capitalise the first letter of each title for display.
df_value_counts['Job Title'] = df_value_counts['Job Title'].str.capitalize()
df_value_counts

In [None]:
# Bar chart of the ten most common job titles.
plt.figure(figsize=(22, 10))

# Keep the raw counts in `values`: they are printed here and reused by the
# pie-chart cell that follows.
count_label = 'Number of people in that job'
values_column = df_value_counts[count_label]
values = values_column.values
print(values)

# One colour per bar.
colors = [
    "#0DFF04", "#04FFCD", "#FF0B04", "#4374B3", "#FF9B04",
    "#FA0FB4", "#9b59b6", "#006a4e", "#393166", "#D5FF04",
]
sns.barplot(data=df_value_counts, x='Job Title', y=count_label, palette=colors)
plt.title('Top-10 jobs in which there are majority of the people')
plt.show()

#### Percentage of people working in top-10 jobs

In [None]:
import plotly.express as px

# Pie chart of the top-10 job titles; the slice sizes are the counts held in
# `values` from the bar-chart cell.
fig = px.pie(data_frame=df_value_counts, names='Job Title', values=values)
fig.show()

In [None]:
# Scatter plot of TotalPayBenefits against TotalPay, coloured by Year.
plt.figure(figsize=(16, 8))
colors = ["#FF0B04", "#4374B3", "#FF9B04", '#FA0FB4']
sns.scatterplot(
    data=df,
    x='TotalPay',
    y='TotalPayBenefits',
    hue='Year',
    palette=colors,
)
plt.title('Year-Wise Scatter Plot of Total Pay Benefits vs Total Pay')
plt.show()

## Dist-Plot for continuous variables TotalPay and TotalPayBenefits

In [None]:
# Four views of the TotalPay distribution on a 2x2 grid.
#
# sns.distplot is deprecated in seaborn, so each panel uses its documented
# replacement (histplot / kdeplot). The panels are selected with plt.subplot
# rather than the bare `subplot` name that is only available through
# `from pylab import *`.
plt.figure(figsize=(16, 8))
x = df['TotalPay']

# Top-left: density-normalised histogram with a KDE curve on top.
# cut=3 lets the KDE curve extend past the data limits, as distplot did.
plt.subplot(2, 2, 1)
ax = sns.histplot(x=x, kde=True, stat="density", kde_kws=dict(cut=3))

# Top-right: KDE curve only (no histogram, no rug).
plt.subplot(2, 2, 2)
ax = sns.kdeplot(x=x)

# Bottom-left: same as the first panel, with the values on the y-axis.
plt.subplot(2, 2, 3)
ax = sns.histplot(y=x, kde=True, stat="density", kde_kws=dict(cut=3))

# Bottom-right: filled KDE in red (`fill` replaces the deprecated `shade`).
plt.subplot(2, 2, 4)
ax = sns.kdeplot(x=x, fill=True, color="r")

plt.show()

In [None]:
# Four views of the TotalPayBenefits distribution on a 2x2 grid.
#
# sns.distplot is deprecated in seaborn, so each panel uses its documented
# replacement (histplot / kdeplot). The panels are selected with plt.subplot
# rather than the bare `subplot` name that is only available through
# `from pylab import *`.
plt.figure(figsize=(16, 8))
x = df['TotalPayBenefits']

# Top-left: density-normalised histogram with a KDE curve on top.
# cut=3 lets the KDE curve extend past the data limits, as distplot did.
plt.subplot(2, 2, 1)
ax = sns.histplot(x=x, kde=True, stat="density", kde_kws=dict(cut=3))

# Top-right: KDE curve only (no histogram, no rug).
plt.subplot(2, 2, 2)
ax = sns.kdeplot(x=x)

# Bottom-left: same as the first panel, with the values on the y-axis.
plt.subplot(2, 2, 3)
ax = sns.histplot(y=x, kde=True, stat="density", kde_kws=dict(cut=3))

# Bottom-right: filled KDE in red (`fill` replaces the deprecated `shade`).
plt.subplot(2, 2, 4)
ax = sns.kdeplot(x=x, fill=True, color="r")

plt.show()

## Box-Plot for continuous variables TotalPay and TotalPayBenefits

In [None]:
# Side-by-side box plots of TotalPay and TotalPayBenefits.
plt.figure(figsize=(16, 8))

# Reshape the two pay columns to long form: one row per value, with
# IncomeType naming the source column and Income holding the amount.
interim_df = df[['TotalPay', 'TotalPayBenefits']].melt(
    var_name='IncomeType',
    value_name='Income',
)

colors = ["#9b59b6", "#006a4e"]
sns.boxplot(data=interim_df, x="IncomeType", y="Income", palette=colors)
plt.show()

## Quantile Plots for continuous variables TotalPay and TotalPayBenefits

#### Quantile Plots: Quantile plots are similar to probability plots. The main difference is that the plotting positions are converted into quantiles or Z-scores based on a probability distribution. The default distribution is the standard-normal distribution.

In [None]:
# Normal quantile plot of TotalPay.
# plot_pos gives the plotting positions together with the values they belong to.
position, totalpay = probscale.plot_pos(df['TotalPay'])

# Map the plotting positions to quantiles of the standard normal distribution.
quantile = stats.norm.ppf(position)

fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(quantile, totalpay, linestyle='none', marker='.', label='Total Pay')

# NOTE(review): a log-scaled axis cannot display zero or negative values -
# confirm TotalPay contains none, otherwise those points will not appear.
ax.set(xlabel='Normal Quantiles', ylabel='Total-Pay', yscale='log')
sns.despine()

In [None]:
# Normal quantile plot of TotalPayBenefits.
# plot_pos gives the plotting positions together with the values they belong to.
position, totalpaybenefits = probscale.plot_pos(df['TotalPayBenefits'])

# Map the plotting positions to quantiles of the standard normal distribution.
quantile = stats.norm.ppf(position)

fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(quantile, totalpaybenefits, linestyle='none', marker='.', label='Total Pay Benefits')

# NOTE(review): a log-scaled axis cannot display zero or negative values -
# confirm TotalPayBenefits contains none, otherwise those points will not appear.
ax.set(xlabel='Normal Quantiles', ylabel='TotalPay-Benefits', yscale='log')
sns.despine()

## Violin Plot of continuous variables TotalPay and TotalPayBenefits

In [None]:
# Violin plot of the TotalPay distribution.
plt.figure(figsize=(16, 8))
colors = ["#FF0B04"]

# Only one violin is drawn, so its colour is passed through `color`.
# `palette` is intended for colouring the levels of a `hue` variable, and
# passing it without `hue` is deprecated in recent seaborn releases.
sns.violinplot(x=df["TotalPay"], color=colors[0])
plt.show()

In [None]:
# Violin plot of the TotalPayBenefits distribution.
plt.figure(figsize=(16, 8))
colors = ["#17EB10"]

# Only one violin is drawn, so its colour is passed through `color`.
# `palette` is intended for colouring the levels of a `hue` variable, and
# passing it without `hue` is deprecated in recent seaborn releases.
sns.violinplot(x=df["TotalPayBenefits"], color=colors[0])
plt.show()

#### Number of people doing job each year

In [None]:
# Number of records per year, as a two-column table: the years come from the
# index of the counts, the counts from the values.
value_counts = df['Year'].value_counts()
df_value_counts = (
    value_counts
    .rename_axis('Year')
    .reset_index(name='Number of people doing job')
)
df_value_counts

In [None]:
# Bar chart of the number of records in each year.
plt.figure(figsize=(16, 8))

# Keep the raw counts in `values`: they are printed here and reused by the
# pie-chart cell that follows.
count_label = 'Number of people doing job'
values_column = df_value_counts[count_label]
values = values_column.values
print(values)

# One colour per bar.
colors = ["#0DFF04", "#04FFCD", "#FF0B04", "#4374B3"]
sns.barplot(data=df_value_counts, x='Year', y=count_label, palette=colors)
plt.title('Number of people doing job each year')
plt.show()

#### From the above graph, we can conclude that the number of people doing a job is increasing substantially each year.

In [None]:
import plotly.express as px

# Pie chart of the yearly share of records; the slice sizes are the counts
# held in `values` from the bar-chart cell.
fig = px.pie(data_frame=df_value_counts, names='Year', values=values)
fig.show()