In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from wordcloud import WordCloud, STOPWORDS

%matplotlib inline

# Read the job-postings data from csv.
df = pd.read_csv("../input/data-analyst-jobs/DataAnalyst.csv", engine="python", encoding="UTF-8")

# Build one text corpus from all job descriptions.
# Missing descriptions are skipped and every value is cast to str, so a NaN
# cell cannot make " ".join() fail with a TypeError.
text = " ".join(df["Job Description"].dropna().astype(str))

# Start from WordCloud's default stop words and add a few extra words to exclude.
stopwords = set(STOPWORDS)
stopwords.update(["including", "understand", "must", "use"])

word_cloud = WordCloud(
    stopwords=stopwords,
    max_font_size=100,
    max_words=50,
    background_color="black",
    min_font_size=8,
    width=1300,
    height=500,
).generate(text)

# Render the cloud as an image without axes or padding.
sns.set_context(context="paper")
plt.figure(figsize=[14, 4], dpi=150)
plt.imshow(word_cloud, interpolation="bilinear")
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Remove observations having -1 values in any of these columns.
# "Revenue" and "Sector" are compared with the string "-1", "Rating" and
# "Founded" with the number -1.
has_required_fields = (
    (df["Revenue"] != "-1")
    & (df["Sector"] != "-1")
    & (df["Rating"] != -1)
    & (df["Founded"] != -1)
)
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the original frame (SettingWithCopyWarning).
df = df[has_required_fields].copy()

# Replace placeholder values by the column mode (computed on the filtered rows).
df["size"] = df["Size"].replace("Unknown", df["Size"].mode()[0])
df["salary_estimate"] = df["Salary Estimate"].replace("-1", df["Salary Estimate"].mode()[0])

# Exact-match replace: only cells that are exactly "-1" become "Unknown".
# (A .str.replace would also rewrite "-1" occurring inside a longer value.)
df["easy_apply"] = df["Easy Apply"].replace("-1", "Unknown")

# Split the salary range into numeric lower / upper bounds.
# Decorations are removed as literal substrings (regex=False):
#   * str.rstrip('(Glassdoor est.)') treats its argument as a SET of characters
#     to strip, not as a suffix, so it only worked by coincidence;
#   * '$' is the end-of-string anchor when the pattern is read as a regex.
df["Salary Estimate"] = (
    df["Salary Estimate"]
    .str.replace("(Glassdoor est.)", "", regex=False)
    .str.replace("K", "", regex=False)
    .str.replace("$", "", regex=False)
    .str.strip()
)

# Capture "<min>-<max>". Values that do not have this shape (for example the
# "-1" placeholder, which a plain split("-") turned into min="" / max="1")
# yield NaN for both bounds.
salary_bounds = df["Salary Estimate"].str.extract(r"^(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)")

# Convert the datatype to numeric
df["min_salary_range"] = pd.to_numeric(salary_bounds[0])
df["max_salary_range"] = pd.to_numeric(salary_bounds[1])

# Get the trailing component of each location: the text after the last ", ".
# str.extract returns NaN for values without a ", " separator, where
# x.split(", ")[1] raised IndexError. For values with more than two components
# the LAST one is taken (split(...)[1] picked the middle one).
LAST_COMPONENT = r", ([^,]+)$"
df["job_location_country"] = df["Location"].str.extract(LAST_COMPONENT, expand=False).str.strip()
df["headquarters_country"] = df["Headquarters"].str.extract(LAST_COMPONENT, expand=False).str.strip()

# Drop columns that are no longer needed. Re-binding `df` instead of using
# inplace=True keeps the statement free of hidden mutation.
df = df.drop(columns=["Salary Estimate", "Unnamed: 0", "Job Description"])

# Remove unwanted string values.
# Each column is cleaned in ONE chain. Previously the .str.strip() statement
# re-read the raw column and thereby discarded the result of the preceding
# .str.replace(). Substrings are matched literally (regex=False) so the result
# does not depend on the pandas version's default for `regex`.
df["revenue"] = df["Revenue"].str.replace("(USD)", "", regex=False).str.strip()

# "size" is cleaned from the existing lower-case "size" column (not from raw
# "Size") so that substitutions already applied to it are kept.
df["size"] = df["size"].str.replace("employees", "", regex=False).str.strip()

df.head()

In [None]:
# Create bins for Rating values
def change(x):
    """Bin a rating into an integer level from 0 to 5.

    Bins are closed on the right:
    x <= 0.5 -> 0, (0.5, 1.5] -> 1, (1.5, 2.5] -> 2, (2.5, 3.5] -> 3,
    (3.5, 4.5] -> 4 and anything above 4.5 -> 5.

    The value 0.5 itself used to match no branch (``x < 0.5`` followed by
    ``x > 0.5``) and fell through to 5; it now belongs to bin 0.

    Parameters
    ----------
    x : float
        Rating value.

    Returns
    -------
    int
        Bin level. NaN compares false against every edge and is therefore
        returned as 5, as before.
    """
    upper_edges = (0.5, 1.5, 2.5, 3.5, 4.5)
    for level, edge in enumerate(upper_edges):
        if x <= edge:
            return level
    return 5
    
# Apply the binning function to every "Rating" value.
df['rating'] = df['Rating'].map(change)

In [None]:
# Convert "Size" values to numeric codes.
# NOTE: the categories are inferred from the string labels, so the codes follow
# label-sort order, which is not necessarily head-count order.
df["size"] = df["Size"].astype("category")
df["size_level"] = df["size"].cat.codes

# Convert "Type of ownership" values to numeric codes.
df["Type of ownership"] = df["Type of ownership"].astype("category")
df["Ownership_level"] = df["Type of ownership"].cat.codes

# Create the correlation matrix from numeric / boolean columns only.
# DataFrame.corr() no longer drops text columns silently in pandas >= 2.0;
# selecting the dtypes explicitly works on every pandas version.
df_corr_matrix = df.select_dtypes(include=["number", "bool"]).corr()

fig, axes = plt.subplots(figsize=(12, 4))
sns.set_context(context="paper", font_scale=2)
# `linewidths` is seaborn's documented parameter for the cell separators;
# `ax=axes` binds the heatmap to the figure created above.
sns.heatmap(df_corr_matrix, linecolor="white", linewidths=1, ax=axes)
axes.set_title("Correlation between features")

In [None]:
# Bar plot: foundation year vs. number of employees.
fig, axes = plt.subplots(figsize=(12, 4))
sns.set_context(context="paper", font_scale=2)
sns.set_style(style="whitegrid")
sns.barplot(x=df["Size"], y=df["Founded"], ax=axes)

# Rotate the tick labels that barplot generated. They must not be set by hand
# from df.Size before plotting: that series has one entry per row, not one per
# bar, and barplot replaces the labels anyway.
plt.setp(axes.get_xticklabels(), rotation=20, ha="right")

axes.set_xlabel("Number of employees")
axes.set_ylabel("Foundation year")
axes.set_title("Foundation year and number of employees")
axes.set_ylim(1900, 2020)

In [None]:
# Group the postings by job location.
by_job_location_country = df.groupby("job_location_country")
job_location_country_count = by_job_location_country.count()
# numeric_only=True: pandas >= 2.0 no longer drops text columns silently when
# averaging and raises instead.
job_location_country_mean = by_job_location_country.mean(numeric_only=True)

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(15, 9), squeeze=False)
sns.set_style(style="darkgrid")
sns.set_context(context="paper", font_scale=2)
salary_ax, count_ax = axes[0, 0], axes[1, 0]

# Line styling shared by both salary curves.
# alpha is an opacity and must lie in [0, 1]; the former value 2 is invalid.
marker_style = dict(
    linewidth=3,
    alpha=1,
    marker="o",
    markersize=10,
    markerfacecolor="green",
    markeredgewidth=2,
    markeredgecolor="yellow",
)

# Line plot: salary range vs. job location.
# x / y are passed by keyword; recent seaborn releases reject them positionally.
sns.lineplot(
    x=job_location_country_mean.index,
    y=job_location_country_mean["min_salary_range"],
    color="#fa1302",
    label="Min Salary Range",
    ax=salary_ax,
    **marker_style,
)
sns.lineplot(
    x=job_location_country_mean.index,
    y=job_location_country_mean["max_salary_range"],
    color="#15217d",
    label="Max Salary Range",
    ax=salary_ax,
    **marker_style,
)

salary_ax.set_xlabel("")
salary_ax.set_ylabel("Salary range ($1000 USD)")
salary_ax.set_title("Changes in salary range over countries")
salary_ax.legend(loc=(0.7, 0.75))
# Hide the upper panel's x tick labels; the lower panel carries them.
# Done after plotting so the plot call cannot restore them.
salary_ax.tick_params(axis="x", labelbottom=False)

# Bar plot: number of non-null "Company Name" entries per job location.
sns.barplot(
    x=job_location_country_count.index,
    y=job_location_country_count["Company Name"],
    ax=count_ax,
)
# Rotate the labels barplot generated instead of pre-setting them by hand.
plt.setp(count_ax.get_xticklabels(), rotation=90, ha="right")

count_ax.set_ylabel("Count of companies")
count_ax.set_title("Count of companies in the same countries")
count_ax.set_xlabel("Country")

plt.tight_layout()

In [None]:
# Group the postings by sector.
by_sector = df.groupby("Sector")
# numeric_only=True: pandas >= 2.0 no longer drops text columns silently when
# averaging and raises instead.
sector_mean = by_sector.mean(numeric_only=True)

fig, axes = plt.subplots(figsize=(20, 4))
sns.set_style(style="darkgrid")
sns.set_context(context="paper", font_scale=2)

# Line styling shared by both salary curves.
# alpha is an opacity and must lie in [0, 1]; the former value 2 is invalid.
marker_style = dict(
    linewidth=3,
    alpha=1,
    marker="o",
    markersize=10,
    markerfacecolor="green",
    markeredgewidth=2,
    markeredgecolor="yellow",
)

# Line plots: industry sector vs. salary range.
# x / y are passed by keyword; recent seaborn releases reject them positionally.
sns.lineplot(
    x=sector_mean.index,
    y=sector_mean["min_salary_range"],
    color="#fa1302",
    label="Min Salary Range",
    ax=axes,
    **marker_style,
)
sns.lineplot(
    x=sector_mean.index,
    y=sector_mean["max_salary_range"],
    color="#15217d",
    label="Max Salary Range",
    ax=axes,
    **marker_style,
)

# Rotate the tick labels after plotting, once they exist for the plotted sectors.
plt.setp(axes.get_xticklabels(), rotation=70, ha="right")

axes.set_xlabel("Sectors")
axes.set_ylabel("Salary range ($1000 USD)")  # stray closing parenthesis removed
axes.set_title("Sectorwise distribution of salary")
axes.legend(loc=(0.5, 0.72))

In [None]:
# Per-sector count of non-null values in every column.
sector_count = by_sector.count()

fig, axes = plt.subplots(figsize=(20, 4))
sns.set_style(style="darkgrid")
sns.set_context(context="paper", font_scale=2)

# Bar plot: sector vs. number of non-null "Company Name" entries.
# x / y are passed by keyword; recent seaborn releases reject them positionally.
sns.barplot(x=sector_count.index, y=sector_count["Company Name"], ax=axes)

# Rotate the labels barplot generated instead of pre-setting them by hand.
plt.setp(axes.get_xticklabels(), rotation=70, ha="right")

axes.set_xlabel("Sectors")
axes.set_ylabel("Count of companies")
axes.set_title("Number of companies in each sector")

In [None]:
# Group the postings by revenue band.
by_revenue = df.groupby("Revenue")
# numeric_only=True: pandas >= 2.0 no longer drops text columns silently when
# averaging and raises instead.
revenue_mean = by_revenue.mean(numeric_only=True).sort_values(by="Rating")
revenue_count = by_revenue.count().sort_values(by="Company Name")
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 10), squeeze=False)

sns.set_style(style="whitegrid")
sns.set_context(font_scale=2)
rating_ax, count_ax = axes[0, 0], axes[0, 1]

# One fixed colour per revenue band, so a band keeps its colour in both panels
# although the two panels are sorted differently.
color= {'Less than $1 million (USD)':'#c497a0','$1 to $5 million (USD)':'#db8495','$5 to $10 million (USD)':'#db5e77',
        '$10 to $25 million (USD)':"#de3c5c",'$25 to $50 million (USD)':'#d61e42','$50 to $100 million (USD)':'#e30b36',
        '$100 to $500 million (USD)':'#fc0334','$500 million to $1 billion (USD)':'#8ebfb5', '$1 to $2 billion (USD)':'#71bdad',
        '$2 to $5 billion (USD)':'#32ad93','$5 to $10 billion (USD)':'#15d1a9','$10+ billion (USD)':'#05fac6',
        'Unknown / Non-Applicable':'#181c1b'
       }

# Bar plot: organization revenue vs. mean rating.
sns.barplot(x=revenue_mean.index, y=revenue_mean["Rating"], palette=color, ax=rating_ax)
# Rotate the labels barplot generated instead of pre-setting them by hand.
plt.setp(rating_ax.get_xticklabels(), rotation=70, ha="right")
rating_ax.set_xlabel("Revenue")
rating_ax.set_ylabel("Rating")
rating_ax.set_title("Impact of revenue on ratings")
rating_ax.set_ylim(3, 4.25)

# Bar plot: organization revenue vs. number of non-null "Company Name" entries.
sns.barplot(x=revenue_count.index, y=revenue_count["Company Name"], palette=color, ax=count_ax)
plt.setp(count_ax.get_xticklabels(), rotation=70, ha="right")
count_ax.set_xlabel("Revenue")
count_ax.set_ylabel("Count of companies")
count_ax.set_title("Distribution of companies over revenue")
count_ax.set_ylim(5, 240)

plt.tight_layout()


In [None]:
# Average the numeric columns per job title.
# numeric_only=True: pandas >= 2.0 no longer drops text columns silently when
# averaging and raises instead.
by_Job_title = df.groupby("Job Title")
job_title_mean = by_Job_title.mean(numeric_only=True)

# Highest mean maximum salary first / lowest mean minimum salary first.
job_title_max_sort = job_title_mean.sort_values(by=["max_salary_range"], ascending=False)
job_title_min_sort = job_title_mean.sort_values(by=["min_salary_range"])

# Keep the first 20 rows of each ordering.
job_title_max_sort_top_20 = job_title_max_sort.head(20)
job_title_min_sort_bottom_20 = job_title_min_sort.head(20)

In [None]:
# Turn the "Job Title" index into a column and give each title a short numeric
# code, so the x axis can show the code instead of the full title.
job_title_max_sort_top_20 = job_title_max_sort_top_20.reset_index()
job_title_max_sort_top_20["Job Title"] = job_title_max_sort_top_20["Job Title"].astype("category")
job_title_max_sort_top_20["Level_JobTitle"] = job_title_max_sort_top_20["Job Title"].cat.codes

# code -> job title lookup, printed below the chart as its legend.
Mapping = dict(enumerate(job_title_max_sort_top_20["Job Title"].cat.categories))

fig, axes = plt.subplots(figsize=(12, 5))
sns.set_style(style="darkgrid")
sns.set_context(font_scale=0.5)
# ax=axes binds the bars to the figure created above rather than to whatever
# axes happens to be current.
sns.barplot(
    x=job_title_max_sort_top_20["Level_JobTitle"],
    y=job_title_max_sort_top_20["max_salary_range"],
    ax=axes,
)
axes.set_title("Top 20 job titles gaining maximum salary ")
axes.set_xlabel("Job title level")  # label typo ("Jot") corrected
axes.set_ylabel("Salary ($1000 USD)")
axes.set_ylim(140, 200)
plt.tight_layout()
print(Mapping)


In [None]:
# Turn the "Job Title" index into a column and give each title a short numeric
# code, so the x axis can show the code instead of the full title.
job_title_min_sort_bottom_20 = job_title_min_sort_bottom_20.reset_index()
job_title_min_sort_bottom_20["Job Title"] = job_title_min_sort_bottom_20["Job Title"].astype("category")
job_title_min_sort_bottom_20["Level_JobTitle"] = job_title_min_sort_bottom_20["Job Title"].cat.codes

# code -> job title lookup, printed below the chart as its legend.
Mapping = dict(enumerate(job_title_min_sort_bottom_20["Job Title"].cat.categories))

fig, axes = plt.subplots(figsize=(12, 5))
sns.set_style(style="darkgrid")
sns.set_context(font_scale=0.5)
# ax=axes binds the bars to the figure created above rather than to whatever
# axes happens to be current.
sns.barplot(
    x=job_title_min_sort_bottom_20["Level_JobTitle"],
    y=job_title_min_sort_bottom_20["min_salary_range"],
    ax=axes,
)

axes.set_xlabel("Job title level")  # label typo ("Jot") corrected
axes.set_ylabel("Salary ($1000 USD)")
axes.set_title("Bottom 20 Job title with minimum salary")
axes.set_ylim(22, 28)
plt.tight_layout()
print(Mapping)