<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list"  role="tab" aria-controls="home">Table of contents</h3>

* [Introduction](#intro)
* [Data and Preprocessing](#data)
* [1. Top 15 Companies](#1)
* [2. Top 15 Job titles](#2)
* [3. Top Industry](#3)
* [4. Top Sector](#4)
* [5. Rating](#5)
* [6. Type of Ownership](#6)
* [7. Top States](#7)
* [8. Top HQ States](#8)
* [9. Revenue](#9)
* [10. Most suffered City](#10)
* [11. Flee or not?](#11)
* [12. Armed or not?](#12)
* [13. Black peoples suffered](#13)
* [14. All race victims in US](#14)
* [15. Relation between Age and Race](#15) 

<a id="intro"></a>
<font size="+2" color="blue"><b>Importing Packages and Libraries</b></font><br>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import matplotlib.pyplot as plt
import plotly.express as px
import string
import re
import seaborn as sns
import plotly.graph_objects as go
import plotly.figure_factory as ff

In [None]:
# Load the job postings; index_col=0 uses the first CSV column as the row index.
data_df = pd.read_csv("../input/data-analyst-jobs/DataAnalyst.csv",index_col=0)


In [None]:
data_df.info() # There are 15 relevant columns

In [None]:
data_df.describe(include="all").T

In [None]:
# Two-letter codes for the 50 US states plus DC (51 entries).
# The preprocessing cell checks HQState against this list to flag US headquarters.
states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

<a id="data"></a>
<font size="+2" color="blue"><b>Preprocessing</b></font><br>

* Extract the minimum and maximum salary from the salary range, replace `K` with `000`, and convert both to integers for numerical operations
* Replace the `-1` placeholder in Revenue with 'Unspecified', and convert Easy Apply to a boolean (`-1` becomes `False`)
* Extract State from Location
* Extract the headquarters state (HQState) from Headquarters
* Strip the trailing suffix from the company name
* Mark unspecified Industry, Sector and Type of ownership values (`-1`) as 'Unspecified'

In [None]:
# Preprocessing: derive numeric salary bounds, normalise '-1' placeholders and
# extract state / company-name fields. Every step is written so that re-running
# this cell on the already-processed frame leaves it unchanged.


def _salary_bound(salary_range, position):
    """Return one bound of a '$37K-$66K'-style range as an int (0 = min, 1 = max)."""
    bound = salary_range.split('-')[position].replace('$', '')
    return int(bound.replace('K', '000'))


def _hq_state(headquarters):
    """Return the text after the first comma of a 'City, ST' value, else 'Unspecified'."""
    if not isinstance(headquarters, str) or headquarters == '-1':
        return 'Unspecified'
    parts = headquarters.split(',')
    # A value without a comma has no state part; indexing [1] would raise IndexError.
    return parts[1].strip() if len(parts) > 1 else 'Unspecified'


# '-1' marks a missing estimate; map it to a zero range so the parsing below succeeds.
data_df['Salary Estimate'] = data_df['Salary Estimate'].apply(lambda x: '0K-0K' if x == '-1' else x)
# Keep only the text before the first '(' (drops any trailing parenthesised note).
data_df['Salary Estimate'] = data_df['Salary Estimate'].apply(lambda x: x.split('(')[0])
data_df['Min Salary'] = data_df['Salary Estimate'].apply(_salary_bound, position=0)
data_df['Max Salary'] = data_df['Salary Estimate'].apply(_salary_bound, position=1)

data_df['Revenue'] = data_df['Revenue'].apply(lambda x: 'Unspecified' if x == '-1' else x)
# Anything other than the '-1' placeholder counts as True. Also accepting an existing
# False keeps the column stable when the cell is executed a second time.
data_df['Easy Apply'] = data_df['Easy Apply'].apply(lambda x: x not in ('-1', False))
data_df['State'] = data_df['Location'].apply(lambda x: x[-2:])
data_df['HQState'] = data_df['Headquarters'].apply(_hq_state)
data_df['Type of ownership'] = data_df['Type of ownership'].apply(lambda x: 'Unspecified' if x == '-1' else x)
data_df['US Job'] = data_df['HQState'].apply(lambda x: x.strip() in states)
data_df['Industry'] = data_df['Industry'].apply(lambda x: 'Unspecified' if x == '-1' else x)
data_df['Sector'] = data_df['Sector'].apply(lambda x: 'Unspecified' if x == '-1' else x)

# NOTE(review): the name appears to carry a newline-separated suffix (the old code
# cut the last 4 characters) - confirm against the raw CSV. Cutting at the newline
# leaves names without a suffix intact and is safe to repeat; blindly slicing
# [:-4] truncated real names. Missing names become '' as before.
data_df['Company Name'] = (
    data_df['Company Name'].fillna('').astype(str).str.split('\n').str[0]
)
data_df.head(3)

<a id="1"></a>
<font size="+2" color="blue"><b>Top 15 companies (by number of jobs posted)</b></font><br>

In [None]:
# Fifteen companies with the most postings, drawn as a labelled vertical bar chart.
company = data_df['Company Name'].value_counts().nlargest(n=15)

company_trace_style = dict(
    texttemplate='%{text:.2s}',
    textposition='outside',
    marker_line_color='rgb(8,48,107)',
    marker_line_width=1.5,
    opacity=0.7,
)
company_layout = dict(
    width=1000,
    showlegend=False,
    xaxis_title="Company",
    yaxis_title="Count",
    title="Top 15 Companies by Job count",
)

fig = px.bar(
    x=company.index,
    y=company.values,
    orientation='v',
    color=company.index,  # one colour per company
    text=company.values,  # count shown above each bar
    color_discrete_sequence=px.colors.qualitative.Bold,
)
fig.update_traces(**company_trace_style)
fig.update_layout(**company_layout)
fig.show()

<a id="2"></a>
<font size="+2" color="blue"><b>Top 15 Job titles</b></font><br>

In [None]:
# Fifteen most frequent job titles, drawn as a labelled vertical bar chart.
jobs = data_df['Job Title'].value_counts().nlargest(n=15)

fig = px.bar(
    x=jobs.index,
    y=jobs.values,
    orientation='v',
    color=jobs.index,  # one colour per title
    text=jobs.values,  # count shown above each bar
    color_discrete_sequence=px.colors.qualitative.Bold,
)
fig.update_traces(
    texttemplate='%{text:.2s}',
    textposition='outside',
    marker_line_color='rgb(8,48,107)',
    marker_line_width=1.5,
    opacity=0.7,
)
fig.update_layout(
    width=1000,
    showlegend=False,
    xaxis_title="Job Title",
    yaxis_title="Count",
    title="Top 15 Job titles",
)
fig.show()

<a id="3"></a>
<font size="+2" color="blue"><b>Top 10 'Industry' of jobs posted</b></font><br>

In [None]:
# Share of postings among the ten most common industries.
# The section heading and chart title both say "Top 10", so take 10 (the cell
# previously took 15, contradicting its own title).
industry = data_df['Industry'].value_counts().nlargest(n=10)
fig = px.pie(
    industry,
    values=industry.values,
    names=industry.index,
    title="Top 10 'Industry' of the Jobs posted",
    color=industry.values,
    color_discrete_sequence=px.colors.qualitative.Prism,
)
fig.update_traces(opacity=0.7, marker_line_color='rgb(8,48,107)', marker_line_width=1.5)
fig.update_layout(title_x=0.5)  # centre the title
fig.show()

<a id="4"></a>
<font size="+2" color="blue"><b>Top 10 'Sector' of jobs posted</b></font><br>

In [None]:
# Ten most common sectors as a seaborn bar chart, each bar annotated with its count.
top_sectors = data_df['Sector'].value_counts()[0:10]

g = sns.barplot(y=top_sectors, x=top_sectors.index)
g.set_title("Top 10 sectors of the jobs posted")
g.set_xticklabels(labels=top_sectors.index, rotation=90)

for bar in g.patches:
    bar_height = bar.get_height()
    # Write the count just above the bar, 0.3 to the right of its left edge.
    g.annotate(f'{bar_height:.0f}',
               (bar.get_x() + 0.3, bar_height),
               ha='center',
               va='bottom',
               color='black')

<a id="5"></a>
<font size="+2" color="blue"><b>Rating</b></font><br>

In [None]:
# Postings whose Rating is 4.0 or higher.
top_rating = data_df.loc[data_df['Rating'] >= 4.0]
top_rating.head(3)

In [None]:
# Postings whose Rating is below 0.5.
# NOTE(review): other columns use -1 as a "missing" marker; if Rating does too,
# this selects unrated rows rather than poorly rated ones - confirm.
low_rating = data_df.loc[data_df['Rating'] < 0.5]
low_rating.head(3)

In [None]:
# Count postings in five equal-width Rating bins.
# value_counts() orders by frequency, which would draw the bins out of rating
# order; sort_index() restores ascending bin order. Computed once, reused below.
# NOTE(review): other columns use -1 as a "missing" marker; if Rating does too,
# those rows fall into the lowest bin - confirm.
rating_bins = data_df.Rating.value_counts(bins=5).sort_index()
labels = rating_bins.index
values = rating_bins.values

plt.figure(figsize = (8, 8))
plt.title("No. of companies across Rating bins")
ax = sns.barplot(x=labels, y=values)
for i, p in enumerate(ax.patches):
    height = p.get_height()
    # Centre the count horizontally on the bar, slightly above its top.
    ax.text(p.get_x()+p.get_width()/2., height + 0.1, values[i],ha="center")

<a id="6"></a>
<font size="+2" color="blue"><b>Type of Ownership</b></font><br>

In [None]:

# Relative frequency (fractions summing to 1) of each ownership type.
# Despite the `_df` suffix, value_counts() returns a Series.
type_ownership_df = data_df['Type of ownership'].value_counts(normalize=True)
type_ownership_df

In [None]:
# Donut chart of postings per ownership type.
# Name both columns explicitly: pandas 2.0 changed what value_counts().reset_index()
# calls them, so relying on the implicit 'index' column breaks on newer versions.
ownership = (
    data_df['Type of ownership']
    .value_counts()
    .rename_axis('index')                    # ownership label column
    .reset_index(name='Type of ownership')   # count column
)
fig = px.pie(ownership, values='Type of ownership', names='index', title="Type of ownership", color_discrete_sequence = px.colors.qualitative.Vivid, hole = 0.5)
fig.show()

<a id="7"></a>
<font size="+2" color="blue"><b>States in which jobs are posted</b></font><br>

In [None]:
# Postings per job-location state as a horizontal bar chart.
# rename_axis/reset_index(name=...) fixes the column names on every pandas version;
# renaming the implicit 'index'/'State' columns produces two 'count' columns on
# pandas >= 2.0.
state_df = (
    data_df['State']
    .value_counts()
    .rename_axis('state')
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

# Reverse the row order once so the largest state is drawn at the top of the
# horizontal chart (plotly draws the first row at the bottom).
state_plot_df = state_df.sort_index(ascending=False)

fig = go.Figure(go.Bar(
    y=state_plot_df['state'],
    x=state_plot_df['count'],
    orientation='h',
    text=state_plot_df['count'],
    textposition='outside',
    marker_color=state_plot_df['count'],  # colour bars by their count
))
fig.update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_layout(title_text='All States - Job Postings',yaxis_title='States',
                 xaxis_title='Total number of Jobs', title_x=0.5,height=1000)
fig.show()

<a id="8"></a>
<font size="+2" color="blue"><b>Headquarters states of the companies posting jobs</b></font><br>

In [None]:
# Postings per headquarters state, restricted to rows flagged as US-headquartered.
US_jobs_df = data_df[data_df['US Job']]

# rename_axis/reset_index(name=...) fixes the column names on every pandas version;
# renaming the implicit 'index'/'HQState' columns produces two 'count' columns on
# pandas >= 2.0.
hqstate_df = (
    US_jobs_df['HQState']
    .value_counts()
    .rename_axis('state')
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

# Reverse the row order once so the largest state is drawn at the top of the
# horizontal chart (plotly draws the first row at the bottom).
hqstate_plot_df = hqstate_df.sort_index(ascending=False)

fig = go.Figure(go.Bar(
    y=hqstate_plot_df['state'],
    x=hqstate_plot_df['count'],
    orientation='h',
    text=hqstate_plot_df['count'],
    textposition='outside',
    marker_color=hqstate_plot_df['count'],  # colour bars by their count
))
fig.update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_layout(title_text='All HeadQuarters States - Job Postings',yaxis_title='States',
                 xaxis_title='Total number of Jobs', title_x=0.5,height=1000)
fig.show()

<a id="9"></a>
<font size="+2" color="blue"><b>Company Revenues </b></font><br>

In [None]:
# Bar plot of Max Salary per Revenue category (seaborn's default bar estimate).
g = sns.catplot(
    data=data_df,
    x="Revenue",
    y="Max Salary",
    kind="bar",
    height=10,
    aspect=12/10,
)
# Revenue labels are long; rotate them so they do not overlap.
g.set_xticklabels(rotation=90)

In [None]:
# Point plot of Min Salary per Revenue category.
g1 = sns.catplot(
    data=data_df,
    x="Revenue",
    y="Min Salary",
    kind="point",
)
# Revenue labels are long; rotate them so they do not overlap.
g1.set_xticklabels(rotation=90)

In [None]:
# Overlaid distributions of the minimum and maximum salary (plotly figure factory).
group_labels = ['Minimum Salary', 'Maximum Salary']  # legend names, same order as hist_data
colors = ['red', 'green']
hist_data = [data_df[column] for column in ('Min Salary', 'Max Salary')]

fig = ff.create_distplot(hist_data, group_labels, colors=colors)
fig.show()


In [None]:
# Min Salary histogram with KDE curve and rug marks.
# Title typo fixed ("Mininum" -> "Minimum"); it is shown on the rendered chart.
sns.distplot(data_df['Min Salary'],rug=True, kde=True, color='g').set_title("Distribution of Minimum Salary for Data Analyst jobs")

In [None]:
# Max Salary histogram only (KDE curve and rug marks switched off).
max_salary_ax = sns.distplot(data_df['Max Salary'], color='r', rug=False, kde=False)
max_salary_ax.set_title("Distribution Plot for Max Salary")

In [None]:
# Distinct headquarters-state values, followed by how many there are.
hq_states = data_df['HQState']
print(hq_states.unique())
print(hq_states.nunique())

In [None]:
# Box plot of Min Salary for every job-location state.
sns.catplot(
    data=data_df,
    x="State",
    y="Min Salary",
    kind="box",
    height=12,
    aspect=16/12,
)  # optional: hue="Easy Apply"

# Group by state and calculate the median of Min Salary and Max Salary

In [None]:
# Per-state salary summaries followed by a violin plot of Max Salary.
# Only the last expression of a cell is rendered, so the intermediate tables are
# passed to display() (provided by the IPython kernel); as bare expressions they
# were computed and then silently discarded.
salary_cols = ['Min Salary', 'Max Salary']

# numeric_only=True keeps the reduction to numeric columns; pandas >= 2.0 no longer
# drops text columns silently and raises on them instead.
group3 = data_df.groupby(['State']).median(numeric_only=True)
display(group3[salary_cols])

group1 = data_df.groupby(['State']).mean(numeric_only=True)
display(group1[salary_cols])

# count() gives the number of non-null values per column; 'Job Title' is used as
# the postings-per-state figure.
group2 = data_df.groupby(['State']).count()
display(group2[['Job Title']])

sns.catplot(x="State", y="Max Salary", kind="violin", data=data_df, height=10, aspect=14/10)# hue="Easy Apply", split=True

In [None]:

# Number of postings (non-null job titles) per industry.
# The previous title, 'Target (accuracy group)', was unrelated to this data; the new
# one mirrors the title of the per-sector chart.
(
    data_df.groupby('Industry')['Job Title']
    .count()
    .plot(kind='bar', figsize=(15, 5), title='Industries and Job titles')
)
plt.show()

In [None]:
# Number of postings (non-null job titles) per sector, as a magenta bar chart.
sector_job_counts = data_df.groupby('Sector')['Job Title'].count()
sector_job_counts.plot(kind='bar', figsize=(15, 5), color = 'm',title='Sectors and Job titles')
plt.show()

In [None]:

def clean_text(text):
    """Normalise free text for word-frequency analysis.

    Steps: drop ASCII punctuation, lower-case, remove digit runs, split on
    non-word characters (so newlines and repeated spaces act as separators) and
    join the remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. None or NaN for a missing
        description) yields an empty string.

    Returns
    -------
    str
        The cleaned, space-separated text.
    """
    if not isinstance(text, str):
        return ""

    # Punctuation is deleted rather than replaced, so "data-driven" -> "datadriven".
    no_punct = "".join(ch.lower() for ch in text if ch not in string.punctuation)
    no_digits = re.sub(r'[0-9]+', '', no_punct)
    # Splitting on \W+ also consumes newlines; empty edge tokens are discarded.
    tokens = [token for token in re.split(r'\W+', no_digits) if token]

    # The previous version returned the untouched input, so no cleaning took effect.
    return " ".join(tokens)

In [None]:
# Run clean_text over every job description and store the result in a new column.
data_df['Clean Job Description'] = data_df['Job Description'].map(clean_text)

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import warnings 
warnings.filterwarnings('ignore')
from PIL import Image
def plot_wordcloud(text, mask=None, max_words=400, max_font_size=120, figure_size=(14.0,12.0), 
                   title = None, title_size=40, image_color=False):
    """Render a word cloud of `text` on a new matplotlib figure.

    Parameters
    ----------
    text : str
        Text the cloud is generated from.
    mask : array-like, optional
        Image array passed to WordCloud as the mask; required when
        `image_color` is True.
    max_words : int
        Passed to WordCloud as `max_words`.
    max_font_size : int
        Passed to WordCloud as `max_font_size`.
    figure_size : tuple of float
        Figure size passed to `plt.figure`.
    title : str, optional
        Figure title, drawn in red.
    title_size : int
        Font size of the title.
    image_color : bool
        If True, recolour the words using colours taken from `mask`.

    Raises
    ------
    ValueError
        If `image_color` is True but no `mask` is given.
    """
    # Fail early with a clear message; ImageColorGenerator(None) otherwise fails
    # deep inside the wordcloud package after the cloud has already been built.
    if image_color and mask is None:
        raise ValueError("image_color=True requires a mask image to take colours from")

    stopwords = set(STOPWORDS)
    more_stopwords = {'re','ha','nWe','Name','bring','Are','We','one', 'br', 'Unknown','Yes','nYou'}
    stopwords = stopwords.union(more_stopwords)

    wordcloud = WordCloud(background_color='white',
                          stopwords=stopwords,
                          max_words=max_words,
                          max_font_size=max_font_size,
                          random_state=42,  # fixed seed so the layout is reproducible
                          contour_width=2,
                          contour_color='steelblue',
                          mask=mask)
    wordcloud.generate(text)

    plt.figure(figsize=figure_size)
    if image_color:
        image_colors = ImageColorGenerator(mask)
        plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
    else:
        plt.imshow(wordcloud)
    # The title is identical for both branches, so it is set once here.
    plt.title(title, fontdict={'size': title_size, 'color': 'red', 'verticalalignment': 'bottom'})
    plt.axis('off')
    plt.tight_layout()
    
d = '../input/masks12/'

# Join every description into one string. str(Series) only returns the truncated
# console repr (a few shortened rows plus the "Name: ..., dtype: ..." footer), so
# the cloud was built from a tiny, distorted sample of the data.
comments_text = " ".join(data_df['Clean Job Description'].astype(str)).lower()

# Load the mask inside a context manager so the image file handle is closed.
with Image.open(d + 'mask_mask.jfif') as mask_image:
    comments_mask = np.array(mask_image)

plot_wordcloud(comments_text, comments_mask, max_words=800, max_font_size=100, 
               title = 'Most common words in Job Description', title_size=50, image_color=True)

# **For categorical variables - Use:
# sns.catplot(x="education", y="avg_training_score",data=df2);
# sns.catplot(x="education",y="avg_training_score",jitter=False,data=df2);
# sns.catplot(x="education",y="avg_training_score",hue="gender",data=df2);**

https://www.kaggle.com/raenish/cheatsheet-50-plotly-charts