In [None]:
pip install sidetable

In [None]:
import pandas as pd
import sidetable

In [None]:
# Load the job-postings CSV into `df`. The path is relative, so the notebook
# must be run from a directory where ../input/... resolves.
df = pd.read_csv('../input/data-science-job-posting-on-glassdoor/Cleaned_DS_Jobs.csv')

In [None]:
df.head()

In [None]:
df.stb.missing()

In [None]:
df.columns

In [None]:
df['Job Title'].unique()

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from wordcloud import ImageColorGenerator


In [None]:
# Word cloud of all job titles: concatenate every title into one string and
# let WordCloud count the words, ignoring its built-in stop-word list.
Stopwords = set(STOPWORDS)
text = ' '.join(df['Job Title'])
wordcloud = WordCloud(
    stopwords=Stopwords,
    background_color='snow',
    margin=0,
    height=800,
    width=1400,
)
# generate() fills the cloud in place and returns the same object.
wordcloud = wordcloud.generate(text)


In [None]:
# Render the job-title word cloud as an image with no axes or padding.
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
# NOTE(review): purpose of this very large y-margin is unclear; the later
# job-description word-cloud cell omits it. Confirm it is needed.
plt.margins(y=1000)
plt.tight_layout(pad = 0)
plt.show()

In [None]:
df.columns

In [None]:
df['Salary Estimate'].unique()

In [None]:
df['Salary Estimate'].str.strip().unique()

In [None]:
df['Revenue'].unique()

In [None]:
df.stb.freq(['Revenue'])

In [None]:
import plotly.graph_objects as go

In [None]:
# Normalise the Revenue labels: drop the trailing " (USD)" currency tag and
# collapse every "no data" marker into a single "Not Available" bucket.
#
# The suffix is removed only when it is actually present, so re-running this
# cell is harmless (a blind `x[:-6]` slice would chop six more characters off
# every label on each run).
REVENUE_SUFFIX = " (USD)"
# "Unknown / Non-Appl" and "" are what the old blind slice produced from
# "Unknown / Non-Applicable" and "-1"; they are kept so frames already
# truncated by a previous run are still recognised.
REVENUE_MISSING = {"Unknown / Non-Applicable", "Unknown / Non-Appl", "-1", ""}


def _clean_revenue(value):
    """Return `value` without the " (USD)" suffix, or "Not Available" for missing markers."""
    if value.endswith(REVENUE_SUFFIX):
        value = value[:-len(REVENUE_SUFFIX)]
    return "Not Available" if value in REVENUE_MISSING else value


df['Revenue'] = df['Revenue'].apply(_clean_revenue)


In [None]:
df.stb.freq(['Revenue'])

In [None]:
# Bar chart of how many postings fall in each revenue bracket.
# Labels and heights both come from the same value_counts() result so they
# stay aligned (unique() is in first-appearance order, value_counts() is in
# descending-count order; pairing the two mislabels the bars).
revenue_counts = df['Revenue'].value_counts()
fig = go.Figure([
    go.Bar(x=revenue_counts.index.tolist(), y=revenue_counts.values)
])
fig.update_layout(title="Number of Companies by revenue")
fig.update_traces(marker_color='green', marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5, opacity=0.6)
fig.show()

In [None]:
df['Size'].unique()

In [None]:
df.stb.freq(['Size'])

In [None]:
# Relabel the "-1" placeholder in Size as "Unknown"; every other value is kept.
df['Size'] = df['Size'].mask(df['Size'] == "-1", "Unknown")

In [None]:
import re
def size_filter(size):
    """Compact a company-size label into a short range string.

    Every lowercase word preceded by whitespace is removed, then each remaining
    whitespace character becomes a hyphen, e.g.
    "51 to 200 employees" -> "51-200" and "10000+ employees" -> "10000+".
    Labels without whitespace (such as "Unknown") are returned unchanged.

    Args:
        size: The raw size label (str).

    Returns:
        The compacted label (str).
    """
    # Raw strings: "\s" in a normal literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    without_words = re.sub(r"\s+[a-z]+", "", size)
    return re.sub(r"\s", "-", without_words)

In [None]:
df['Size'] = df['Size'].apply(size_filter)

In [None]:
df['Size'].unique()

In [None]:
# Pie chart of company sizes. Labels and values are taken from the same
# value_counts() result so each slice carries its own label (unique() and
# value_counts() are ordered differently and must not be paired).
size_counts = df['Size'].value_counts()
fig = go.Figure(data=[go.Pie(labels=size_counts.index.tolist(), values=size_counts.values,
                             textinfo='label+percent', hole=.2)])
fig.update_layout(title="Number of Companies Sizes")
fig.show()

In [None]:
def salary_categorizer(salary):
    """Bucket a "low-high" salary range by its upper bound.

    The text after the first "-" is parsed as an integer and mapped to a bin.
    Lower bounds are inclusive and upper bounds exclusive, so boundary values
    land in the next bin up (100 -> "100-150", 150 -> "150-200",
    200 -> "200-250") instead of falling through to ">250".

    Args:
        salary: A string such as "137-171".

    Returns:
        One of "<100", "100-150", "150-200", "200-250", ">250".
        Values of 250 and above all map to ">250".

    Raises:
        IndexError: if `salary` contains no "-".
        ValueError: if the part after the first "-" is not an integer.
    """
    # Parse once instead of re-splitting the string in every branch.
    upper = int(salary.split('-', 1)[1])
    if upper < 100:
        return "<100"
    if upper < 150:
        return "100-150"
    if upper < 200:
        return "150-200"
    if upper < 250:
        return "200-250"
    return ">250"

In [None]:
df['Salary Estimate'] = df['Salary Estimate'].apply(salary_categorizer)

In [None]:
df['Salary Estimate'].unique()

In [None]:
import plotly.express as px
# Bar chart of postings per salary bin. The bins are ordinal, so their order on
# the x-axis is pinned explicitly instead of being left in first-appearance order.
fig = px.bar(df, x='Salary Estimate',
    labels={'Salary Estimate': "Salary Range in Thousands"}, title="Salary Ranges",
    category_orders={'Salary Estimate': ["<100", "100-150", "150-200", "200-250", ">250"]})
fig.show()

In [None]:
df.head()

In [None]:
df['Type of ownership'].unique()

In [None]:
# Relabel the "-1" placeholder in Type of ownership as "Unknown".
df['Type of ownership'] = df['Type of ownership'].mask(df['Type of ownership'] == "-1", "Unknown")

In [None]:
df['Industry'].unique()

In [None]:
# Relabel the "-1" placeholder in Industry as "Unknown".
df['Industry'] = df['Industry'].mask(df['Industry'] == "-1", "Unknown")

In [None]:
top_industries = df.stb.freq(['Industry']).reset_index()

In [None]:
top_industries[:10]

In [None]:
# Bubble chart of the first ten rows of the industry frequency table:
# x = industry, y = its "count" column, bubble size = its "percent" column.
# NOTE(review): this relies on the frequency table being sorted by count so
# that the first ten rows are the ten largest industries — confirm.
fig = px.scatter(top_industries[:10], x="Industry", y="count",
    size="percent", title="Jobs by industry")
fig.show()

In [None]:
df.head()

In [None]:
# Word cloud of all job descriptions: concatenate every description into one
# string and let WordCloud count the words, ignoring its built-in stop words.
Stopwords = set(STOPWORDS)
text = ' '.join(df['Job Description'])
wordcloud = WordCloud(
    stopwords=Stopwords,
    background_color='red',
    margin=0,
    height=800,
    width=1600,
)
# generate() fills the cloud in place and returns the same object.
wordcloud = wordcloud.generate(text)

In [None]:
# Render the current `wordcloud` object as an image with no axes or padding.
plt.figure(figsize = (10, 10), facecolor = None)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")

plt.tight_layout(pad = 0)
plt.show()

In [None]:
df['Location'].unique()

In [None]:
# Derive the two-letter state code from Location.
# A location only carries a state when it ends in ", XX" (e.g. "Chantilly, VA");
# taking the last two characters of anything else would create junk codes, so
# rows without that pattern are labelled "Unknown", matching the placeholder
# used for the other cleaned columns.
df['State'] = (
    df['Location']
    .str.extract(r',\s*([A-Z]{2})\s*$', expand=False)
    .fillna("Unknown")
)

In [None]:
df['State'].unique()

In [None]:
x = "Chantilly, VA "
x.strip()[-2:]

In [None]:
import plotly.figure_factory as ff

In [None]:
# Map of job postings per US state.
# The geo settings below need a figure that has a geo subplot; at this point
# `fig` is still the industry scatter, so the choropleth is built here first.
state_counts = (
    df['State']
    .value_counts()
    .rename_axis('State')
    .reset_index(name='count')
)
fig = px.choropleth(
    state_counts,
    locations='State',
    locationmode='USA-states',  # `State` holds two-letter state abbreviations
    color='count',
    scope='usa',
    title='Jobs by state',
)
fig.update_geos(
    visible=False, resolution=50, scope="usa",
    showcountries=True, countrycolor="Black",
    showsubunits=True, subunitcolor="Blue"
)
fig.show()

In [None]:
df.head()

In [None]:
x = "New York, NY	"
x.strip().split(",")[0]

In [None]:
# City = everything before the first comma of the whitespace-trimmed Location.
df['City'] = df['Location'].map(lambda location: location.strip().partition(",")[0])

In [None]:
df['City']

In [None]:
# Top-10 states (bars) and top-10 cities (line), both plotted by rank.
# The hover/outside text comes from the index of each value_counts() result so
# every point is labelled with the state/city it actually represents (the raw
# column values would label them with the first ten rows of the frame instead).
top_states = df['job_state'].value_counts().head(10)
top_cities = df['City'].value_counts().head(10)

f = go.FigureWidget()

f.layout.title.text = "Jobs By Top 10 Cities"

f.add_bar(y=top_states.values, name="State",
          text=top_states.index.tolist(), textposition="outside")
f.add_scatter(y=top_cities.values, name="Cities",
              text=top_cities.index.tolist(), x0=0)

f

In [None]:
df.columns

In [None]:
# Column sum for each skill column, keyed by skill name (insertion order is
# the order listed here).
skills_summary = {
    skill: df[skill].sum()
    for skill in ("python", "tableau", "excel", "aws", "spark", "big_data", "hadoop")
}

In [None]:
# Bar chart of the per-skill totals in `skills_summary`.
# A bar trace is used because each skill has a single total; a box trace with
# one observation per category collapses to a flat line.
fig = go.FigureWidget()

fig.layout.title.text = "Popular Skills"

fig.add_bar(x=list(skills_summary.keys()), y=list(skills_summary.values()))


fig

In [None]:
df['job_simp'].unique()

In [None]:
df.groupby('State')["Rating"].mean().reset_index()

In [None]:
fig = px.histogram(df, x="Rating")
fig

In [None]:
df.stb.freq(["Rating"])

In [None]:
# Distribution plot of the Rating column using plotly's figure factory.
import plotly.figure_factory as ff
import numpy as np
# np.random.seed(1)

# create_distplot takes a list of datasets plus one label and colour per dataset.
# NOTE(review): other columns in this notebook use -1 as a "missing" marker; if
# Rating does too, those rows will skew the distribution — confirm.
hist_data = [df['Rating']]
group_labels = ['Rating'] # name of the dataset
colors = ['#A56CC1']

fig = ff.create_distplot(hist_data, group_labels, colors=colors)
fig.update_layout(title_text='Rating Distribution')
fig.show()

In [None]:
# Pairwise correlations of the numeric columns only. The frame also holds text
# columns, and DataFrame.corr() raises on those in pandas >= 2.0.
df.select_dtypes(include='number').corr()

In [None]:
import seaborn as sns

In [None]:
# Annotated heatmap of the correlations between numeric columns.
# Text columns are excluded first: DataFrame.corr() raises on them in pandas >= 2.0.
fig, ax = plt.subplots(figsize=(10, 10))
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, ax=ax, annot=True, linewidths=0.05, fmt='.2f', cmap='twilight');