In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import os

In [None]:
# Load the job postings from the CSV file and preview the first rows
csv_path = '../input/data-engineer-jobs/DataEngineer.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Dimensions of the loaded table as (rows, columns)
df.shape

In [None]:
# Column names available in the loaded table
df.columns

In [None]:
# How often each distinct job title occurs
df['Job Title'].value_counts()

In [None]:
# Bar chart of the 25 most frequent job titles
title_counts = df['Job Title'].value_counts()
top_titles = title_counts.head(25)
sns.barplot(x=top_titles.index, y=top_titles)
plt.title('Top 25 - Job Title')
plt.xticks(rotation=90)
plt.grid()
plt.show()

In [None]:
# How often each distinct salary-estimate string occurs
df['Salary Estimate'].value_counts()

In [None]:
# Bar chart of the 25 most frequent salary-estimate strings
salary_counts = df['Salary Estimate'].value_counts()
top_salaries = salary_counts.head(25)
sns.barplot(x=top_salaries.index, y=top_salaries)
plt.title('Top 25 - Salary Estimate')
plt.xticks(rotation=90)
plt.grid()
plt.show()

### The raw salary-estimate counts above are not very informative. Below, we extract the lower and upper bounds of each range as numeric values.

In [None]:
def aux1(i_string):
    """Extract the lower salary bound (in 1000 USD) from a salary-estimate string.

    Takes the text before the first 'K' and parses whatever follows the '$'
    in it, e.g. '$80K-$150K (Glassdoor est.)' -> 80.

    Parameters
    ----------
    i_string : str
        Raw salary-estimate text.

    Returns
    -------
    number
        The parsed lower bound, or NaN when the value is missing or does not
        contain the expected '$' / 'K' markers (e.g. '-1').
    """
    try:
        return pd.to_numeric((i_string.split('K')[0]).split('$')[1])
    except (AttributeError, IndexError, ValueError):
        # AttributeError: non-string input such as NaN
        # IndexError:     no '$' before the first 'K'
        # ValueError:     text between the markers is not numeric
        return np.nan

def aux2(i_string):
    """Extract the upper salary bound (in 1000 USD) from a salary-estimate string.

    Takes the text between the first and second 'K' and parses whatever
    follows the '$' in it, e.g. '$80K-$150K (Glassdoor est.)' -> 150.

    Parameters
    ----------
    i_string : str
        Raw salary-estimate text.

    Returns
    -------
    number
        The parsed upper bound, or NaN when the value is missing or does not
        contain the expected '$' / 'K' markers (e.g. '-1').
    """
    try:
        return pd.to_numeric((i_string.split('K')[1]).split('$')[1])
    except (AttributeError, IndexError, ValueError):
        # AttributeError: non-string input such as NaN
        # IndexError:     no 'K' in the text, or no '$' in the second segment
        # ValueError:     text between the markers is not numeric
        return np.nan

In [None]:
# Derive numeric lower/upper salary bounds from the raw salary text column
salary_text = df['Salary Estimate']
df['Salary_LoB'] = [aux1(entry) for entry in salary_text]
df['Salary_UpB'] = [aux2(entry) for entry in salary_text]

In [None]:
# Mid point of the salary range: average of lower and upper bound
bounds_sum = df['Salary_LoB'].add(df['Salary_UpB'])
df['Salary_Mid'] = bounds_sum.div(2)

In [None]:
# Distribution of the lower salary bound
ax = df['Salary_LoB'].hist(bins=25)
ax.set_title('Salary Lower Bound (in 1000 USD)')
plt.show()

In [None]:
# Summary statistics of the lower salary bound
df.Salary_LoB.describe()

In [None]:
# Distribution of the upper salary bound
ax = df['Salary_UpB'].hist(bins=25)
ax.set_title('Salary Upper Bound (in 1000 USD)')
plt.show()

In [None]:
# Summary statistics of the upper salary bound
df.Salary_UpB.describe()

In [None]:
# Distribution of the mid point of the salary range
ax = df['Salary_Mid'].hist(bins=25)
ax.set_title('Salary Mid Point of range (in 1000 USD)')
plt.show()

In [None]:
# Summary statistics of the salary mid point
df.Salary_Mid.describe()

In [None]:
# Histogram of the Rating column
ax = df['Rating'].plot.hist()
ax.set_title('Rating')
ax.grid()
plt.show()

In [None]:
# How often each distinct raw company-name string occurs
df['Company Name'].value_counts()

In [None]:
# utility function for text cleaning
def chop_name(i_string):
    """Return the part of ``i_string`` before its first newline.

    Parameters
    ----------
    i_string : str
        Raw text; anything from the first newline onwards is discarded.

    Returns
    -------
    str
        The first line of ``i_string``. Non-string values (e.g. NaN for a
        missing entry) are returned unchanged instead of raising
        AttributeError, so missing entries stay missing.
    """
    if not isinstance(i_string, str):
        return i_string
    return i_string.split('\n')[0]

In [None]:
# Bar chart of the 25 most frequent company names (labels shortened via chop_name)
company_counts = df['Company Name'].value_counts()
top_companies = company_counts.head(25)
short_labels = [chop_name(label) for label in top_companies.index]
sns.barplot(x=short_labels, y=top_companies)
plt.title('Top 25 - Company Name')
plt.xticks(rotation=90)
plt.grid()
plt.show()

In [None]:
# add the cleaned company name as an additional column
df['Company'] = [chop_name(raw_name) for raw_name in df['Company Name']]

In [None]:
# How often each distinct headquarters value occurs
df['Headquarters'].value_counts()

In [None]:
# Bar chart of the 25 most frequent headquarters values
hq_counts = df['Headquarters'].value_counts()
top_hq = hq_counts.head(25)
sns.barplot(x=top_hq.index, y=top_hq)
plt.title('Top 25 - Headquarters')
plt.xticks(rotation=90)
plt.grid()
plt.show()

In [None]:
# Size: bar chart of the size categories
df['Size'] = df['Size'].replace("-1","Unknown") # merge "-1" into "Unknown"
df['Size'].value_counts().plot(kind='bar')
# Title added so the figure is self-explanatory, like the other charts
plt.title('Size')
plt.grid()
plt.show()

In [None]:
# Histogram of the Founded column as-is (the -1 entries are filtered out in the next plot)
ax = df['Founded'].plot.hist()
ax.set_title('Founded')
ax.grid()
plt.show()

In [None]:
# Founded values with the missing-value marker (-1) filtered out;
# `temp` is kept under this name because the following summary cell reads it
temp = df.loc[df['Founded'] > -1, 'Founded']
plt.hist(temp, bins=50)
plt.title('Founded, excluding missing values')
plt.grid()
plt.show()

In [None]:
# Founded summary: statistics of `temp`, i.e. the Founded values greater than -1
# selected in the preceding cell
temp.describe()

In [None]:
# Type of ownership: bar chart of the ownership categories
df['Type of ownership'] = df['Type of ownership'].replace("-1","Unknown") # merge "-1" into "Unknown"
df['Type of ownership'].value_counts().plot(kind='bar')
# Title added so the figure is self-explanatory, like the other charts
plt.title('Type of ownership')
plt.grid()
plt.show()

In [None]:
# How often each distinct industry occurs
df['Industry'].value_counts()

In [None]:
# Bar chart of the 25 most frequent industries
industry_counts = df['Industry'].value_counts()
top_industries = industry_counts.head(25)
sns.barplot(x=top_industries.index, y=top_industries)
plt.title('Top 25 - Industry')
plt.xticks(rotation=90)
plt.grid()
plt.show()

In [None]:
# Bar chart of the number of rows per sector
sector_counts = df['Sector'].value_counts()
ax = sector_counts.plot.bar()
ax.set_title('Sector')
ax.grid()
plt.show()

In [None]:
# Revenue: merge the "-1" placeholder into "Unknown / Non-Applicable", then plot the counts
df['Revenue'] = df['Revenue'].replace({"-1": "Unknown / Non-Applicable"})
revenue_counts = df['Revenue'].value_counts()
ax = revenue_counts.plot.bar()
ax.set_title('Revenue')
ax.grid()
plt.show()

In [None]:
# Bar chart of the Easy Apply value counts
easy_apply_counts = df['Easy Apply'].value_counts()
ax = easy_apply_counts.plot.bar()
ax.set_title('Easy Apply')
ax.grid()
plt.show()

In [None]:
# means by company
# numeric_only=True restricts the aggregation to numeric columns; without it,
# pandas >= 2.0 raises a TypeError on the text columns instead of dropping them.
df_means = df.groupby('Company').mean(numeric_only=True)
df_means.head()

> #### A few examples:

In [None]:
# Per-company means for one selected company
sel_company = 'Amazon'
df_means.loc[df_means.index == sel_company]

In [None]:
# Salary mid-point distribution for the currently selected company
company_rows = df.loc[df['Company'] == sel_company]
ax = company_rows['Salary_Mid'].hist()
ax.set_title(sel_company)
plt.show()

In [None]:
# Per-company means for one selected company
sel_company = 'Apple'
df_means.loc[df_means.index == sel_company]

In [None]:
# Salary mid-point distribution for the currently selected company
company_rows = df.loc[df['Company'] == sel_company]
ax = company_rows['Salary_Mid'].hist()
ax.set_title(sel_company)
plt.show()

In [None]:
# Per-company means for one selected company
sel_company = 'Lockheed Martin'
df_means.loc[df_means.index == sel_company]

In [None]:
# Salary mid-point distribution for the currently selected company
company_rows = df.loc[df['Company'] == sel_company]
ax = company_rows['Salary_Mid'].hist()
ax.set_title(sel_company)
plt.show()

In [None]:
# Per-company means for one selected company
sel_company = 'Google'
df_means.loc[df_means.index == sel_company]

In [None]:
# Salary mid-point distribution for the currently selected company
company_rows = df.loc[df['Company'] == sel_company]
ax = company_rows['Salary_Mid'].hist()
ax.set_title(sel_company)
plt.show()

# Job Descriptions

In [None]:
# Stop words to exclude from the word clouds
stopwords = set(STOPWORDS)
# Concatenate all job descriptions into one text.
# Missing descriptions are dropped first: str.join raises TypeError on NaN entries.
text = " ".join(df['Job Description'].dropna().astype(str))

In [None]:
# Render a word cloud from the combined job-description text
cloud_options = dict(
    stopwords=stopwords,
    max_font_size=50,
    max_words=500,
    width=600,
    height=400,
    background_color="white",
)
cloud = WordCloud(**cloud_options).generate(text)

plt.figure(figsize=(12, 8))
plt.imshow(cloud, interpolation="bilinear")
plt.axis("off")
plt.show()

### Company-specific word clouds

In [None]:
# Word cloud built from the job descriptions of a single company
sel_company = 'Apple'
df_temp = df[df.Company==sel_company]
# Drop missing descriptions first: str.join raises TypeError on NaN entries
text = " ".join(df_temp['Job Description'].dropna().astype(str))

wordcloud = WordCloud(stopwords=stopwords, max_font_size=50, max_words=500,
                      width = 600, height = 400,
                      background_color="white").generate(text)
plt.figure(figsize=(12,8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Word cloud built from the job descriptions of a single company
sel_company = 'Amazon'
df_temp = df[df.Company==sel_company]
# Drop missing descriptions first: str.join raises TypeError on NaN entries
text = " ".join(df_temp['Job Description'].dropna().astype(str))

wordcloud = WordCloud(stopwords=stopwords, max_font_size=50, max_words=500,
                      width = 600, height = 400,
                      background_color="white").generate(text)
plt.figure(figsize=(12,8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Word cloud built from the job descriptions of a single company
sel_company = 'Google'
df_temp = df[df.Company==sel_company]
# Drop missing descriptions first: str.join raises TypeError on NaN entries
text = " ".join(df_temp['Job Description'].dropna().astype(str))

wordcloud = WordCloud(stopwords=stopwords, max_font_size=50, max_words=500,
                      width = 600, height = 400,
                      background_color="white").generate(text)
plt.figure(figsize=(12,8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

### Sector-specific word clouds

In [None]:
# Word cloud built from the job descriptions of a single sector
sel_sector = 'Finance'
df_temp = df[df.Sector==sel_sector]
# Drop missing descriptions first: str.join raises TypeError on NaN entries
text = " ".join(df_temp['Job Description'].dropna().astype(str))

wordcloud = WordCloud(stopwords=stopwords, max_font_size=50, max_words=500,
                      width = 600, height = 400,
                      background_color="white").generate(text)
plt.figure(figsize=(12,8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Word cloud built from the job descriptions of a single sector
sel_sector = 'Manufacturing'
df_temp = df[df.Sector==sel_sector]
# Drop missing descriptions first: str.join raises TypeError on NaN entries
text = " ".join(df_temp['Job Description'].dropna().astype(str))

wordcloud = WordCloud(stopwords=stopwords, max_font_size=50, max_words=500,
                      width = 600, height = 400,
                      background_color="white").generate(text)
plt.figure(figsize=(12,8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

### If you are interested, also have a look at my notebook on Data Scientist jobs:
https://www.kaggle.com/docxian/data-scientist-jobs