# Summary of findings: what it takes to get good pay as a data analyst
- Jobs that require Python pay better than jobs that require R.
- Job descriptions that require Excel and those that require SQL are roughly even in terms of average salary.
- Comparing the two most commonly required visualization skills for data analyst positions, jobs that require Tableau pay about 10k more than jobs that require Power BI.
- Companies with a 2.2 rating pay the highest average salary, at around 95.7k.
- Drug & Health Stores pays the most out of all industries, followed by Education Training Services and Health Care Products Manufacturing.
- Among sectors, Biotech & Pharmaceuticals pays the most, followed by Real Estate.
- Companies with one to five million in revenue pay the most.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the job listings CSV from the Kaggle input directory and preview
# the first rows.
df = pd.read_csv('/kaggle/input/data-analyst-jobs/DataAnalyst.csv')
df.head()

## Data Cleaning

In [None]:
# Remove the 'Unnamed: 0' column (presumably a leftover copy of the row
# index written by a previous CSV export - confirm against the raw file).
df.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
# Parse salary
# Drop rows whose estimate is the '-1' placeholder. The .copy() gives an
# independent frame, so the column assignments below do not write into a
# slice of the original frame (avoids pandas' SettingWithCopyWarning).
df = df[df['Salary Estimate'] != '-1'].copy()

# Keep only the text before the first '(' and remove the 'K' and '$'
# markers, leaving a "<min>-<max>" string.
salary = df['Salary Estimate'].str.split('(').str[0]
minus_Kd = (
    salary
    .str.replace('K', '', regex=False)
    .str.replace('$', '', regex=False)
)

# Split the range on '-' and convert both ends to integers; surrounding
# whitespace is stripped before the conversion.
salary_bounds = minus_Kd.str.split('-')
df['min_salary'] = salary_bounds.str[0].str.strip().astype(int)
df['max_salary'] = salary_bounds.str[1].str.strip().astype(int)

# Midpoint of the advertised range.
df['avg_salary'] = (df['min_salary'] + df['max_salary']) / 2

In [None]:
# Parse company text
# When Rating is negative the company name is kept as-is; otherwise the last
# three characters are dropped and the result is stripped (presumably the
# rating is appended to the name in the raw data - confirm against the CSV).
# Vectorized string ops are used so that a missing company name stays NaN
# instead of raising inside a row-wise lambda.
_name_without_rating = df['Company Name'].str[:-3].str.strip()
df['company_txt'] = df['Company Name'].where(df['Rating'] < 0, _name_without_rating)

In [None]:
# Parse location to see which state each job is in.
# The state is taken as the text after the last comma. Splitting on ','
# leaves the space that follows the comma, so the value is stripped to
# avoid labels such as ' NY' with a leading blank.
df['job_state'] = df['Location'].str.split(',').str[-1].str.strip()
df['job_state'].value_counts()

In [None]:
# Company age relative to 2020. Founded values below 1 are passed through
# unchanged (presumably a missing-value sentinel - confirm against the data).
_founded = df['Founded']
df['age'] = _founded.where(_founded < 1, 2020 - _founded)

In [None]:
# Parsing job description for some of the most required skills for data analysts.
# Each flag is 1 when the lower-cased description matches the pattern, else 0.
# 'excel' is matched as a whole word: a plain substring test would also fire
# on "excellent" / "excellence" and overcount Excel requirements.
_desc_lower = df['Job Description'].str.lower()
_skill_patterns = {
    'sql_yn': r'sql',
    'python_yn': r'python',
    'R_yn': r'r studio|r-studio',
    'tableau_yn': r'tableau',
    'powerbi_yn': r'power bi|powerbi',
    'excel_yn': r'\bexcel\b',
}
for _flag_col, _pattern in _skill_patterns.items():
    # na=False treats a missing description as "skill not mentioned".
    df[_flag_col] = _desc_lower.str.contains(_pattern, regex=True, na=False).astype(int)

In [None]:
# Number of characters in each job description.
df['desc_len'] = df['Job Description'].apply(len)
df['desc_len']

In [None]:
# Show the column labels, including the columns derived in the cells above.
df.columns

In [None]:
# Coerce the rating, founding-year and salary columns to numeric dtypes,
# then print every column's dtype to check the result.
numeric_columns = ['Rating', 'Founded', 'min_salary', 'max_salary', 'avg_salary']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)
print(df.dtypes)

## Data Analysis

In [None]:
## Analyzing Continuous Variables

In [None]:
# Summary statistics for the frame.
df.describe()

In [None]:
# Distribution of company ratings (observed: left-skewed).
df['Rating'].hist()

In [None]:
# Distribution of minimum salary (observed: right-skewed).
df['min_salary'].hist()

In [None]:
# Distribution of maximum salary (observed: right-skewed).
df['max_salary'].hist()

In [None]:
# Distribution of average salary (observed: right-skewed).
df['avg_salary'].hist()

In [None]:
# Distribution of company age (observed: right-skewed).
df['age'].hist()

In [None]:
# Histogram of job description length - right skewed.
# (The note was previously written as a markdown bullet ("- ...") inside the
# code cell, which is a Python syntax error; it is now a real comment.)
df.desc_len.hist()

In [None]:
# Side-by-side box plots: age shows many outliers, and average salary
# shows quite a few as well.
boxplot_columns = ['age', 'avg_salary', 'Rating']
df.boxplot(column=boxplot_columns)

In [None]:
# Rating on its own scale: the outliers sit below 2.
df.boxplot(column='Rating')

In [None]:
# Description length also shows a lot of outliers.
df.boxplot(column='desc_len')

Let's remove the outliers and look at the distributions again.

In [None]:
# age
# Keep the rows BELOW the cutoff so the high-age outliers are removed.
# (The previous `>= 90` filter kept only the outliers.)
df_age_no_outlier = df[df.age < 90]
df_age_no_outlier.age.hist()

In [None]:
# average salary
# Keep the rows BELOW the cutoff so the high-salary outliers are removed.
# (The previous `>= 110` filter kept only the outliers.)
df_avg_salary_no_outlier = df[df.avg_salary < 110]
df_avg_salary_no_outlier.avg_salary.hist()

In [None]:
# Rating
# The box plot put the rating outliers at 2 and below, so keep the rows
# ABOVE the cutoff. (The previous `<= 2` filter kept only the outliers.)
df_rating_no_outlier = df[df.Rating > 2]
df_rating_no_outlier.Rating.hist()

In [None]:
# Description length
# Keep the rows BELOW the cutoff so the very long descriptions are removed.
# (The previous `>= 7500` filter kept only the outliers.)
df_desc_length_no_outlier = df[df.desc_len < 7500]
df_desc_length_no_outlier.desc_len.hist()

In [None]:
# Pairwise correlations between the numeric features; the author observed
# no positive correlation between any of them.
corr_columns = ['age', 'avg_salary', 'Rating', 'desc_len']
df[corr_columns].corr()

In [None]:
# Draw the same correlation matrix as a heatmap so it is easier to read.
corr_matrix = df[['age', 'avg_salary', 'Rating', 'desc_len']].corr()
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(
    corr_matrix,
    vmax=.3,
    center=0,
    cmap=cmap,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)

### Analyzing categorical variables

In [None]:
# List the available columns before selecting the categorical ones.
df.columns

In [None]:
# Subset of the frame holding the categorical columns and the 0/1 skill flags.
categorical_columns = [
    'Location',
    'Headquarters',
    'Size',
    'Type of ownership',
    'Industry',
    'Sector',
    'Revenue',
    'company_txt',
    'job_state',
    'Easy Apply',
    'sql_yn',
    'python_yn',
    'R_yn',
    'tableau_yn',
    'powerbi_yn',
    'excel_yn',
]
df_cat = df[categorical_columns]

In [None]:
# For every categorical column: print how many distinct values it has and
# draw a bar chart of the value counts with vertical tick labels.
for col_name in df_cat.columns:
    counts = df_cat[col_name].value_counts()
    print("graph for %s: total = %d" % (col_name, len(counts)))
    ax = sns.barplot(x=counts.index, y=counts)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    plt.show()

In [None]:
# Same bar charts for the columns with many distinct values, limited to the
# 20 most frequent values of each.
for col_name in ['Location', 'Headquarters', 'Industry', 'company_txt']:
    top_counts = df_cat[col_name].value_counts().iloc[:20]
    print("graph for %s: total = %d" % (col_name, len(top_counts)))
    ax = sns.barplot(x=top_counts.index, y=top_counts)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    plt.show()

### Pivot tables

In [None]:
# Mean average salary by Python / R requirement: jobs that require Python
# pay better than jobs that require R.
df.pivot_table(index=['python_yn', 'R_yn'], values='avg_salary')

In [None]:
# Mean average salary by SQL / Excel requirement: the two skills come out
# roughly even.
df.pivot_table(index=['sql_yn', 'excel_yn'], values='avg_salary')

In [None]:
# Mean average salary by Tableau / Power BI requirement, two of the most
# requested visualization skills for data analyst positions.
# Jobs that require Tableau pay about 10k more than jobs that require Power BI.
df.pivot_table(index=['tableau_yn', 'powerbi_yn'], values='avg_salary')

In [None]:
# Columns to compare against average salary, one pivot table per column.
pivot_columns = [
    'Rating',
    'Industry',
    'Sector',
    'Revenue',
    'sql_yn',
    'python_yn',
    'R_yn',
    'tableau_yn',
    'powerbi_yn',
    'excel_yn',
    'avg_salary',
]
df_pivots = df[pivot_columns]

In [None]:
# Insights:
# - Companies with a 2.2 rating pay the highest average salary, at around 95.7k.
# - Drug & Health Stores pays the most out of all industries, followed by
#   Education Training Services and Health Care Products Manufacturing.
# - Biotech & Pharmaceuticals is the best-paying sector, followed by Real Estate.
# - Companies with one to five million in revenue pay the most in this category.
#
# Print, for every column, the mean average salary per distinct value, highest
# first. 'avg_salary' itself is skipped: it is the value being aggregated and
# cannot also be the grouping key. Skipping it explicitly replaces the earlier
# blanket `except ValueError: pass`, which would also have hidden a genuine
# failure on any other column and silently ended the loop.
for column in df_pivots.columns.drop('avg_salary'):
    print(column)
    print(
        pd.pivot_table(df_pivots, index=column, values='avg_salary')
        .sort_values('avg_salary', ascending=False)
    )

In [None]:
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Concatenate every job description into one space-separated string.
words = " ".join(df['Job Description'])

def punctuation_stop(text):
    """Tokenize *text* and drop punctuation and English stop words.

    Args:
        text: The raw string to clean.

    Returns:
        A list of lower-cased tokens, in their original order, keeping only
        tokens that are purely alphabetic and are not English stop words.
    """
    stop_words = set(stopwords.words('english'))
    filtered = []
    for token in word_tokenize(text):
        # Lower-case BEFORE the stop-word test: the test is against
        # `stopwords.words('english')`, so comparing the raw token would let
        # capitalised stop words ("The", "And") through.
        lowered = token.lower()
        if token.isalpha() and lowered not in stop_words:
            filtered.append(lowered)
    return filtered


# Clean the combined descriptions and join the remaining tokens back into
# one string for the word cloud.
words_filtered = punctuation_stop(words)
text = " ".join(words_filtered)

# Build the word cloud; random_state is fixed at 1.
wc = WordCloud(
    background_color="white",
    random_state=1,
    stopwords=STOPWORDS,
    max_words=2000,
    width=800,
    height=1500,
)
wc.generate(text)

# Show the cloud without axes.
plt.figure(figsize=[10, 10])
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

References:
    - Ken Jee: https://github.com/PlayingNumbers/ds_salary_proj/blob/master/data_eda.ipynb
    
    - importdata : https://github.com/importdata/kpop-analysis/blob/master/K_pop_Exploratory_Data_Analysis.ipynb
    
