In [None]:
# import modules
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from pyecharts import options as opts
from pyecharts.charts import Bar, Line, Pie, Grid, WordCloud
import json

In [None]:
# import data
# Each page file holds one JSON response; the job postings are read from
# zpData -> jobList. Nested values are flattened so every posting becomes a
# flat dict suitable for a DataFrame row:
#   * list values -> items converted with str() and concatenated, each followed by "/"
#   * dict values -> JSON-encoded string
#   * anything else is kept as is
all_data = []
for page in range(1, 11):
    with open('page%d.json' % page, "r", encoding='utf-8') as file:
        payload = json.load(file)
    job_list = payload.get("zpData").get("jobList")
    # A distinct loop name is used here so the page index is not shadowed.
    for job in job_list:
        record = dict()
        for key, value in job.items():
            if isinstance(value, list):
                record[key] = "".join(str(item) + "/" for item in value)
            elif isinstance(value, dict):
                record[key] = json.dumps(value)
            else:
                record[key] = value
        all_data.append(record)

In [None]:
# convert to DataFrame
df = pd.DataFrame(all_data)

In [None]:
# check data; 300 entries, 48 columns, some columns have missing values
df.info() 

In [None]:
# show all columns
pd.set_option('display.max_columns', None)

In [None]:
df.head()

##### Exploratory data analysis: variables used
+ securityId
+ skills
+ jobExperience
+ jobDegree
+ brandStageName
+ brandIndustry
+ salaryDesc

##### skills, wordcloud

In [None]:
# check variable skills
df['skills']

In [None]:
# import module
import jieba
from collections import Counter

In [None]:
# convert skills to a string
skills_str = '/'.join(df['skills'].dropna())

In [None]:
# cut skills string
# jieba is a Chinese text segmentation module
s = list(jieba.cut(skills_str))

In [None]:
# create a Counter object to count the frequency of each skill
counter = Counter(s)

In [None]:
# remove stopwords
def remove_stopwords(stopwordspath, counter):
    """Drop every stopword listed in a file from a token counter.

    Parameters
    ----------
    stopwordspath : str
        Path to a UTF-8 text file with one stopword per line; surrounding
        whitespace on each line is stripped before matching.
    counter : collections.Counter or dict
        Token -> count mapping. It is modified in place.

    Returns
    -------
    The same ``counter`` object, with the stopword keys removed.
    """
    with open(stopwordspath, 'r', encoding='utf-8') as stopword_file:
        stopwords = {line.strip() for line in stopword_file}
    for word in stopwords:
        # pop with a default is a no-op for words that are not in the counter
        counter.pop(word, None)
    return counter

In [None]:
# remove stopwords from different stopword files
# The lists are applied one after another; each call strips its words from the
# same counter.
for stopword_file in (
    'baidu_stopwords.txt',
    'cn_stopwords.txt',
    'hit_stopwords.txt',
    'scu_stopwords.txt',
):
    counter = remove_stopwords(stopword_file, counter)

In [None]:
wc_data = list(counter.items())

In [None]:
# build the word cloud from the (token, count) pairs and render it inline
wc = (
    WordCloud()
    .add("", data_pair=wc_data)
    .set_global_opts(title_opts=opts.TitleOpts(title="Skills Word Cloud"))
)
wc.render_notebook()

- The word cloud shows that work experience is very important. A related background such as computer science, mathematics, or statistics is expected, and tools and skills such as Python, SQL, R, and data mining are also commonly required.

##### jobExperience

In [None]:
df['jobExperience']

In [None]:
df1 = df.groupby('jobExperience').agg({'securityId': 'count'})

In [None]:
df1

In [None]:
# sort by hand: groupby returns the labels in sorted order, so the rows are
# re-selected in an explicit order (from "no requirement" up to "10+ years")
experience_order = ["经验不限", "在校/应届", "1年以内", "1-3年", "3-5年", "5-10年", "10年以上"]
df2 = df1.loc[experience_order]

In [None]:
# create a new column 'total' to calculate the cumulative sum of 'securityId'
df2['total']= df2['securityId'].cumsum()

In [None]:
# calculate the rate of each job experience
df2['rate'] = round(df2['total']/df2['securityId'].sum(), 2)

In [None]:
df2['rate2'] = round(df2['securityId']/df2['securityId'].sum(), 2)

In [None]:
df2

In [None]:
# donut chart of the share of postings per experience requirement
pie = (
    Pie()
    .add(
        series_name="",
        data_pair=[[label, share] for label, share in zip(df2.index, df2['rate2'])],
        radius=["30%", "75%"],
        center=["50%", "50%"],
        label_opts=opts.LabelOpts(formatter="{b}: {d}%"),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="Job Experience Distribution", pos_left="left", pos_top="0.5%"
        ),
        legend_opts=opts.LegendOpts(pos_left="right", orient="vertical"),
    )
)
pie.render_notebook()

- According to the data, candidates with 3 to 5 years of experience meet the experience requirement of about 90% of the job postings.

##### jobDegree

In [None]:
df['jobDegree']

In [None]:
df3 = df.groupby('jobDegree').agg({'securityId': 'count'})

In [None]:
# sort by hand: groupby returns the labels in sorted order, so the rows are
# re-selected in an explicit, hand-picked order of degree levels
degree_order = ['学历不限', '中专/中技', '高中', '大专', '本科', '硕士']
df4 = df3.loc[degree_order]

In [None]:
# create a new column 'total' to calculate sum of 'securityId'
df4['total'] = df4['securityId'].sum()

In [None]:
# create a new column 'rate' to calculate the rate of each job degree
df4['rate'] = round(df4['securityId']/df4['total'], 2)

In [None]:
df4

In [None]:
# pie chart of the share of postings per required degree
pie = (
    Pie()
    .add(
        series_name="",
        data_pair=[[label, share] for label, share in zip(df4.index, df4['rate'])],
        radius=["0%", "75%"],
        center=["50%", "50%"],
        label_opts=opts.LabelOpts(formatter="{b}: {d}%"),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="Job Degree Distribution", pos_left="left", pos_top="0.5%"
        ),
        legend_opts=opts.LegendOpts(pos_left="right", orient="vertical"),
    )
)
pie.render_notebook()

- From the data, we can see that 55% of the jobs require a bachelor's degree.

##### brandStageName

In [None]:
df['brandStageName']

In [None]:
df5 = df.groupby('brandStageName').agg({'securityId': 'count'})

In [None]:
df5

In [None]:
# sort by hand: groupby returns the labels in sorted order, so the rows are
# re-selected in an explicit, hand-picked order of financing stages
stage_order = ['不需要融资', '未融资', '天使轮', 'A轮', 'B轮', 'C轮', 'D轮及以上', '已上市']
df6 = df5.loc[stage_order]

In [None]:
# create a new column 'total' to calculate the sum of 'securityId'
df6['total'] = df6['securityId'].sum()

In [None]:
# create a new column 'rate' to calculate the rate of each brand stage
df6['rate'] = round(df6['securityId']/df6['total'], 2)

In [None]:
df6

In [None]:
# donut chart of the share of postings per company financing stage
pie = (
    Pie()
    .add(
        series_name="",
        data_pair=[[label, share] for label, share in zip(df6.index, df6['rate'])],
        radius=["30%", "75%"],
        center=["50%", "50%"],
        label_opts=opts.LabelOpts(formatter="{b}: {d}%"),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="Company Stage Distribution"),
        legend_opts=opts.LegendOpts(pos_left="right"),
    )
)
pie.render_notebook()

- From the data, we can see that 29% of companies do not require financing, 49% have not yet obtained financing, and only 7% have already gone public.

##### brandIndustry

In [None]:
df['brandIndustry']

In [None]:
df.groupby('brandIndustry').agg({'securityId': 'count'}).sort_values(by='securityId', ascending=False).head(10)

- From the data, we can see that the top four industries for data analysts are computer software, internet, consulting, and big data.

##### salaryDesc

In [None]:
df['salaryDesc']

In [None]:
# Function to transform salary description into a range of annual salary

# (marker character, suffix to strip, multiplier to reach a yearly amount)
# Checked in this order; the multipliers assume 12 months, 52 weeks,
# 5 working days per week and 8 working hours per day.
_SALARY_UNITS = (
    ('月', '元/月', 12),           # e.g. '1500-2000元/月'
    ('天', '元/天', 5 * 52),       # e.g. '90-100元/天'
    ('时', '元/时', 8 * 5 * 52),   # e.g. '25-70元/时'
    ('周', '元/周', 52),           # e.g. '1000-4000元/周'
)


def _parse_salary_bounds(text):
    """Parse 'low-high' (or a single number) into an (int, int) tuple, or None."""
    bounds = text.split('-')
    try:
        low = int(bounds[0])
        # a single value such as '200' is treated as the range 200-200
        high = int(bounds[1]) if len(bounds) > 1 else low
    except ValueError:
        return None
    return low, high


def _split_salary_unit(base):
    """Return (numeric_text, yearly_multiplier) for the pay unit in ``base``, or None."""
    for marker, suffix, per_year in _SALARY_UNITS:
        if marker in base:
            return base.replace(suffix, ''), per_year
    # monthly salary in thousands, e.g. '5-10K' (upper or lower case)
    if 'k' in base.lower():
        return base.lower().replace('k', ''), 1000 * 12
    return None


def salary_trans(x):
    """Convert a salary description into ``[annual_min, annual_max]``.

    Supported formats (taken from the examples seen in the data):
      * '5-10K·13薪'       -> thousands per month, paid for 13 months
      * '5-10K'            -> thousands per month, paid for 12 months
      * '1500-2000元/月'   -> amount per month
      * '1000-4000元/周'   -> amount per week
      * '90-100元/天'      -> amount per day
      * '25-70元/时'       -> amount per hour

    Parameters
    ----------
    x : str
        Salary description. Non-string values (e.g. NaN for a missing entry)
        are accepted and treated as unparseable.

    Returns
    -------
    list of int
        ``[annual_min, annual_max]``; ``[0, 0]`` when the description cannot
        be parsed.
    """
    if not isinstance(x, str):
        return [0, 0]

    parts = x.split('·')
    base = parts[0]

    # '<range>K·<n>薪': monthly salary in thousands paid n times a year
    if len(parts) == 2:
        bounds = _parse_salary_bounds(base.lower().replace('k', ''))
        try:
            months = int(parts[1].replace('薪', ''))
        except ValueError:
            months = None
        if bounds is not None and months is not None:
            return [bounds[0] * 1000 * months, bounds[1] * 1000 * months]
        # otherwise fall through and try to read the part before '·' on its own

    unit = _split_salary_unit(base)
    if unit is None:
        return [0, 0]
    numeric_text, per_year = unit
    bounds = _parse_salary_bounds(numeric_text)
    if bounds is None:
        return [0, 0]
    return [bounds[0] * per_year, bounds[1] * per_year]

In [None]:
df['annualSalary'] = df['salaryDesc'].apply(salary_trans) 

In [None]:
# midpoint of each [annual_min, annual_max] pair
df['middle_annualSalary'] = df['annualSalary'].apply(
    lambda bounds: (bounds[0] + bounds[1]) / 2
)

In [None]:
df['month_middle'] = df['middle_annualSalary'] / 12

In [None]:
df['month_middle']

In [None]:
# Bucket the monthly midpoint into salary bands. With pd.cut's defaults the
# bins are right-closed, i.e. (0, 3000], (3000, 5000], ...
# NOTE(review): values outside the edges (0 from unparseable salaries, or
# anything above 40000) get NaN here and drop out of the grouped counts.
monthly_bin_edges = [0, 3000, 5000, 8000, 10000, 13000, 15000, 20000, 25000, 30000, 35000, 40000]
df["month_group"] = pd.cut(df['month_middle'], bins=monthly_bin_edges)

In [None]:
# Count postings per salary band. 'month_group' is categorical (built by
# pd.cut), so observed=False is passed explicitly: empty bands stay in the
# result (and on the chart) and the outcome does not depend on the pandas
# version's default for `observed`.
df7 = df.groupby('month_group', observed=False).agg({'securityId': 'count'})

In [None]:
# bar chart of the number of postings per monthly salary band;
# the chart is widened so the rotated interval labels stay readable
bar = (
    Bar(init_opts=opts.InitOpts(width="1100px"))
    .add_xaxis(df7.index.astype(str).tolist())
    .add_yaxis("Number of Jobs", df7['securityId'].tolist())
    .set_global_opts(
        title_opts=opts.TitleOpts(title="Monthly Salary Distribution"),
        xaxis_opts=opts.AxisOpts(
            name="Monthly Salary", axislabel_opts=opts.LabelOpts(rotate=45)
        ),
        yaxis_opts=opts.AxisOpts(name="Number of Jobs"),
        legend_opts=opts.LegendOpts(is_show=False),
    )
)
bar.render_notebook()

- Based on the results, the most common band is 5,000 to 8,000 yuan per month (107 job postings), followed by 3,000 to 5,000 yuan (58 postings), 8,000 to 10,000 yuan (48 postings), and 10,000 to 13,000 yuan (47 postings). The counts are job postings, grouped by the midpoint of each advertised salary range.