# Import

In [1]:
from calendar import monthrange
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import re
from string import digits
import time
import pandas as pd
import numpy as np
from tqdm import tqdm 
import func
from func import retrieve_job_post_data, global_id, to_quarter, quarter_to_month
from pyscbwrapper import SCB
from ast import literal_eval

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


# Retrieve Danish data

### Retrieve data for regions

#### Retrieve monthly data 

In [2]:
# the five Danish regions, spelled the way they are passed to retrieve_job_post_data as `areas`
regions = ['region-' + name for name in ('hovedstaden', 'sjaelland', 'midtjylland', 'syddanmark', 'nordjylland')]

In [3]:
df_DK_jobs_monthly = retrieve_job_post_data(areas = regions, years = [*range(2004, 2020)], months = [*range(1, 13)], base_url = "https://www.jobindex.dk/jobsoegning/", country = 'DK')

100%|██████████| 189/189 [29:11<00:00,  9.64s/it]


In [4]:
# save to csv 
df_DK_jobs_monthly.to_csv('data/job_posts/df_DK_jobs_monthly.csv', index = False)

#### Retrieve quarterly data 

In [5]:
# quarterly run: years 2004-2019, one request month per quarter (1, 4, 7, 10)
quarterly = True
years = list(range(2004, 2020))
months = list(range(1, 13, 3))
# the five Danish regions, spelled the way they are passed to retrieve_job_post_data as `areas`
regions = ['region-' + name for name in ('hovedstaden', 'sjaelland', 'midtjylland', 'syddanmark', 'nordjylland')]

In [None]:
df_DK_jobs_quarterly = retrieve_job_post_data(areas = regions, years = years, months = months, base_url = "https://www.jobindex.dk/jobsoegning/", country = 'DK', quarterly = quarterly)

In [7]:
# save to csv 
df_DK_jobs_quarterly.to_csv('data/job_posts/df_DK_jobs_quarterly.csv', index = False)

## Retrieve Swedish data 

#### Retrieve monthly data 

In [12]:
# Swedish counties (län): lower-case each name, turn spaces into hyphens
# and transliterate å -> aa, ä -> ae, ö -> oe
SE_regions = [
    county.lower()
          .replace(" ", "-")
          .replace("å", "aa")
          .replace("ä", "ae")
          .replace("ö", "oe")
    for county in [
        'Blekinge län', 'Dalarnas län', 'Gotlands län', 'Gävleborgs län',
        'Hallands län', 'Jämtlands län', 'Jönköpings län', 'Kalmar län',
        'Kronobergs län', 'Norrbottens län', 'Skåne län', 'Stockholms län',
        'Södermanlands län', 'Uppsala län', 'Värmlands län', 'Västerbottens län',
        'Västernorrland län', 'Västmanlands län', 'Västra Götalands län', 'Örebro län',
        'Östergötlands län',
    ]
]


#SE_regions

In [13]:
base_url = 'https://www.jobbsafari.se/jobbsoekning/'

In [None]:
df_SE_jobs_monthly = retrieve_job_post_data(areas = SE_regions, years = [*range(2004, 2020)], months = [*range(1, 13)], base_url = base_url, country = 'SE')

In [15]:
# save to csv 
df_SE_jobs_monthly.to_csv('data/job_posts/df_SE_jobs_monthly.csv', index = False)

#### Retrieve quarterly data 

In [18]:
# quarterly run: years 2004-2019, one request month per quarter (1, 4, 7, 10)
quarterly = True
years = list(range(2004, 2020))
months = list(range(1, 13, 3))

In [None]:
df_SE_jobs_quarterly = retrieve_job_post_data(areas = SE_regions, years = [*range(2004, 2020)], months = months, base_url = base_url, country = 'SE', quarterly = quarterly)

In [24]:
# save to csv 
df_SE_jobs_quarterly.to_csv('data/job_posts/df_SE_jobs_quarterly.csv', index = False)

## Retrieve Norwegian data 

In [20]:
# Norwegian counties: sort the original names first, then lower-case each one,
# turn spaces into hyphens and transliterate å -> aa, ä -> ae, ö -> oe, ø -> oe
NO_regions = [
    county.lower()
          .replace(" ", "-")
          .replace("å", "aa")
          .replace("ä", "ae")
          .replace("ö", "oe")
          .replace("ø", "oe")
    for county in sorted([
        'Østfold', 'Akershus', 'Oslo', 'Hedmark', 'Oppland',
        'Buskerud', 'Vestfold', 'Telemark', 'Aust-Agder', 'Vest-Agder',
        'Rogaland', 'Hordaland', 'Sogn og Fjordane', 'Møre og Romsdal',
        'Nordland', 'Troms', 'Finnmark', 'Sør-Trøndelag', 'Nord-Trøndelag',
    ])
]

# base url of the Norwegian job portal
base_url = 'https://www.jobbsafari.no/jobbsoeking/'

#NO_regions

#### Retrieve monthly data

In [None]:
# Run function
df_NO_jobs_monthly = retrieve_job_post_data(areas = NO_regions, years = [*range(2004, 2020)], months = [*range(1, 13)], base_url = base_url, country = 'NO')

In [22]:
# save to csv 
df_NO_jobs_monthly.to_csv('data/job_posts/df_NO_jobs_monthly.csv', index = False)

#### Retrieve quarterly data

In [28]:
# quarterly run: years 2004-2019, one request month per quarter (1, 4, 7, 10)
quarterly = True
years = list(range(2004, 2020))
months = list(range(1, 13, 3))

In [None]:
# Run function
df_NO_jobs_quarterly = retrieve_job_post_data(areas = NO_regions, years = [*range(2004, 2020)], months = months, base_url = base_url, country = 'NO', quarterly = quarterly)

In [33]:
# save to csv 
df_NO_jobs_quarterly.to_csv('data/job_posts/df_NO_jobs_quarterly.csv', index = False)

## Preprocessing

### Correct region format

In [2]:
def quarter_to_month(df, variable_names = None, orig_format = '%YK%m', id_col = 'ID'):
    """
    Expand quarterly rows to monthly rows so the data can be merged on monthly data.

    Every row is repeated three times and the copies are dated by position:
    within each (id_col, year) group the k-th row gets month k. The input must
    therefore be sorted by id_col and date, and every year that is present must
    start at its first quarter and have no gaps; otherwise the assigned months
    do not correspond to the original quarters.

    Parameters:
    ===========
    df: pandas dataframe with quarterly observations
    variable_names: a list of column names, assigned to the columns of df by position;
        must contain 'date' and id_col.
        Defaults to ['region', 'date', 'population', 'pop_danish_share'].
    orig_format: original date format of the 'date' column (only the year is used)
    id_col: name of the column that identifies the unit the rows belong to

    Returns:
    ========
    pandas dataframe with three rows per input row, a string index, the columns
    named by variable_names and 'date' set to the first day of the assigned month.
    The original column dtypes are not preserved in general, because the values
    pass through a numpy array.

    Raises:
    =======
    KeyError: if 'date' or id_col is not among the renamed columns

    Example:
    ========
        quarter_to_month(df = df, variable_names = ['date', 'labour_force', 'ID'], orig_format = '%Y-%m-%d')
        quarter_to_month(df = df, variable_names = ['region','date', 'population', 'pop_danish_share'], id_col = 'region')
    """
    if variable_names is None:
        variable_names = ['region', 'date', 'population', 'pop_danish_share']

    # Repeat every quarterly row three times; going through .values drops the
    # column labels, so they are re-attached by position afterwards
    monthly = pd.DataFrame(np.repeat(df.values, 3, axis=0))
    monthly = monthly.rename(index=str, columns=dict(enumerate(variable_names)))

    missing = [col for col in ('date', id_col) if col not in monthly.columns]
    if missing:
        raise KeyError("quarter_to_month needs column(s) {} but only has {}".format(
            missing, list(monthly.columns)))

    # Only the year of the original date is kept; the month is the running
    # position of the row within its (id_col, year) group
    year = pd.to_datetime(monthly['date'], format=orig_format).dt.year
    month = monthly.groupby([monthly[id_col], year]).cumcount() + 1

    # Assemble the date from its components instead of parsing 'YYYY-M' strings
    monthly['date'] = pd.to_datetime(pd.DataFrame({'year': year, 'month': month, 'day': 1}))
    return monthly

In [3]:
# create function

def preprocess_job_posts(path_job_posts ='data/job_posts/df_DK_jobs_quarterly.csv', path_labour_force = "data/job_posts/df_DK_labour_force.csv", monthly = False):
    """
    Build the final job-post table: replace the area name by its ID, attach the
    labour force and split the 'sectors' column into one column per sector.

    Parameters:
    ===========
    path_job_posts: path to the job-post csv; the columns 'date', 'area' and 'sectors' are used
    path_labour_force: path to the labour-force csv; the columns 'date' and 'ID' are used as merge keys
    monthly: if truthy, the labour-force rows are expanded to monthly rows with
        quarter_to_month before merging. The columns are then labelled
        'date', 'labour_force', 'ID' -- presumably by position, so the file is
        expected to hold exactly these columns in this order (confirm against
        quarter_to_month).

    Returns:
    ========
    pandas dataframe with the job-post columns (without 'area' and 'sectors'),
    the 'ID' column, the labour-force columns (missing where no labour-force
    row matches) and ten 'sector_*' columns.
    """
    # Target column for each position in the parsed 'sectors' value.
    # NOTE(review): assumes the scraped sectors always come in this order -- confirm
    sector_columns = [
        'sector_information_technology',
        'sector_engineering_technology',
        'sector_management_staff',
        'sector_trade_service',
        'sector_industry_craft',
        'sector_sales_communication',
        'sector_teaching',
        'sector_office_finance',
        'sector_social_health',
        'sector_other',
    ]

    # load data and correct date format
    job_posts = pd.read_csv(path_job_posts, parse_dates=['date'])

    # Look up the ID of every area via the 'jobindex' spelling of the lookup table.
    # This is an inner merge: rows whose area has no match are dropped.
    area = func.global_id()
    job_posts = job_posts.merge(area[['ID', 'jobindex']], left_on='area', right_on='jobindex')
    job_posts = job_posts.drop(['jobindex', 'area'], axis=1)

    # read in the labour force file, ordered by ID and date
    labour_force = pd.read_csv(path_labour_force, parse_dates=['date'])
    labour_force = labour_force.sort_values(['ID', 'date'])

    if monthly:
        labour_force = quarter_to_month(labour_force, variable_names=['date', 'labour_force', 'ID'], orig_format='%Y-%m-%d')

    # keep every job-post row, also those without a labour-force figure
    job_posts = job_posts.merge(labour_force, on=['date', 'ID'], how='left')

    # 'sectors' is stored as the text of a Python literal; literal_eval only
    # evaluates literals. From the first element of each parsed value, take the
    # second item of the entry at every sector position.
    parsed_sectors = job_posts['sectors'].apply(literal_eval)
    for position, column in enumerate(sector_columns):
        job_posts[column] = [entry[0][position][1] for entry in parsed_sectors]

    return job_posts.drop('sectors', axis=1)

#### Preprocess Danish data

Quarterly

In [4]:
df_DK_job_posts_quarterly = preprocess_job_posts(path_job_posts ='data/job_posts/df_DK_jobs_quarterly.csv', path_labour_force = "data/job_posts/df_DK_labour_force.csv")

In [5]:
# save to csv 
df_DK_job_posts_quarterly.to_csv('data/job_posts/df_DK_jobposts_quarterly_final.csv', index = False)

In [6]:
df_DK_job_posts_quarterly.tail()

Unnamed: 0,date,jobs,ID,labour_force,sector_information_technology,sector_engineering_technology,sector_management_staff,sector_trade_service,sector_industry_craft,sector_sales_communication,sector_teaching,sector_office_finance,sector_social_health,sector_other
310,2018-07-01,5034,North Denmark,292000.0,251,378,548,766,806,423,579,601,855,808
311,2018-10-01,4935,North Denmark,289000.0,247,445,590,564,764,480,583,656,871,789
312,2019-01-01,5435,North Denmark,293000.0,275,415,561,763,823,516,489,734,1010,990
313,2019-04-01,5825,North Denmark,290000.0,248,425,560,861,945,512,676,682,912,1095
314,2019-07-01,4248,North Denmark,,231,344,435,559,761,411,436,533,683,634


Monthly

In [152]:
df_DK_job_posts_monthly = preprocess_job_posts(path_job_posts ='data/job_posts/df_DK_jobs_monthly.csv', path_labour_force = "data/job_posts/df_DK_labour_force.csv", monthly = True)

# save to csv 
df_DK_job_posts_monthly.to_csv('data/job_posts/df_DK_jobposts_monthly_final.csv', index = False)

In [153]:
df_DK_job_posts_monthly.tail(10)

Unnamed: 0,date,jobs,ID,labour_force,sector_information_technology,sector_engineering_technology,sector_management_staff,sector_trade_service,sector_industry_craft,sector_sales_communication,sector_teaching,sector_office_finance,sector_social_health,sector_other
935,2018-12-01,1360,North Denmark,295000.0,73,148,177,139,207,127,157,200,231,241
936,2019-01-01,1909,North Denmark,297000.0,100,137,190,254,248,189,187,258,360,367
937,2019-02-01,1647,North Denmark,297000.0,85,128,181,241,255,152,161,215,304,308
938,2019-03-01,1879,North Denmark,297000.0,90,150,190,268,320,175,141,261,346,315
939,2019-04-01,1870,North Denmark,,96,161,204,283,304,182,164,225,288,358
940,2019-05-01,2147,North Denmark,,87,148,210,305,322,190,292,250,358,390
941,2019-06-01,1808,North Denmark,,65,116,146,273,319,140,220,207,266,347
942,2019-07-01,1630,North Denmark,,89,134,162,187,287,149,138,195,310,258
943,2019-08-01,1815,North Denmark,,100,153,201,260,331,179,205,237,236,281
944,2019-09-01,803,North Denmark,,42,57,72,112,143,83,93,101,137,95


#### Preprocess Swedish data

Quarterly

In [7]:
df_SE_job_posts_quarterly = preprocess_job_posts(path_job_posts ='data/job_posts/df_SE_jobs_quarterly.csv', path_labour_force = "data/job_posts/df_SE_labour_force.csv")

# save to csv 
df_SE_job_posts_quarterly.to_csv('data/job_posts/df_SE_jobposts_quarterly_final.csv', index = False)

In [8]:
df_SE_job_posts_quarterly.tail()

Unnamed: 0,date,jobs,ID,labour_force,sector_information_technology,sector_engineering_technology,sector_management_staff,sector_trade_service,sector_industry_craft,sector_sales_communication,sector_teaching,sector_office_finance,sector_social_health,sector_other
1318,2018-07-01,761,Östergötland,251800.0,991,486,310,741,974,1052,816,899,1200,477
1319,2018-10-01,8112,Östergötland,241800.0,1370,456,348,808,738,1064,1319,1009,1412,576
1320,2019-01-01,7883,Östergötland,243800.0,1249,447,244,970,692,1020,1114,938,1664,493
1321,2019-04-01,715,Östergötland,249400.0,830,380,238,895,567,864,1519,764,1423,402
1322,2019-07-01,3865,Östergötland,251700.0,503,199,156,514,336,550,641,469,806,214


Monthly

In [156]:
df_SE_job_posts_monthly = preprocess_job_posts(path_job_posts ='data/job_posts/df_SE_jobs_monthly.csv', path_labour_force = "data/job_posts/df_SE_labour_force.csv", monthly = True)

# save to csv 
df_SE_job_posts_monthly.to_csv('data/job_posts/df_SE_jobposts_monthly_final.csv', index = False)

In [157]:
df_SE_job_posts_monthly.tail(10)

Unnamed: 0,date,jobs,ID,labour_force,sector_information_technology,sector_engineering_technology,sector_management_staff,sector_trade_service,sector_industry_craft,sector_sales_communication,sector_teaching,sector_office_finance,sector_social_health,sector_other
3959,2018-12-01,2220,Östergötland,241600.0,437,160,104,223,158,271,372,276,422,108
3960,2019-01-01,2610,Östergötland,241400.0,429,153,72,380,204,347,318,313,567,150
3961,2019-02-01,2556,Östergötland,241400.0,418,160,80,282,223,334,315,294,531,205
3962,2019-03-01,2717,Östergötland,241400.0,402,134,92,308,265,339,481,331,566,138
3963,2019-04-01,2593,Östergötland,245600.0,319,151,94,394,196,321,516,283,523,127
3964,2019-05-01,2520,Östergötland,245600.0,277,128,84,295,223,306,543,292,516,167
3965,2019-06-01,1902,Östergötland,245600.0,234,101,60,206,148,237,460,189,384,108
3966,2019-07-01,1558,Östergötland,,225,80,70,198,140,242,236,184,293,97
3967,2019-08-01,1643,Östergötland,,193,79,58,242,153,219,279,200,362,85
3968,2019-09-01,664,Östergötland,,85,40,28,74,43,89,126,85,151,32


#### Preprocess Norwegian data

Quarterly

In [158]:
df_NO_job_posts_quarterly = preprocess_job_posts(path_job_posts ='data/job_posts/df_NO_jobs_quarterly.csv', path_labour_force = "data/job_posts/df_NO_labour_force.csv")

# save to csv 
df_NO_job_posts_quarterly.to_csv('data/job_posts/df_NO_jobposts_quarterly_final.csv', index = False)

In [159]:
df_NO_job_posts_quarterly.tail()

Unnamed: 0,date,jobs,ID,labour_force,sector_information_technology,sector_engineering_technology,sector_management_staff,sector_trade_service,sector_industry_craft,sector_sales_communication,sector_teaching,sector_office_finance,sector_social_health,sector_other
1192,2018-07-01,1840,Østfold,142000.0,39,89,114,233,288,150,169,130,418,381
1193,2018-10-01,2551,Østfold,140000.0,71,197,222,199,366,184,241,195,658,494
1194,2019-01-01,2452,Østfold,141000.0,96,141,216,241,369,216,246,159,657,430
1195,2019-04-01,1878,Østfold,141000.0,62,111,186,210,227,145,257,110,480,309
1196,2019-07-01,1243,Østfold,,26,72,102,170,129,125,120,90,316,218


Monthly

In [160]:
df_NO_job_posts_monthly = preprocess_job_posts(path_job_posts ='data/job_posts/df_NO_jobs_monthly.csv', path_labour_force = "data/job_posts/df_NO_labour_force.csv", monthly = True)

# save to csv 
df_NO_job_posts_monthly.to_csv('data/job_posts/df_NO_jobposts_monthly_final.csv', index = False)

### Test which sectors to include 

In [3]:
df_merged= pd.read_csv('data/df_analysis_adj.csv', sep = ';', index_col = 0, parse_dates = ['date'])

In [4]:
df_merged = df_merged.sort_values(['ID','date'],ascending=False)

In [8]:
df_merged.columns

Index(['date', 'target_actual', 'ID', 'jobs', 'sector_information_technology',
       'sector_engineering_technology', 'sector_management_staff',
       'sector_trade_service', 'sector_industry_craft',
       'sector_sales_communication', 'sector_teaching',
       'sector_office_finance', 'sector_social_health', 'sector_other', 'GT_0',
       'GT_1', 'GT_2', 'GT_3', 'GT_4', 'GT_5', 'GT_6', 'GT_7', 'GT_8', 'GT_9',
       'GT_10', 'GT_11', 'GT_12', 'GT_13', 'GT_14', 'GT_15', 'GT_16', 'GT_17',
       'GT_18', 'GT_19', 'pop', 'mvu_lvu_share_pop', 'labour_force_share',
       'w_ave_socio_index', 'w_ave_urban_index'],
      dtype='object')

### Correlation tables

In [10]:
df_merged[['jobs','target_actual']].corr()

Unnamed: 0,jobs,target_actual
jobs,1.0,-0.609623
target_actual,-0.609623,1.0


In [11]:
df_merged[['sector_information_technology','target_actual']].corr()

Unnamed: 0,sector_information_technology,target_actual
sector_information_technology,1.0,-0.229136
target_actual,-0.229136,1.0


In [12]:
df_merged[['sector_engineering_technology','target_actual']].corr()

Unnamed: 0,sector_engineering_technology,target_actual
sector_engineering_technology,1.0,-0.441454
target_actual,-0.441454,1.0


In [13]:
df_merged[['sector_management_staff','target_actual']].corr()

Unnamed: 0,sector_management_staff,target_actual
sector_management_staff,1.0,-0.427607
target_actual,-0.427607,1.0


In [14]:
df_merged[['sector_trade_service','target_actual']].corr()

Unnamed: 0,sector_trade_service,target_actual
sector_trade_service,1.0,-0.706581
target_actual,-0.706581,1.0


In [15]:
df_merged[['sector_industry_craft','target_actual']].corr()

Unnamed: 0,sector_industry_craft,target_actual
sector_industry_craft,1.0,-0.733885
target_actual,-0.733885,1.0


In [16]:
df_merged[['sector_sales_communication','target_actual']].corr()

Unnamed: 0,sector_sales_communication,target_actual
sector_sales_communication,1.0,-0.309158
target_actual,-0.309158,1.0


In [17]:
df_merged[['sector_teaching','target_actual']].corr()

Unnamed: 0,sector_teaching,target_actual
sector_teaching,1.0,-0.667923
target_actual,-0.667923,1.0


In [18]:
df_merged[['sector_office_finance','target_actual']].corr()

Unnamed: 0,sector_office_finance,target_actual
sector_office_finance,1.0,-0.463791
target_actual,-0.463791,1.0


In [19]:
df_merged[['sector_social_health','target_actual']].corr()

Unnamed: 0,sector_social_health,target_actual
sector_social_health,1.0,-0.705958
target_actual,-0.705958,1.0


In [20]:
df_merged[['sector_other','target_actual']].corr()

Unnamed: 0,sector_other,target_actual
sector_other,1.0,-0.228513
target_actual,-0.228513,1.0
