## Import

In [55]:
import pandas as pd
import numpy as np
import requests



import os
import sys
currentdir = os.path.dirname(os.path.realpath('analysis_DK'))
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir)

from func import global_id, quarter_to_month, reindex, GT_dict

## DK 

#### Target data 

In [56]:
df_dk_target = pd.read_csv(r'data/target/df_DK_target.csv', parse_dates = ['date'])

In [57]:
df_dk_target = df_dk_target.sort_values(by=['ID', 'date'])

In [58]:
df_dk_target.date.min()

Timestamp('2008-01-01 00:00:00')

In [59]:
df_dk_target.date.max()

Timestamp('2019-07-01 00:00:00')

#### Job posts

In [60]:
df_dk_jobs = pd.read_csv(r'data/job_posts/df_DK_jobposts_quarterly_final.csv', parse_dates = ['date'])

In [61]:
df_dk_jobs[df_dk_jobs.date == '2019-01-01'].jobs.sum()

72786

In [62]:
df_dk_jobs = df_dk_jobs.drop(['labour_force'], axis = 1)

Merge on labour force 

In [63]:
df_dk_labour = pd.read_csv(r'data/job_posts/df_DK_labour_force.csv', parse_dates = ['date'])

In [64]:
df_dk_jobs = df_dk_jobs.merge(df_dk_labour, on = ['date', 'ID'])

In [65]:
df_dk_jobs['labour_force'] = df_dk_jobs.groupby('ID').labour_force.shift(1)

Create job rate

In [66]:
jobs_list = ['jobs', 'sector_information_technology', 'sector_engineering_technology','sector_management_staff', 'sector_trade_service',
       'sector_industry_craft', 'sector_sales_communication',
       'sector_teaching', 'sector_office_finance', 'sector_social_health',
       'sector_other']

In [67]:
for col in jobs_list:
    df_dk_jobs[col] = (df_dk_jobs[col] / df_dk_jobs['labour_force'])*100

In [68]:
df_dk_jobs = df_dk_jobs.drop(['labour_force'], axis = 1)

In [69]:
df_dk_jobs.date.min()

Timestamp('2008-01-01 00:00:00')

In [70]:
df_dk_jobs.date.max()

Timestamp('2019-07-01 00:00:00')

#### Google

#### Regional

In [71]:
df_dk_controls = pd.read_csv(r'data/controls/df_DK_controls.csv', parse_dates = ['date'])

In [72]:
df_dk_controls = df_dk_controls.drop(['w_ave_socio_index'], axis = 1)

In [73]:
df_dk_controls = df_dk_controls[df_dk_controls['date'].map(lambda x: x.month).isin([1, 4, 7, 10])]

In [74]:
df_dk_controls.date.min()

Timestamp('2007-04-01 00:00:00')

In [75]:
df_dk_controls.date.max()

Timestamp('2019-07-01 00:00:00')

#### Merge data

In [76]:
df_DK = df_dk_target.merge(df_dk_jobs, on = ['date', 'ID'])
df_DK.shape

(235, 14)

In [77]:
df_DK = df_DK.merge(df_dk_controls, on = ['date', 'ID'])
df_DK.shape

(235, 18)

In [78]:
df_DK.date.min()

Timestamp('2008-01-01 00:00:00')

In [79]:
df_DK[df_DK['jobs'].notnull()]

Unnamed: 0,date,target_actual,ID,jobs,sector_information_technology,sector_engineering_technology,sector_management_staff,sector_trade_service,sector_industry_craft,sector_sales_communication,sector_teaching,sector_office_finance,sector_social_health,sector_other,pop,mvu_lvu_share_pop,labour_force_share,w_ave_urban_index
1,2008-04-01,4.13,Capital,3.432511,0.269507,0.229372,0.342377,0.502130,0.298991,0.262332,0.404260,0.754933,0.402130,0.372197,1645825,18.521046,52.812799,97.184792
2,2008-07-01,3.72,Capital,2.837729,0.236353,0.195413,0.294151,0.424885,0.221101,0.221674,0.356995,0.601147,0.338188,0.288073,1648990,19.020249,51.326716,97.226230
3,2008-10-01,3.88,Capital,2.253597,0.179017,0.171703,0.253118,0.277818,0.119424,0.174460,0.297842,0.485731,0.330096,0.235612,1650985,18.997265,52.493338,97.213391
4,2009-01-01,6.18,Capital,1.941019,0.129733,0.139442,0.204126,0.239806,0.083374,0.130947,0.264078,0.399879,0.357403,0.228398,1660042,18.893618,52.154598,97.219624
5,2009-04-01,6.56,Capital,1.766508,0.097506,0.103444,0.180998,0.239667,0.089430,0.126841,0.273397,0.315321,0.289311,0.249525,1662285,18.868124,50.858507,97.251816
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,2018-07-01,4.64,Zealand,1.658612,0.064781,0.124936,0.170694,0.223907,0.235219,0.123136,0.235219,0.184576,0.418509,0.211568,835107,13.596821,45.878892,81.492432
231,2018-10-01,5.44,Zealand,1.706443,0.067526,0.125773,0.198454,0.226289,0.214948,0.134278,0.268814,0.195876,0.430412,0.216753,836379,13.576142,45.852419,81.500688
232,2019-01-01,5.64,Zealand,1.949482,0.073834,0.131347,0.202073,0.263731,0.265803,0.148446,0.238601,0.230052,0.516321,0.279016,836694,13.571031,46.022296,81.505010
233,2019-04-01,4.76,Zealand,1.893846,0.057692,0.125385,0.192564,0.270256,0.271795,0.124103,0.277949,0.201282,0.454359,0.289231,836738,13.570317,46.898601,81.516592


In [80]:
df_DK['country'] = 'DK'

In [81]:
df_DK.head()

Unnamed: 0,date,target_actual,ID,jobs,sector_information_technology,sector_engineering_technology,sector_management_staff,sector_trade_service,sector_industry_craft,sector_sales_communication,sector_teaching,sector_office_finance,sector_social_health,sector_other,pop,mvu_lvu_share_pop,labour_force_share,w_ave_urban_index,country
0,2008-01-01,3.92,Capital,,,,,,,,,,,,1636749,18.623747,52.882006,97.166662,DK
1,2008-04-01,4.13,Capital,3.432511,0.269507,0.229372,0.342377,0.50213,0.298991,0.262332,0.40426,0.754933,0.40213,0.372197,1645825,18.521046,52.812799,97.184792,DK
2,2008-07-01,3.72,Capital,2.837729,0.236353,0.195413,0.294151,0.424885,0.221101,0.221674,0.356995,0.601147,0.338188,0.288073,1648990,19.020249,51.326716,97.22623,DK
3,2008-10-01,3.88,Capital,2.253597,0.179017,0.171703,0.253118,0.277818,0.119424,0.17446,0.297842,0.485731,0.330096,0.235612,1650985,18.997265,52.493338,97.213391,DK
4,2009-01-01,6.18,Capital,1.941019,0.129733,0.139442,0.204126,0.239806,0.083374,0.130947,0.264078,0.399879,0.357403,0.228398,1660042,18.893618,52.154598,97.219624,DK


In [82]:
df_DK.date.max()

Timestamp('2019-07-01 00:00:00')

In [83]:
df_DK.date.min()

Timestamp('2008-01-01 00:00:00')

#### Save the data 

In [84]:
df_DK.to_csv('data/DK_SE_NO/df_dk.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'data/DK_SE_NO/df_dk.csv'

## SE

#### Target data 

In [139]:
df_se_target = pd.read_csv(r'data/target/df_SE_target.csv', parse_dates = ['date'])

In [140]:
df_se_target.date.min()

Timestamp('2005-04-01 00:00:00')

In [141]:
df_se_target.date.max()

Timestamp('2019-07-01 00:00:00')

Create target lag

In [142]:
#df_se_target['target_lag'] = df_se_target.target_actual.shift(1)

#### Job posts

In [143]:
df_se_jobs = pd.read_csv(r'data/job_posts/df_SE_jobposts_quarterly_final.csv', parse_dates = ['date'])

In [144]:
df_se_jobs = df_se_jobs.drop(['labour_force'], axis = 1)

Merge on labour force 

In [145]:
df_se_labour = pd.read_csv(r'data/job_posts/df_SE_labour_force.csv', parse_dates = ['date'])

In [146]:
df_se_jobs = df_se_jobs.merge(df_se_labour, on = ['date', 'ID'])

In [147]:
df_se_jobs['labour_force'] = df_se_jobs.groupby('ID').labour_force.shift(1)

Create job rate

In [148]:
jobs_list = ['jobs', 'sector_information_technology', 'sector_engineering_technology','sector_management_staff', 'sector_trade_service',
       'sector_industry_craft', 'sector_sales_communication',
       'sector_teaching', 'sector_office_finance', 'sector_social_health',
       'sector_other']

In [149]:
for col in jobs_list:
    df_se_jobs[col] = (df_se_jobs[col] / df_se_jobs['labour_force'])*100

In [150]:
df_se_jobs = df_se_jobs.drop(['labour_force'], axis = 1)

In [151]:
df_se_jobs.date.min()

Timestamp('2005-04-01 00:00:00')

In [152]:
df_se_jobs.date.max()

Timestamp('2019-07-01 00:00:00')

#### Google

#### Regional

In [153]:
df_SE_controls = pd.read_csv('data/controls/df_SE_controls.csv', parse_dates = ['date'])

In [154]:
df_SE_controls = df_SE_controls[['reg_nr','ID','date','pop', 'labour_force_share','mvu_lvu_share','urban_share']]

In [155]:
df_SE_controls = df_SE_controls.replace('Västra Götaland', 'Västra Götalands')

In [156]:
df_SE_controls.date.min()

Timestamp('2005-01-01 00:00:00')

In [157]:
df_SE_controls.date.max()

Timestamp('2019-07-01 00:00:00')

Shift controls

In [158]:
df_SE_controls['pop'] = df_SE_controls.groupby('ID').pop.shift(5)
df_SE_controls['labour_force_share'] = df_SE_controls.groupby('ID').labour_force_share.shift(1)
df_SE_controls['mvu_lvu_share'] = df_SE_controls.groupby('ID').mvu_lvu_share.shift(6)
df_SE_controls['urban_share'] = df_SE_controls.groupby('ID').urban_share.shift(7)

#### Merge data

In [159]:
df_SE = df_se_target.merge(df_se_jobs, on = ['date', 'ID'])

In [160]:
df_SE = df_SE.merge(df_SE_controls, on = ['date', 'ID'], how = 'left')

In [161]:
df_SE['country'] = 'SE'

In [162]:
#df_SE = df_SE[df_SE.date >= '2007-01-01']

In [163]:
df_SE.date.min()

Timestamp('2005-04-01 00:00:00')

In [164]:
df_SE.date.max()

Timestamp('2019-07-01 00:00:00')

In [165]:
df_SE.shape

(1218, 20)

#### Save the data 

## NO

In [53]:
df_no_target = pd.read_csv(r'data/target/df_NO_target.csv',  parse_dates = ['date'])

#### Target data 

In [54]:
df_no_target.date.min()

Timestamp('1996-01-01 00:00:00')

In [55]:
df_no_target.date.max()

Timestamp('2019-04-01 00:00:00')

Create target lag

In [56]:
df_no_target['target_lag'] = df_no_target.groupby('ID').target_actual.shift(1)

#### Job posts

In [57]:
df_no_jobs = pd.read_csv(r'data/job_posts/df_NO_jobposts_quarterly_final.csv', parse_dates = ['date'])

In [58]:
df_no_jobs = df_no_jobs.drop(['labour_force'], axis = 1)

Correct Nord- Sør trøndelag

In [59]:
test = df_no_jobs[df_no_jobs['ID'].str.startswith('Nord-Trøndelag')]
#test = test.reset_index()

test2 = df_no_jobs[df_no_jobs['ID'].str.startswith('Sør-Trøndelag')]
#test2 = test2.reset_index()


In [60]:
temp = pd.concat([test,test2])
temp = temp.groupby('date').sum()
temp['ID'] ='Trøndelag'

In [61]:
temp = temp.reset_index()

Correct labour force for those two regions

In [62]:
df = pd.read_csv("data/other/df_NO_labour_force.csv",  parse_dates = ['date'])
df = df[['date','labour_force', 'ID']]

In [63]:
test = df[df['ID'].str.startswith('Nord-Trøndelag')]
test = test.reset_index()

test2 = df[df['ID'].str.startswith('Sør-Trøndelag')]
test2 = test2.reset_index()

test3 = df[df['ID'].str.startswith('Trøndelag')]
test3 = test3.reset_index()

In [64]:
temp = pd.concat([test,test2, test3])
temp = temp.groupby('date').sum()
temp['ID'] ='Trøndelag'

In [65]:
temp = temp.reset_index()

In [66]:
temp = temp[['date', 'labour_force', 'ID']]

In [67]:
df= df[~df.ID.isin(['Nord-Trøndelag', 'Sør-Trøndelag', 'Trøndelag'])]

df = pd.concat([df, temp])
df['labour_force'] = df['labour_force']*1000

In [68]:
df_no_jobs = df_no_jobs.merge(df, on = ['date', 'ID'])

In [69]:
df_no_jobs['labour_force'] =  df_no_jobs.groupby('ID').labour_force.shift(1)

Create job rate

In [70]:
jobs_list = ['jobs', 'sector_information_technology', 'sector_engineering_technology','sector_management_staff', 'sector_trade_service',
       'sector_industry_craft', 'sector_sales_communication',
       'sector_teaching', 'sector_office_finance', 'sector_social_health',
       'sector_other']

In [71]:
for col in jobs_list:
    df_no_jobs[col] = (df_no_jobs[col] / df_no_jobs['labour_force'])*100

In [72]:
df_no_jobs = df_no_jobs.drop(['labour_force'], axis = 1)

#### Google

#### Regional

In [73]:
df_NO_controls = pd.read_csv('data/other/df_NO_controls.csv', parse_dates = ['date'])

In [74]:
df_NO_controls.tail()

Unnamed: 0,ID,date,pop,reg_nr,labour_force_share,mvu_lvu_share,urban_share
1219,Trøndelag,2018-10-01,458744,50,70.0,33.7,74.662993
1220,Trøndelag,2019-01-01,464060,50,71.0,,
1221,Trøndelag,2019-04-01,464060,50,70.0,,
1222,Trøndelag,2019-07-01,464060,50,,,
1223,Trøndelag,2019-10-01,464060,50,,,


In [75]:
df_NO_controls = df_NO_controls.replace('Finnmark - Finnmárku', 'Finnmark')
df_NO_controls = df_NO_controls.replace('Troms - Romsa', 'Troms')

Shift controls

In [76]:
df_NO_controls['pop'] = df_NO_controls.groupby('ID').pop.shift(1)
df_NO_controls['labour_force_share'] = df_NO_controls.groupby('ID').labour_force_share.shift(1)
df_NO_controls['mvu_lvu_share'] = df_NO_controls.groupby('ID').mvu_lvu_share.shift(6)
df_NO_controls['urban_share'] = df_NO_controls.groupby('ID').urban_share.shift(4)

In [77]:
df_NO_controls.tail()

Unnamed: 0,ID,date,pop,reg_nr,labour_force_share,mvu_lvu_share,urban_share
1219,Trøndelag,2018-10-01,458744.0,50,70.0,31.15,74.361851
1220,Trøndelag,2019-01-01,458744.0,50,70.0,31.15,74.662993
1221,Trøndelag,2019-04-01,464060.0,50,71.0,31.15,74.662993
1222,Trøndelag,2019-07-01,464060.0,50,70.0,33.7,74.662993
1223,Trøndelag,2019-10-01,464060.0,50,,33.7,74.662993


#### Merge data

In [78]:
df_NO = df_no_target.merge(df_no_jobs, on = ['date', 'ID'])

In [79]:
df_NO = df_NO.merge(df_NO_controls, on = ['date', 'ID'], how = 'left')

In [80]:
df_NO['country'] = 'NO'

In [81]:
df_NO = df_NO[df_NO.date >= '2007-01-01']

In [82]:
df_NO.date.min()

Timestamp('2007-01-01 00:00:00')

In [83]:
df_NO.date.max()

Timestamp('2019-04-01 00:00:00')

#### Save the data 

## Concat the files

In [85]:
df_DK = df_DK.rename(columns = {'pop' : 'population', 'mvu_lvu_share_pop' : 'high_edu_share',
       'w_ave_urban_index' : 'urban_share'})

In [86]:
df_DK = df_DK[['date', 'target_actual', 'ID','jobs',
       'sector_information_technology', 'sector_engineering_technology',
       'sector_management_staff', 'sector_trade_service',
       'sector_industry_craft', 'sector_sales_communication',
       'sector_teaching', 'sector_office_finance', 'sector_social_health',
       'sector_other', 'population', 'high_edu_share', 'labour_force_share',
       'urban_share', 'country']]

In [87]:
df_DK.columns

Index(['date', 'target_actual', 'ID', 'jobs', 'sector_information_technology',
       'sector_engineering_technology', 'sector_management_staff',
       'sector_trade_service', 'sector_industry_craft',
       'sector_sales_communication', 'sector_teaching',
       'sector_office_finance', 'sector_social_health', 'sector_other',
       'population', 'high_edu_share', 'labour_force_share', 'urban_share',
       'country'],
      dtype='object')

In [88]:
df_SE = df_SE.rename(columns = {'pop' : 'population', 'mvu_lvu_share' : 'high_edu_share'})

NameError: name 'df_SE' is not defined

In [167]:
df_SE = df_SE[['date', 'target_actual', 'ID', 'jobs',
       'sector_information_technology', 'sector_engineering_technology',
       'sector_management_staff', 'sector_trade_service',
       'sector_industry_craft', 'sector_sales_communication',
       'sector_teaching', 'sector_office_finance', 'sector_social_health',
       'sector_other', 'population', 'high_edu_share', 'labour_force_share',
       'urban_share', 'country']]

In [168]:
df_SE.columns

Index(['date', 'target_actual', 'ID', 'jobs', 'sector_information_technology',
       'sector_engineering_technology', 'sector_management_staff',
       'sector_trade_service', 'sector_industry_craft',
       'sector_sales_communication', 'sector_teaching',
       'sector_office_finance', 'sector_social_health', 'sector_other',
       'population', 'high_edu_share', 'labour_force_share', 'urban_share',
       'country'],
      dtype='object')

In [90]:
df_NO = df_NO.rename(columns = {'pop' : 'population', 'mvu_lvu_share' : 'high_edu_share',
       'w_ave_urban_index' : 'urban_share'})

In [91]:
df_NO = df_NO[['date', 'target_actual', 'ID', 'target_lag', 'jobs',
       'sector_information_technology', 'sector_engineering_technology',
       'sector_management_staff', 'sector_trade_service',
       'sector_industry_craft', 'sector_sales_communication',
       'sector_teaching', 'sector_office_finance', 'sector_social_health',
       'sector_other', 'population', 'high_edu_share', 'labour_force_share',
       'urban_share', 'country']]

In [92]:
df_NO.columns

Index(['date', 'target_actual', 'ID', 'target_lag', 'jobs',
       'sector_information_technology', 'sector_engineering_technology',
       'sector_management_staff', 'sector_trade_service',
       'sector_industry_craft', 'sector_sales_communication',
       'sector_teaching', 'sector_office_finance', 'sector_social_health',
       'sector_other', 'population', 'high_edu_share', 'labour_force_share',
       'urban_share', 'country'],
      dtype='object')

In [93]:
df_descriptive = pd.concat([df_DK, df_SE, df_NO])

In [98]:
df_descriptive

Unnamed: 0,date,target_actual,ID,target_lag,jobs,sector_information_technology,sector_engineering_technology,sector_management_staff,sector_trade_service,sector_industry_craft,sector_sales_communication,sector_teaching,sector_office_finance,sector_social_health,sector_other,population,high_edu_share,labour_force_share,urban_share,country
0,2008-01-01,3.92,Capital,,,,,,,,,,,,,1644835.0,18.532193,54.716735,97.177557,DK
1,2008-04-01,4.13,Capital,3.92,3.432511,0.269507,0.229372,0.342377,0.502130,0.298991,0.262332,0.404260,0.754933,0.402130,0.372197,1645825.0,18.521046,53.468625,97.179641,DK
2,2008-07-01,3.72,Capital,4.13,2.837729,0.236353,0.195413,0.294151,0.424885,0.221101,0.221674,0.356995,0.601147,0.338188,0.288073,1648990.0,19.020249,54.093718,97.218878,DK
3,2008-10-01,3.88,Capital,3.72,2.253597,0.179017,0.171703,0.253118,0.277818,0.119424,0.174460,0.297842,0.485731,0.330096,0.235612,1650985.0,18.997265,52.816955,97.216177,DK
4,2009-01-01,6.18,Capital,3.88,1.941019,0.129733,0.139442,0.204126,0.239806,0.083374,0.130947,0.264078,0.399879,0.357403,0.228398,1660042.0,18.893618,50.239693,97.229164,DK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1049,2018-04-01,5.00,Finnmark,5.13,2.497436,0.046154,0.097436,0.248718,0.315385,0.187179,0.182051,0.400000,0.151282,0.728205,0.441026,76167.0,27.000000,68.000000,77.366742,NO
1050,2018-07-01,2.56,Finnmark,5.00,1.960000,0.067500,0.102500,0.170000,0.237500,0.217500,0.122500,0.167500,0.165000,0.602500,0.385000,76167.0,27.400000,69.000000,77.366742,NO
1051,2018-10-01,5.13,Finnmark,2.56,2.338462,0.053846,0.158974,0.320513,0.233333,0.238462,0.171795,0.256410,0.187179,0.666667,0.369231,76167.0,27.400000,69.000000,77.366742,NO
1052,2019-01-01,7.50,Finnmark,5.13,2.446154,0.087179,0.138462,0.228205,0.279487,0.212821,0.143590,0.335897,0.187179,0.651282,0.469231,76167.0,27.400000,69.000000,77.721323,NO


Secure that all are the same period

In [95]:
df_descriptive = df_descriptive[df_descriptive.date >= df_DK.date.min()]
df_descriptive = df_descriptive[df_descriptive.date <= df_DK.date.max()]

Check the data

In [96]:
df_descriptive.ID.value_counts()

Akershus            46
Aust-Agder          46
Gävleborg           46
Oppland             46
Hordaland           46
Møre og Romsdal     46
Southern Denmark    46
Troms               46
Oslo                46
Kronoberg           46
Värmland            46
Gotland             46
Västernorrland      46
Buskerud            46
Jämtland            46
Sogn og Fjordane    46
Dalarna             46
Södermanland        46
Jönköping           46
Zealand             46
Kalmar              46
Blekinge            46
Telemark            46
Nordland            46
Rogaland            46
Stockholm           46
Västra Götalands    46
Uppsala             46
Norrbotten          46
Skåne               46
North Denmark       46
Halland             46
Hedmark             46
Västmanland         46
Østfold             46
Örebro              46
Västerbotten        46
Östergötland        46
Central Denmark     46
Vestfold            46
Vest-Agder          46
Capital             46
Finnmark            46
Name: ID, d

In [99]:
df_descriptive.country.value_counts()

SE    966
NO    782
DK    230
Name: country, dtype: int64

In [100]:
df_descriptive.isnull().sum()

date                             0
target_actual                    0
ID                               0
target_lag                       5
jobs                             5
sector_information_technology    5
sector_engineering_technology    5
sector_management_staff          5
sector_trade_service             5
sector_industry_craft            5
sector_sales_communication       5
sector_teaching                  5
sector_office_finance            5
sector_social_health             5
sector_other                     5
population                       0
high_edu_share                   0
labour_force_share               0
urban_share                      0
country                          0
dtype: int64

In [101]:
temp = df_descriptive[df_descriptive.date.dt.year == 2010]
temp[temp.urban_share.isnull()]

Unnamed: 0,date,target_actual,ID,target_lag,jobs,sector_information_technology,sector_engineering_technology,sector_management_staff,sector_trade_service,sector_industry_craft,sector_sales_communication,sector_teaching,sector_office_finance,sector_social_health,sector_other,population,high_edu_share,labour_force_share,urban_share,country


In [102]:
df_descriptive.date.max()

Timestamp('2019-04-01 00:00:00')

In [103]:
df_descriptive.date.min()

Timestamp('2008-01-01 00:00:00')

Mean table

In [104]:
df_descriptive.groupby('country').mean()

Unnamed: 0_level_0,target_actual,target_lag,jobs,sector_information_technology,sector_engineering_technology,sector_management_staff,sector_trade_service,sector_industry_craft,sector_sales_communication,sector_teaching,sector_office_finance,sector_social_health,sector_other,population,high_edu_share,labour_force_share,urban_share
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
DK,6.513783,6.551067,1.790177,0.100824,0.146816,0.209666,0.244152,0.192992,0.167656,0.20931,0.263178,0.304478,0.258605,1125711.0,15.083701,49.806724,85.299181
NO,3.66812,3.638146,1.77781,0.071445,0.178991,0.117238,0.215437,0.194978,0.279041,0.263173,0.14208,0.521633,0.238751,270495.3,27.150128,70.127877,75.603188
SE,7.527557,7.495859,1.630532,0.125786,0.099581,0.083549,0.235024,0.174969,0.29684,0.29875,0.226633,0.351321,0.138599,457262.0,21.985772,72.430331,80.112908


## GT

In [89]:
dfTrends = pd.read_csv('data/gt/dfTrends.csv', index_col=0, sep = ';', parse_dates=['date'])

In [90]:
dfTrends.drop(['geo', 'trends', 'C_ID'], axis=1, inplace=True)

In [91]:
df_descriptive = df_DK.copy()

In [92]:
df_descriptive = df_descriptive[df_descriptive.date >= '01-01-2007']

In [93]:
df_descriptive.shape

(235, 19)

In [94]:
df_descriptive = pd.merge(df_descriptive, dfTrends, how = 'left', left_on=['ID', 'date'], right_on=['ID', 'date'])

In [95]:
df_descriptive.shape

(235, 59)

### Reindexing GT

In [96]:
#Reindexing trends
df_descriptive = df_descriptive.apply(lambda col: reindex(col) if col.name in GT_dict() else col).copy()

In [97]:
df_descriptive.columns

Index(['date', 'target_actual', 'ID', 'jobs', 'sector_information_technology',
       'sector_engineering_technology', 'sector_management_staff',
       'sector_trade_service', 'sector_industry_craft',
       'sector_sales_communication', 'sector_teaching',
       'sector_office_finance', 'sector_social_health', 'sector_other',
       'population', 'high_edu_share', 'labour_force_share', 'urban_share',
       'country', 'GT_DK_0', 'GT_DK_1', 'GT_DK_2', 'GT_DK_3', 'GT_DK_4',
       'GT_DK_5', 'GT_DK_6', 'GT_DK_7', 'GT_DK_8', 'GT_DK_9', 'GT_DK_10',
       'GT_DK_11', 'GT_DK_12', 'GT_DK_13', 'GT_DK_14', 'GT_DK_15', 'GT_DK_16',
       'GT_DK_17', 'GT_DK_18', 'GT_DK_19', 'GT_DK_20', 'GT_DK_21', 'GT_NO_0',
       'GT_NO_1', 'GT_NO_2', 'GT_NO_6', 'GT_NO_7', 'GT_NO_8', 'GT_NO_10',
       'GT_NO_11', 'GT_SE_0', 'GT_SE_1', 'GT_SE_2', 'GT_SE_5', 'GT_SE_6',
       'GT_SE_7', 'GT_SE_8', 'GT_SE_9', 'GT_SE_10', 'GT_SE_11'],
      dtype='object')

In [98]:
df_descriptive = df_descriptive.drop(['GT_NO_0', 'GT_NO_1', 'GT_NO_2', 'GT_NO_6', 'GT_NO_7', 'GT_NO_8', 'GT_NO_10', 'GT_NO_11', 
                                'GT_SE_0', 'GT_SE_1', 'GT_SE_2', 'GT_SE_5', 'GT_SE_6', 'GT_SE_7', 'GT_SE_8', 'GT_SE_9', 
                                'GT_SE_10', 'GT_SE_11'], axis = 1)

## Exporting

In [99]:
df_descriptive.to_csv('data/descriptive/df_DK_descriptive.csv', index = False)