In [1]:
#Import libraries and processing the downloaded files to make them ready for DB

import pandas as pd
import glob
import numpy as np
import json
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import time
import numpy
from multiprocessing import Pool 
sns.set(style="whitegrid")


path = "/home/inlab4/Documents/Dan_datasets/AF/feb/"
file_dump=[]
#print(type(file_dump))
#the file is saved in json format
for f in glob.iglob(path+"*.json"):
    with open(f, "r") as file:
        data = json.load(file)
        date = datetime.date.fromisoformat(f.split('/')[-1].replace(".json", "")[0:10])
        number_ads = len(data)
        summary = {"date": date, "number_ads": number_ads, "file": f, "data": data}
        file_dump.append(summary)

file_dump = sorted(file_dump, key=lambda i: i['date'])
#if scraping date presented in descending order, use range(len(file_dump), 0)
for item in file_dump:
    print("Scraping date: %s    [downloaded_number_of_ads:%d]"%(item['date'], item['number_ads']))
    print("-----------------------------------------")





Scraping date: 2020-02-07    [downloaded_number_of_ads:39053]
-----------------------------------------
Scraping date: 2020-02-11    [downloaded_number_of_ads:377412]
-----------------------------------------
Scraping date: 2020-02-12    [downloaded_number_of_ads:380087]
-----------------------------------------
Scraping date: 2020-02-13    [downloaded_number_of_ads:382552]
-----------------------------------------
Scraping date: 2020-02-14    [downloaded_number_of_ads:385766]
-----------------------------------------
Scraping date: 2020-02-15    [downloaded_number_of_ads:386129]
-----------------------------------------
Scraping date: 2020-02-16    [downloaded_number_of_ads:386901]
-----------------------------------------
Scraping date: 2020-02-17    [downloaded_number_of_ads:389594]
-----------------------------------------
Scraping date: 2020-02-18    [downloaded_number_of_ads:392733]
-----------------------------------------
Scraping date: 2020-02-19    [downloaded_number_of_ads:3

In [12]:
#functions to read in json keys and values into df
#print out the json keys
import numpy
def print_keys(data,ifprint=False):
    #data is a list of dictionaries, some key contain another dictionary
    #i is the index of the dictionary
    simple_keys = []
    complex_keys = []
    for k in (data[0].keys()):
        if isinstance(data[0][k], dict):
            complex_keys.append(k)
            if ifprint:
                print("---%s"%"---".join(data[0][k].keys()))
        else:
            simple_keys.append(k)
            if ifprint:
                print(k)
                
    result = {'simple_keys': simple_keys, "complex_keys": complex_keys}
    return result

            
def get_keyvalue(data, index, keyname):
    #return the value of the keyname at index in data
    #return type is a string
    if isinstance(data[index][keyname], dict):
        return None
    else:
        return data[index][keyname]
    
def get_keyvalues(data, keyname):
    #return a list values in data of the keyname
    #only the simple keys
    result = []
    for item in data:
        try: 
            if item.get(keyname) != None:
                result.append(item[keyname])
            else:
                result.append(np.nan)
        except IndexError as error:
            #the key is not available
            result.append(np.nan)
    
    return result

def get_commonstructure_type(data, ckeyname):
    #with the given compley keyname
    #return values in df, with each subkey as a column
    #some complex keys share the same structure, i.e. the same subkeys
    #create lists of subkeys
    concept_id = []
    label= []
    legacy_ams_taxonomy_id = []
    annons_id = get_keyvalues(data, 'id') #use this as key for matching back to other keys

    for i in range(len(data)):
        #access this complex key as dictionary???
        if data[i] is None:
            legacy_ams_taxonomy_id.append(np.nan)
            label.append(np.nan)
            concept_id.append(np.nan)
        else:
            node= data[i].get(ckeyname)
            if node != None:
                if node.get('concept_id') != None:
                    concept_id.append(node.get('concept_id'))
                else:
                    concept_id.append(np.nan)
                if  node.get('label') != None:
                    label.append(node.get('label'))
                else:
                    label.append(np.nan)
                if node.get('legacy_ams_taxonomy_id') != None:
                    legacy_ams_taxonomy_id.append(node.get('legacy_ams_taxonomy_id'))
                else:
                    legacy_ams_taxonomy_id.append(np.nan)

            else:
                legacy_ams_taxonomy_id.append(np.nan)
                label.append(np.nan)
                concept_id.append(np.nan)

    result = pd.DataFrame({'%s_concept_id'%ckeyname: concept_id,
                          '%s_label'%ckeyname: label,
                          '%s_legacy_ams_taxonomy_id'%ckeyname: legacy_ams_taxonomy_id,
                          'ads_id': annons_id})
    return result


def get_ads_description(data, ckeyname='description'):
    #with the given compley keyname
    #return values in df, with each subkey as a column
    #create lists of subkeys
    text = []
    company_info= []
    needs = []
    requirements = []
    conditions = []
    annons_id = get_keyvalues(data, 'id') #use ad key to match back to other keys

    #get subkeys
    subkeys = data[0].get(ckeyname).keys()
    for i in range(len(data)):
        if data[i] is None:
            text.append(np.nan)
            company_info.append(np.nan)
            requirements.append(np.nan)
            needs.append(np.nan)
            conditions.append(np.nan)
        else:   
            description = data[i].get(ckeyname)
            if description != None:
            
                if description.get('text') != None:
                    text.append(description.get('text'))
                else:
                    text.append(np.nan)
                if description.get('company_info') != None:
                    company_info.append(description.get('company_info'))
                else:
                    company_info.append(np.nan)
                if description.get('needs') != None:
                    needs.append(description.get('needs'))
                else:
                    needs.append(np.nan)
                if description.get('requirements') != None:
                    requirements.append(description.get('requirements'))
                else:
                    requirements.append(np.nan)
                if description.get('conditions') != None:
                    conditions.append(description.get('conditions'))
                else:
                    conditions.append(np.nan)
            else:
                text.append(np.nan)
                company_info.append(np.nan)
                requirements.append(np.nan)
                needs.append(np.nan)
                conditions.append(np.nan)
                
    result = pd.DataFrame({'description_text': text,
                          'description_company_info': company_info,
                          'description_needs': needs,
                          'description_requirements': requirements,
                          'description_conditions': conditions,
                          'ads_id': annons_id})
    return result


def get_employer_values(data, ckeyname='employer'):
    #with the given compley keyname
    #return values in df, with each subkey as a column
    #create lists of subkeys
    #create lists of subkeys
    phone = []
    email = []
    url = []
    orgnr = []
    name = []
    workplace = [] #similar to company name
    annons_id = get_keyvalues(data, 'id') #use ad key to match back to other keys

    #get subkeys
    subkeys = data[0].get(ckeyname).keys()
    for i in range(len(data)):
        if data[i] is None:
            phone.append(np.nan)
            email.append(np.nan)
            url.append(np.nan)
            orgnr.append(np.nan)
            name.append(np.nan)
            workplace.append(np.nan)
        else:   
            employer = data[i].get(ckeyname)
            if employer != None:
            
                if employer.get('phone_number') != None:
                    phone.append(employer.get('phone_number'))
                else:
                    phone.append(np.nan)
                if employer.get('email') != None:
                    email.append(employer.get('email'))
                else:
                    email.append(np.nan)
                if employer.get('url') != None:
                    url.append(employer.get('url'))
                else:
                    url.append(np.nan)
                if employer.get('organization_number') != None:
                    orgnr.append(employer.get('organization_number'))
                else:
                    orgnr.append(np.nan)
                if employer.get('name') != None:
                    name.append(employer.get('name'))
                else:
                    name.append(np.nan)
                if employer.get('workplace') != None:
                    workplace.append(employer.get('workplace'))
                else:
                    workplace.append(np.nan)
            else:
                phone.append(np.nan)
                email.append(np.nan)
                url.append(np.nan)
                orgnr.append(np.nan)
                name.append(np.nan)
                workplace.append(np.nan)
    result = pd.DataFrame({'employer_phone_number': phone,
                          'employer_email': email,
                          'employer_url': url,
                          'employer_organization_number': orgnr,
                          'employer_name': name,
                          'employer_workplace': workplace,
                          'ads_id': annons_id})
    return result

def get_work_addresses(jsondata, ckeyname="workplace_address"):
    #jsondata is a list of dictionaries
    
    municipality_code = []
    municipality = []
    region_code = []
    region = []
    country_code = []
    country = []
    street_address = []
    postcode = []
    city = []
    coordinates = []
    annons_id = get_keyvalues(jsondata, 'id')
    
    subkeys = jsondata[0].get(ckeyname).keys()
    for i in range(len(jsondata)):
        if jsondata[i] is None:
            municipality_code.append(np.nan)
            municipality.append(np.nan)
            region_code.append(np.nan)
            region.append(np.nan)
            country_code.append(np.nan)
            country.append(np.nan)
            street_address.append(np.nan)
            postcode.append(np.nan)
            city.append(np.nan)
            coordinates.append(np.nan)
        else:
            address = jsondata[i].get(ckeyname)
            if address != None:
                subkey_values = []
                for j in subkeys:
                    if address.get(j) != None:
                        subkey_values.append(address.get(j))
                    else:
                        subkey_values.append(np.nan)
            
                municipality_code.append(subkey_values[0])
                municipality.append(subkey_values[1])
                region_code.append(subkey_values[2])
                region.append(subkey_values[3])
                country_code.append(subkey_values[4])
                country.append(subkey_values[5])
                street_address.append(subkey_values[6])
                postcode.append(subkey_values[7])
                city.append(subkey_values[8])
                coordinates.append(subkey_values[9])    
            else:
                municipality_code.append(np.nan)
                municipality.append(np.nan)
                region_code.append(np.nan)
                region.append(np.nan)
                country_code.append(np.nan)
                country.append(np.nan)
                street_address.append(np.nan)
                postcode.append(np.nan)
                city.append(np.nan)
                coordinates.append(np.nan)
    result = pd.DataFrame({'address_municipality_code': municipality_code,
                          'address_municipality': municipality,
                          'address_region_code': region_code,
                          'address_region': region,
                          'address_country_code': country_code,
                          'address_country': country,
                          'address_street_address': street_address,
                          'address_postcode': postcode,
                          'address_city': city,
                          'address_coordinates': coordinates,
                           'ads_id': annons_id})
    return result     

def divide_json(json):
    #remove items with removed==True the json and return the cleaned one
    #re
    loopnr = len(json)-1
    result = []
    for i in range(loopnr):
        if len(json[i].keys()) != 3:
            result.append(json[i])
    return result
                
def convert_json2df(jsondata):
    #api changed json data structure
    simple_keys = ['id', 'external_id','webpage_url','logo_url','headline','application_deadline','number_of_vacancies','salary_description', 'access','experience_required','access_to_own_car','driving_license_required','driving_license','publication_date','last_publication_date','removed','removed_date','source_type','timestamp']
    #keys in the dataframe
    annons_id_removed = []
    annons_id = []
    external_id = []
    webpage_url = []
    logo_url= []
    headline = []
    application_deadline = []
    number_of_vacancies = []
    salary_description = []
    access = []
    experience_required = []
    access_to_own_car = []
    driving_license_required = []
    driving_license = []
    publication_date = []
    last_publication_date = []
    removed = []
    removed_date = []
    removed_removed = []
    removed_date_removed = []
    source_type = []
    timestamp = []

    annons_id = get_keyvalues(jsondata, simple_keys[0])
    #print(simple_keys[1])
    external_id = get_keyvalues(jsondata, simple_keys[1])
    #print(simple_keys[2])
    webpage_url = get_keyvalues(jsondata, simple_keys[2])
    #print(simple_keys[3])
    logo_url = get_keyvalues(jsondata, simple_keys[3])
    #print(simple_keys[4])
    headline = get_keyvalues(jsondata, simple_keys[4])
    #print(simple_keys[5])
    application_deadline = get_keyvalues(jsondata, simple_keys[5])
    #print(simple_keys[6])
    number_of_vacancies = get_keyvalues(jsondata, simple_keys[6])
    #print(simple_keys[7])
    salary_description = get_keyvalues(jsondata, simple_keys[7])
    #print(simple_keys[8])
    access = get_keyvalues(jsondata, simple_keys[8])
    #print(simple_keys[9])
    experience_required=get_keyvalues(jsondata, simple_keys[9])
    #print(print(simple_keys[10]))
    access_to_own_car=get_keyvalues(jsondata, simple_keys[10])
    #print(simple_keys[11])
    driving_license_required=get_keyvalues(jsondata, simple_keys[11])
    #print(simple_keys[12])
    driving_license=get_keyvalues(jsondata, simple_keys[12])
    #print(simple_keys[13])
    publication_date=get_keyvalues(jsondata, simple_keys[13])
    #print(simple_keys[14])
    last_publication_date=get_keyvalues(jsondata, simple_keys[14])
    #print(simple_keys[15])
    removed=get_keyvalues(jsondata, simple_keys[15])
    #print(simple_keys[16])
    removed_date=get_keyvalues(jsondata, simple_keys[16])
    #print(simple_keys[17])
    source_type=get_keyvalues(jsondata, simple_keys[17])
    #print(simple_keys[18])
    timestamp=get_keyvalues(jsondata, simple_keys[18])
            #convert data into df with a flat structure

    df = pd.DataFrame()
    df['ads_id'] = annons_id  #0
    df['external_id'] = external_id #1
    df['webpage_url'] = webpage_url #2
    df['logo_url'] = logo_url #3
    df['headline'] = headline #4
    df['application_deadline'] = application_deadline #5
    df['number_of_vacancies'] = number_of_vacancies #6
    df['number_of_vacancies'].astype(int, errors='ignore')
    df['salary_description'] = salary_description #7
    df['access'] = access #8
    df['experience_required'] = experience_required #9
    df['access_to_own_car'] =access_to_own_car #10
    df['driving_license_required'] = driving_license_required #11
    df['driving_license_required'].astype(numpy.bool)
    df['driving_license'] = driving_license  #12
    df['publication_date'] = publication_date  #13
    df['last_publication_date'] = last_publication_date  #14
    df['removed'] = removed  #15
    df['removed'].astype(numpy.bool)
    df['removed_date'] = removed_date  #16
    df['source_type'] = source_type  #17
    df['timestamp'] = timestamp  #18
    return df

def str2datetime(s, format="%Y-%m-%dT%H:%M:%S"):
    return datetime.datetime.strptime(s, format)

def add_dates2df(df):
    #add year, month, week and day on according to publication date
    #handle NaN in publication date
    df['year']=df['publication_date'].apply(lambda x: np.nan if isinstance(x, pd._libs.tslibs.nattype.NaTType) else (str2datetime(x).strftime("%Y"))) #4digits year
    df['month']=df['publication_date'].apply(lambda x: np.nan if isinstance(x, pd._libs.tslibs.nattype.NaTType) else (str2datetime(x).strftime("%m")))
    df['weekday']=df['publication_date'].apply(lambda x: np.nan if isinstance(x, pd._libs.tslibs.nattype.NaTType) else str(str2datetime(x).isocalendar()[2]))
    df['day']=df['publication_date'].apply(lambda x:np.nan if isinstance(x, pd._libs.tslibs.nattype.NaTType) else (str2datetime(x).strftime('%d')))
    return df


def parallelize_dataframe(df, func, n_cores = 8):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

def read_one_day_dump(json_dump):
    #read one days data and add dates to the dataframe
    #return the df
    df = convert_json2df(json_dump)
    df.drop_duplicates(subset = ['ads_id'], inplace=True)
    df['removed'] = df['removed'].astype(bool)
    df = df.loc[df.removed==False]
    #add dates to the df
    df = parallelize_dataframe(df, add_dates2df)
    return df

def add_more_data_dump(df1, df2):
    #combine two days data_dumps and remove ads with "True"
    #then drop duplicates
    totaldf = pd.concat([df1, df2])
    totaldf = totaldf.loc[totaldf['removed']==False]
    totaldf.drop_duplicates(subset='ads_id', inplace = True)
    return totaldf

#print("After drop duplicates by ads_id: (%d, %d)"%online_df.shape)
print_keys(file_dump[20].get('data'), ifprint=True)



IndexError: list index out of range

In [None]:
#create aggregated dataset for employer, address, ads_descriptions and save
#them into files
total_df = pd.DataFrame()
for i in range(len(file_dump)):
    #keep only json_records have all keys, delete those have removed
    clean_json = divide_json(file_dump[i].get('data'))
    #read the simple keys, only the online ads
    df = read_one_day_dump(clean_json)
    #read in the complex keys
    #address_df = get_work_addresses(clean_json)
    #description_df = get_ads_description(clean_json)
    employer_df = get_employer_values(clean_json)
    oneday_df = pd.merge(df, employer_df, how="left", on='ads_id')##
    #aggregate day by day data
    total_df = add_more_data_dump(total_df, oneday_df)
    print(f"add {i} day: {df.shape[0]} rows")
    

print(f"total rows: {total_df.shape}")
file_name = "Employer_df_2020-03-19.csv"
print(total_df.shape) #106914, 33
total_df.to_csv("/home/inlab4/Documents/Dan_datasets/AF_results/%s"%file_name)
print(f"saved {file_name}")

In [7]:
#analyse data and derive the cfarNr
#read in data
import pandas as pd
import numpy as np

fdb_dtypes = {'AEAnt': np.int32, 'COAdress': str, 'GatuNr': str, 'Ng1': str, 'PeOrgNr': str, 'JE_orgnr': str, 'CfarNr': str, 'BGatuNr': str, 'BpostNr': str, 'BNg1': str}
fdb = pd.read_csv(r"/media/inlab4/My Passport/Dan/fdb2018/JeAe2018.csv", sep=";", dtype=fdb_dtypes)
print(fdb.head())

address_dtypes={'ads_id': str, 'address_municipality_code': str, 'address_region_code': str, 'address_country_code': str, 'address_postcode': str }
address = pd.read_csv("/home/inlab4/Documents/Dan_datasets/AF_results/Address_df_2020-03-19.csv", dtype=address_dtypes)
print(address.head())
employer_dtypes={'employer_organization_number': str, 'ads_id': str}
employer = pd.read_csv("/home/inlab4/Documents/Dan_datasets/AF_results/Employer_df_2020-03-19.csv", dtype=employer_dtypes)
employer = employer[['ads_id', 'employer_url', 'employer_organization_number', 'employer_name']]
employer_address = address.merge(employer, how="inner", left_on="ads_id", right_on="ads_id")
print(employer_address.head())
employer_address[employer_address['employer_organization_number'].isin('')].value_counts()

  interactivity=interactivity, compiler=compiler, result=result)


   AEAnt  Sektor                          COAdress              Gata  \
0      0  111000             ADVOKATFIRMAN NOVA AB               BOX   
1      0  111000  LINDSKOG MALMSTRÖM ADOKATBYRÅ KB               BOX   
2      0  111000                               NaN     BETTORPSGATAN   
3      0  111000        AMBER ADVOKATER VÄRNAMO HB               BOX   
4      1  111000                BÄCKEGÅRDS LIST AB  BETARP BÄCKEGÅRD   

       GatuNr    Ng1                                             Namn  \
0  55996       00000                            Legislatio Juridik AB   
1  27707       00000                       Hårdvallsgatan Ventures AB   
2  24 C        64202  Restaurangutrustnings Intressenter i Sverige AB   
3  744         00000                          Tattoo Nation Sweden AB   
4              00000                          Betarp Wood Products AB   

       JE_orgnr    CfarNr       PeOrgNr  Ben             BGata     BGatuNr  \
0  165568361462       NaN           NaN  NaN      

  interactivity=interactivity, compiler=compiler, result=result)


   Unnamed: 0    ads_id                        external_id  \
0           0  23840137               46-232100-0040-13796   
1           1  23840142  46-556648-2781-0000007414DC0DDCA1   
2           2  23802672                46-556662-0851-9059   
3           3  23840140  46-556648-2781-0000007412F77DA615   
4           4  23839717                                NaN   

                                         webpage_url  \
0  https://www.arbetsformedlingen.se/For-arbetsso...   
1  https://www.arbetsformedlingen.se/For-arbetsso...   
2  https://www.arbetsformedlingen.se/For-arbetsso...   
3  https://www.arbetsformedlingen.se/For-arbetsso...   
4  https://www.arbetsformedlingen.se/For-arbetsso...   

                                            logo_url  \
0  https://www.arbetsformedlingen.se/rest/arbetsg...   
1  https://www.arbetsformedlingen.se/rest/arbetsg...   
2                                                NaN   
3  https://www.arbetsformedlingen.se/rest/arbetsg...   
4         

TypeError: only list-like objects are allowed to be passed to isin(), you passed a [str]

In [None]:
#print(address.info()), work with AEAnt<=1
#create files public_sector/Private_sector for the AEAnt<=1
#create ads that AEAnt>1 and need more work on cfarnr
fdb['JE_orgnr'] = fdb['PeOrgNr'].apply(lambda x: str(x)[2:12])
print(fdb.shape)
#fdb_1 = fdb.loc[fdb.AEAnt<=1]
fdb_2 = fdb.loc[fdb.AEAnt>1] #97468
orgnrs = fdb_2.JE_orgnr.values #97468
print(fdb_1.shape)
need_cfars = employer_address.loc[employer_address.employer_organization_number.isin(orgnrs)]
print(need_cfars.shape) #60829
#print(fdb.shape)#(2141285, 16)
#print(fdb_1.head())#(1234001, 16)
#print(pdf_2.shape)
#cfar_1 = fdb_1.merge(employer_address, how="inner", left_on='JE_orgnr', right_on='employer_organization_number')
#print(cfar_1.shape) #52822
#print(cfar_1.columns)
#p_sect = ['131110', '131120', '131130', '131311', '131312', '131313', '131321', '131322', '131323', '131400']
#public_sector = cfar_1.loc[cfar_1.Sektor.isin(p_sect)]
#print(public_sector.shape)
print(employer_address.shape)
#public_sector.to_csv("/media/inlab4/My Passport/Dan/AF/public_sector_2020-03-19.csv")
#print("finished saving" )
#private_sector = cfar_1.loc[~cfar_1.Sektor.isin(p_sect)]
#prv_sect = ['111000', '112000', '113000', '114000', '121000', '122100', '122200','122400', '122500', '125200','125300', '125900', '126100', '127000', '128100', '128200', '128300', '129100', '129300', '129400', '141000', '142000']
#private_sector = cfar_1.loc[cfar_1.Sektor.isin(prv_sect)]
#print(private_sector.head())
#print(private_sector.shape) #51528
#private_sector.to_csv("/media/inlab4/My Passport/Dan/AF/private_sector:_2020-03-19.csv")

In [3]:
#function that read one day json file at a time and create a time series for animation vacancies
from matplotlib.animation import FuncAnimation
import matplotlib.pyplot as plt
from matplotlib import animation
from celluloid import Camera
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
%matplotlib qt


fig = plt.figure()
plt.style.use('seaborn-poster')
camera = Camera(fig)
loops = len(file_dump)
#summary is the df of the inread jsonfiles
for i in range(loops):
    if i == 0:
        total = read_one_day_dump(file_dump[i].get('data'))
        print(f"-----------{file_dump[i].get('date')}----------")
        group_sum = total.groupby(['year', 'month'])['number_of_vacancies'].sum()
        annonserdf= pd.DataFrame({'date':group_sum.index.tolist(), 'sum': group_sum.values})
        annonserdf['date']=annonserdf['date'].apply(lambda x: datetime.datetime.strptime("-".join([x[0], x[1]]), "%Y-%m"))
        annonserdf = annonserdf.sort_values(['date'], ascending=True)
        print(annonserdf)
        #days.append(annonserdf['date'].values.tolist())
        #sums.append(annonserdf['sum'].values.tolist())
        df_lists.append(annonserdf)
        
    else:
        #print(f"working on {i} file_dump")
        newdf = pd.DataFrame()
        newdf = read_one_day_dump(file_dump[i].get('data'))
        total = add_more_data_dump(total, newdf)
        print(f"-----------{file_dump[i].get('date')}----------")
        group_sum = total.groupby(['year', 'month'])['number_of_vacancies'].sum()
        annonserdf= pd.DataFrame({'date':group_sum.index.tolist(), 'sum': group_sum.values})
        annonserdf['date']=annonserdf['date'].apply(lambda x: datetime.datetime.strptime("-".join([x[0], x[1]]), "%Y-%m"))
        annonserdf = annonserdf.sort_values(['date'], ascending=True)
        #days.append(annonserdf['date'].values.tolist())
        #sums.append(annonserdf['sum'].values.tolist())
        print(annonserdf)
        df_lists.append(annonserdf)
    #print(annonserdf)
    #f is a list of plt, reuse the same figure
    f = plt.plot(annonserdf['date'], annonserdf['sum'], marker="*")
    plt.legend(f, [file_dump[i].get('date')])
    plt.title("Platsbanken vacancies in month")
    plt.xlabel("Time(month)")
    plt.ylabel("Vacancies")
    plt.xticks(rotation=45)
    camera.snap()

#interval define how fast the animation shows
ani = camera.animate(interval=1000)
#gif file is a animation
#ani.save("/home/inlab4/Documents/AF/feb.gif", writer='imagemagick')
plt.show()


-----------2020-02-07----------
        date    sum
0 2019-08-01    1.0
1 2019-10-01    3.0
2 2019-11-01    4.0
3 2019-12-01   26.0
4 2020-01-01  498.0
-----------2020-02-11----------
        date     sum
0 2019-05-01     1.0
1 2019-08-01    77.0
2 2019-09-01   493.0
3 2019-10-01  1175.0
4 2019-11-01  5128.0
-----------2020-02-12----------
        date     sum
0 2019-05-01     1.0
1 2019-08-01    77.0
2 2019-09-01   493.0
3 2019-10-01  1175.0
4 2019-11-01  5128.0
-----------2020-02-13----------
        date     sum
0 2019-05-01     1.0
1 2019-08-01    77.0
2 2019-09-01   493.0
3 2019-10-01  1175.0
4 2019-11-01  5128.0
-----------2020-02-14----------
        date     sum
0 2019-05-01     1.0
1 2019-08-01    77.0
2 2019-09-01   493.0
3 2019-10-01  1175.0
4 2019-11-01  5128.0
-----------2020-02-15----------
        date     sum
0 2019-05-01     1.0
1 2019-08-01    77.0
2 2019-09-01   493.0
3 2019-10-01  1175.0
4 2019-11-01  5128.0
-----------2020-02-16----------
        date     sum
0 201

In [10]:
from collections import defaultdict
vacancies_bydays = defaultdict(list) #total of each month
dates = []
days = []
differences = defaultdict(list)
for df in df_lists:
    for item in df.itertuples():
        if item.date not in dates:
            dates.append(item.date)
            vacancies_bydays.setdefault(item.date, []).append(item.sum)
            
        else:
            vacancies_bydays[item.date].append(item.sum)
            
print(sorted(dates))
print(vacancies)
for d in range(7, 29):
    days.append(datetime.date(2020, 2, d))


[Timestamp('2019-05-01 00:00:00'), Timestamp('2019-08-01 00:00:00'), Timestamp('2019-09-01 00:00:00'), Timestamp('2019-10-01 00:00:00'), Timestamp('2019-11-01 00:00:00'), Timestamp('2019-12-01 00:00:00'), Timestamp('2020-01-01 00:00:00'), Timestamp('2020-02-01 00:00:00')]
defaultdict(<class 'list'>, {Timestamp('2019-08-01 00:00:00'): [1.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0, 77.0], Timestamp('2019-10-01 00:00:00'): [3.0, 1175.0, 1175.0, 1175.0, 1175.0, 1175.0, 1175.0, 1175.0, 1175.0, 1175.0, 1175.0, 1175.0, 1175.0, 1175.0, 1175.0, 1175.0, 1175.0, 1175.0, 1175.0, 1175.0], Timestamp('2019-11-01 00:00:00'): [4.0, 5128.0, 5128.0, 5128.0, 5128.0, 5128.0, 5128.0, 5128.0, 5128.0, 5128.0, 5128.0, 5128.0, 5128.0, 5128.0, 5128.0, 5128.0, 5128.0, 5128.0, 5128.0, 5128.0], Timestamp('2019-12-01 00:00:00'): [26.0, 19883.0, 19883.0, 19883.0, 19883.0, 19883.0, 19883.0, 19883.0, 19883.0, 19883.0, 19883.0, 19883.0, 19883.0, 19883.0,

In [15]:
print_keys(file_dump[10].get('data'), ifprint=True)
print_keys(file_dump[11].get('data'), ifprint=True)

id
external_id
webpage_url
logo_url
headline
application_deadline
number_of_vacancies
---text---company_information---needs---requirements---conditions
---concept_id---label---legacy_ams_taxonomy_id
---concept_id---label---legacy_ams_taxonomy_id
salary_description
---concept_id---label---legacy_ams_taxonomy_id
---concept_id---label---legacy_ams_taxonomy_id
---min---max
access
---phone_number---email---url---organization_number---name---workplace
---information---reference---email---via_af---url---other
experience_required
access_to_own_car
driving_license_required
driving_license
---concept_id---label---legacy_ams_taxonomy_id
---concept_id---label---legacy_ams_taxonomy_id
---concept_id---label---legacy_ams_taxonomy_id
---municipality_code---municipality---region_code---region---country_code---country---street_address---postcode---city---coordinates
---skills---languages---work_experiences
---skills---languages---work_experiences
publication_date
last_publication_date
removed
removed_da

{'simple_keys': ['id',
  'external_id',
  'webpage_url',
  'logo_url',
  'headline',
  'application_deadline',
  'number_of_vacancies',
  'salary_description',
  'access',
  'experience_required',
  'access_to_own_car',
  'driving_license_required',
  'driving_license',
  'publication_date',
  'last_publication_date',
  'removed',
  'removed_date',
  'source_type',
  'timestamp'],
 'complex_keys': ['description',
  'employment_type',
  'salary_type',
  'duration',
  'working_hours_type',
  'scope_of_work',
  'employer',
  'application_details',
  'occupation',
  'occupation_group',
  'occupation_field',
  'workplace_address',
  'must_have',
  'nice_to_have']}

In [None]:
#Use a reference_month for saving ads that remain online after the publication date passed the same month
#i.e. comparing month(publication_date) and month(last_publication_date)
#the result will be a dataframe move intoto the DB, including orgnr, ads_id, publication_date, last_publication_date
#reference_month and vacancies

#print(df.shape)
#print(df.head())
#print(df_removed.shape)
#print(df_removed.head())

def calculate_vacancies(df):
    #check the publication date, last_publication_date and generate newdf
    #with reference_month
    #delete removed ads
    errors = []
    #keep online data
    df = df.loc[df['removed'] == False]
    df.drop_duplicates(subset='ads_id', inplace = True)
    new_df = pd.DataFrame()
    new_ads_id = []
    new_number_of_vacancies = []
    new_publication_date=[]
    new_last_publication_date=[]
    new_timestamp=[]
    reference_month = []
    added_rows = 0
    for row in df.itertuples():
        try: #should be 100% with data, but in case
            publication_date = str2datetime(str(row.publication_date))
            publication_year = publication_date.strftime("%Y")
            last_publication_date = str2datetime(str(row.last_publication_date))
            last_publication_year = last_publication_date.strftime("%Y")
            #If the two dates cover different year, throw the ads
            if publication_year == last_publication_year:
                publication_month =  publication_date.strftime("%m")
                last_month = last_publication_date.strftime("%m")
                if int(publication_month)<int(last_month):
                    duplicates = (int(last_month)-int(publication_month)) +1
                    added_rows = added_rows + (duplicates - 1)
                    for i in range(duplicates):
                        new_ads_id.append(row.ads_id)
                        new_publication_date.append(publication_date)
                        new_last_publication_date.append(last_publication_date)
                        new_timestamp.append(row.timestamp)
                        new_number_of_vacancies.append(row.number_of_vacancies)
                        the_reference_month = int(publication_month) + i
                        reference_month.append(str(the_reference_month))
                
                elif publication_month == last_month:
                    #added the original row
                    new_ads_id.append(row.ads_id)
                    new_publication_date.append(publication_date)
                    new_last_publication_date.append(last_publication_date)
                    new_timestamp.append(row.timestamp)
                    new_number_of_vacancies.append(row.number_of_vacancies)
                    reference_month.append(int(publication_month))
                else:
                    error = publication_date - last_publication_date
                    print(f"ads_id: {row.ads_id} contains {error}")
        except:
            #record the error row
            errors.append(row)

    new_df['ads_id'] = new_ads_id
    new_df['number_of_vacancies'] = new_number_of_vacancies
    new_df['publication_date'] = new_publication_date
    new_df['last_publication_date'] = new_last_publication_date
    new_df['timestamp'] = new_timestamp
    new_df['reference_month'] = reference_month
    print(f"added rows {added_rows}")
    print(df.shape)
    print(new_df.shape)
    return new_df

totaldf = pd.DataFrame()
processed_file = 0
for i in range(len(file_dump)):
    (df, df_removed) = convert_json2df(file_dump[i].get('data'))
    print(f"-----------start file_dump{i}--------------")
    new_df = calculate_vacancies(df)
    totaldf = pd.concat([totaldf, new_df])
    totaldf.drop_duplicates(subset='ads_id', inplace=True)
    processed_file += 1
print(f"finish the data total is {totaldf.shape[0]}")
totaldf.to_csv("/home/inlab4/Documents/Dan_datasets/AF_results/df2020-03-13.csv")
print(f"total processed file {processed_file}")

In [None]:
#column publication_date is datetime now
%matplotlib qt
totaldf['reference_year']= totaldf['publication_date'].apply(lambda x: x.strftime("%Y"))
totaldf['time'] = totaldf[['reference_year', 'reference_month']].apply(lambda x: datetime.datetime.strptime("%Y") )
groups = totaldf.groupby(['time'])['number_of_vacancies'].sum()
groups.plot()
plt.show()

In [None]:
#read in two jsonfiles and compare the increasing number
file_path1 = path+"2020-02-11T08%3A45%3A16.txt"
file_path2 = path+"2020-02-11.txt"
def find_increased(f1, f2):
    # with the given loaded data jsonfile1 and jsonfile2 of two dates, comparing ads_id
    #jsonfile2 loading date is late than jsonfile1
    #return only the increaded
    
    with open(f1, 'r') as f:
        data1 = json.load(f)
    with open(f2, 'r') as f:
        data2 = json.load(f)
    df1 = convert_json2df(data1)
    df2 = convert_json2df(data2)
    online_df1 = df1.loc[df1.removed==False]
    #use total data on date 2, since some may remove 
    merged_ads_outer = pd.merge(online_df1, df2, how='outer', on='ads_id')
    merged_ads_inner = pd.merge(online_df1, df2, how='inner', on='ads_id')
    increased = merged_ads_outer[merged_ads_outer.removed_y==False].shape[0]-merged_ads_inner.shape[0]
    #how about increaed ads
    return increased

def get_increased_ads(f, date):
    #by comparing giving a date, return ads that have bigger publication date
    #f is the file downloaded on one day
    with open(f, "r") as f:
        data = json.load(f)
    df = convert_json2df(data)
    result = df.loc[df.publication_date>=date]
    return result

#print(find_increased(file_path1, file_path2))
#df = get_increased_ads(file_path2, datetime.date.fromisoformat('2020-02-10'))
#df['publication_date'].describe()

In [None]:
#show the number in bar or with map in the background???
#first groupby municipality and then on nuts, finally draw in bars
%matplotlib qt
kommun_lan = pd.read_csv("/home/inlab4/Documents/Dan_datasets/kommun_lan.csv", sep=";", dtype={"Code":str, "nuts2":str})
#print(kommun_lan.head())
def read_daily_data_address(fd):
    #read in the data day by day with simple variabels + address
    #fd is a dictionary structure created in the first cell, when the data were read in
    total = read_one_day_dump(file_dump[0].get('data'))
    #print(total.shape)
    #we cannot see the column removed which is a simple variable, merge with total handle this
    address = get_work_addresses(file_dump[0].get('data'))
    #print(address.shape)
    total = pd.merge(total, address, on='ads_id')
    #print(total.describe())
    return total

totalad = pd.DataFrame()
for i in range(len(file_dump)):
    if i == 0:
        totalad = read_daily_data_address(file_dump[i].get('data'))
    else:
        daydf = read_daily_data_address(file_dump[i].get('data'))
        #add the new data
        totalad = pd.concat([totalad, daydf])
        totalad.drop_duplicates(subset="ads_id", inplace=True)

#check first null value in address_mulnicipality_code and 
print(totalad.info())
print(totalad.address_coordinates.value_counts())
#when all data are read in, use groupby and generate the region vacancies    
#group_sum = totalad.groupby(["address_municipality_code", "year", "month"])['number_of_vacancies'].sum()
#group_sum.sort_index(inplace=True)
#annonserdf = pd.DataFrame()
#annonserdf['municipality_code'] = group_sum.index.get_level_values(0)
#annonserdf['date'] = ["-".join([y, m]) for (y,m) in zip(group_sum.index.get_level_values(1), group_sum.index.get_level_values(2))]
#annonserdf['date'] = pd.to_datetime(annonserdf['date']) #this gives minutes seconds in the end of date
#annonserdf['vacancies'] = group_sum.values
#annonserdf = pd.merge(annonserdf, kommun_lan, left_on='municipality_code', right_on='Code')
#print(annonserdf.info())
#print(annonserdf)
#nuts = ['SE11', 'SE12', 'SE21', 'SE22', 'SE23', 'SE31', 'SE32', 'SE33']
#region_groups = annonserdf.groupby(['date', 'nuts2'])['vacancies'].sum().unstack('nuts2').fillna(0)

#print(region_groups)
#region_groups.plot(kind='bar', stacked=True) 
#plt.title("Total vacancies distributed in regions over Sweden")
#plt.xlabel("Time(month)")
#plt.ylabel("Vacancies")
#plt.xticks(rotation=45)
#plt.show()

In [None]:
%matplotlib qt
import geopandas as gpd
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
countries = world[world['continent']=='Europe']
sweden = countries[countries['name']=='Sweden']
sweden.plot()
print(sweden)

In [None]:
df1 = convert_json2df(file_dump[0].get('data'))
df2 = convert_json2df(file_dump[1].get('data'))
df3 = convert_json2df(file_dump[2].get('data'))
df4 = convert_json2df(file_dump[3].get('data'))
df5 = convert_json2df(file_dump[4].get('data'))
df6 = convert_json2df(file_dump[6].get('data'))
df7 = convert_json2df(file_dump[7].get('data'))
df8 = convert_json2df(file_dump[8].get('data'))
df9 = convert_json2df(file_dump[9].get('data'))

dfs = [df1, df2, df3, df4, df5, df6, df7, df8, df9]
total = dfs[0].loc[dfs[0].removed==False]

for i in range(1, len(dfs)):
    #add only open ads
    #print("data frame is: dfs%d"%i)
    #"remove" of some ads may tunr to True from False
  
    subset = dfs[i].loc[dfs[i].removed==False]
    #print(subset['removed'].describe())
    total = pd.concat([total,subset], ignore_index=True)
    #print(total.shape)

#print(total.head())
before_ads = total.shape[0]
print(total['removed'].describe())
total.drop_duplicates(subset=['ads_id'], inplace=True)
after_ads = total.shape[0]
#only online ads are aggregated, check if this variable is unique False
print(total['removed'].describe())
diff_ads = before_ads-after_ads
print("Total loaded online ads: %d; Total after drop_duplicates: %d; Duplicated ads are: %d"%(before_ads, after_ads, diff_ads))
print(total['removed'].describe())
#download the file into csv for DB
#write down the date of the aggregation, how to do this more efficiently???

In [None]:
start = time.process_time()


total = parallelize_dataframe(total, add_dates2df)
print(total.shape)
print(total.head())
print("execution time is %5.2f"%(time.process_time()-start))
print(total['salary_description'].value_counts())

In [None]:
#show the description of the dataset
online_df = total
print("Total Data shape is (%d, %d)"%online_df.shape)
print("Description of number of vacancies")
print(online_df['number_of_vacancies'].describe())
df_error = online_df.loc[online_df['number_of_vacancies'] == 0]
print(df_error.shape)
if df_error.shape[0]>0:
    #print out text if some vacancies are registered in 0
    print("Number of vacancies reported in 0: %d"%df_error.shape[0])

before = online_df['number_of_vacancies'].sum()
if df_error.shape[0]>0: # if there are 0 in vacancies, handle it
    online_df['number_of_vacancies'] = online_df['number_of_vacancies'].apply(lambda x: 1 if x==0 else (x))
    print("after the controll of number of vacancies")
    total_vacancies = online_df['number_of_vacancies'].sum()
    print("differences after check of number of vacancies: %d"%(total_vacancies-before))

#study the number of vacancies >10
large_vacancies = online_df.loc[online_df['number_of_vacancies']>10]

print("*******************************************")
#if 0 is identified in the dataset, do the following
#online_df['number_of_vacancies'].fillna(0, inplace=True)   
#online_df['number_of_vacancies'].describe()

pdates = online_df['publication_date'].sort_values().tolist()
print(len(pdates))
print("first publication date is : %s"%pdates[0])
print("last publication date is: %s"%pdates[-1])

#is there any differences between last_pblucation_date and application_deadline??
#create a list storing diff in days
diff = []
for row in online_df.itertuples():
    diff_days = (row.last_publication_date - row.application_deadline).days
    if diff_days>0:
        diff.append(diff_days)
if len(diff)>0:    
    print("%d ads found differences between last_publication_date and application_deadline"%len(diff))
    print(diff)

delta = []

for row in online_df.itertuples():
    delta.append((row.application_deadline-row.publication_date).days)
    
online_df['lasting_days'] = delta
#print(online_df.head())
#print(online_df.info())
print("----------------Description of lasting days------------------")
print(online_df['lasting_days'].describe()) #can 
total_vacancies = online_df['number_of_vacancies'].sum()
print("--------Total ads have vacancies: %d--------------"%total_vacancies)
print("Please write the maximum lasting days online of an adertisement allowed to be")
lasting_day = input("")
print("ads lasting more than %s days"%lasting_day)
current_ads = online_df.query('lasting_days<=%d'%int(lasting_day))
print(current_ads.sort_values(by=['publication_date','lasting_days']))
print("------ads lasting max %s days have total vacancies: %d------"%(lasting_day, current_ads['number_of_vacancies'].sum()))
print("------ads lasting max %s days have vacancies in %f of total vacancies"%(lasting_day, current_ads['number_of_vacancies'].sum()/total_vacancies))
print("------ads lasting max %s days have %f of total ads"%(lasting_day, current_ads.shape[0]/online_df.shape[0]))

In [None]:
#check duplicates: first check ads that have lasting days shorter than 30
ads_less30 = online_df.loc[online_df['lasting_days']<=30]
ads_less30_total = ads_less30.shape[0]
ads_less30_vacancies = ads_less30['number_of_vacancies'].sum()
#one outlier has lasting days as 365
#print(data_left.loc[(data_left.year=='2019') & (data_left.month=='05')])

print("Ads lasting less than 30 days has Vacancies percentage {:.2f}".format(ads_less30_vacancies/total_vacancies))
print(ads_less30.head(10))

#domain_list = ads_less30['webpage_url'].apply(lambda x: x.split('/')[2])
#site_list = ads_less30['webpage_url'].apply(lambda x: x.split('/')[-1])
#print(len(set(domain_list)))
#print(site_list[10:15])
#ads_less30['webpage_url'].values.tolist()[2:5]

outlier_ads = online_df.loc[(online_df['lasting_days']<7) | (online_df['lasting_days']>61)]
print("outlier ads shape: (%d, %d)"%outlier_ads.shape)
before = outlier_ads.shape[0]
outlier_ads = outlier_ads.drop_duplicates(subset=['ads_id', 'webpage_url'])
print("differences after the drop duplicates are %d "%(outlier_ads.shape[0]-before))
print("-------------Lasting days description-----------")
print(outlier_ads['lasting_days'].describe())

#merge with employment type and examine how the ads are distributed over the employment type
employer = get_employer_values(file_dump[1].get('data'))
employment_type = get_commonstructure_type(file_dump[1].get('data'), 'employment_type')
duration = get_commonstructure_type(file_dump[1].get('data'), 'duration')
#heltid, deltid
working_hours_type = get_commonstructure_type(file_dump[1].get('data'), 'working_hours_type')
occupation_group = get_commonstructure_type(file_dump[1].get('data'), 'occupation_group')
salary_type = get_commonstructure_type(file_dump[1].get('data'), 'salary_type')

#merge with employer and study how outerliers associate with employer
#outlier_ads = outlier_ads.merge(employer, on='ads_id', how="left")
#print(outlier_ads['employer_workplace'].value_counts())
#print(outlier_ads.groupby(['employer_organization_number']).count().sort_values(by='ads_id', ascending=False).head(10))
#print(outlier_ads[outlier_ads.employer_organization_number=='5590903570'])
#print(outlier_ads.head(10))
#print(outlier_ads.groupby(['experience_required']).count())
#print(outlier_ads.groupby(['month']).count())

#study the outlier with the employment type
#outlier_ads = outlier_ads.merge(employment_type, on='ads_id', how="left")
#print(outlier_ads.groupby(['employment_type_label']).count().sort_values(by="ads_id", ascending=False).head(10))
#print(outlier_ads['employment_type_label'].value_counts())
#outlier_ads = outlier_ads.merge(duration, on='ads_id',how="left")
#print(outlier_ads['duration_legacy_ams_taxonomy_id'].value_counts())
#print(outlier_ads.shape)
#print(outlier_ads.groupby(['duration_label']).count().sort_values(by='ads_id', ascending=False).head(10))

#outlier_ads = outlier_ads.merge(working_hours_type, on='ads_id', how="left")
#print(outlier_ads.groupby(['working_hours_type_label']).count().sort_values(by='ads_id', ascending=False).head(10))
#print(outlier_ads['working_hours_type_legacy_ams_taxonomy_id'].value_counts())
outlier_ads = outlier_ads.merge(occupation_group, on='ads_id', how='left')
print(occupation_group['occupation_group_legacy_ams_taxonomy_id'].value_counts())
print(outlier_ads.groupby(['occupation_group_label']).count().sort_values(by='ads_id', ascending=False).head(10))

#online_ads = pd.merge(online_df, salary_type, on='ads_id', how='left')
#print(online_ads.groupby(['salary_type_label']).count().sort_values(by='ads_id', ascending=False).head(10))
#print(online_ads['salary_description'].describe())

In [None]:
#visulization of totals vacancies
print("Please write 'month' or 'day' for dawing total vacancies")
group_unit=input()

if group_unit=='day':
    #summary of daily vacancies
    group_sum = online_ads.groupby(['year', 'month', 'day'])['number_of_vacancies'].sum()
    #print(group_sum)
    #convert a panda group multiindex into a list
    annonserdf= pd.DataFrame({'date':group_sum.index.tolist(), 'sum': group_sum.values})
    annonserdf['date'] = annonserdf['date'].apply(lambda x: datetime.date.fromisoformat('-'.join([x[0], x[1], x[2]])))
    annonserdf = annonserdf.sort_values(['date'], ascending=True)
    print(annonserdf.head())
    print(annonserdf.describe())
    print("---check if some days have no vacancies---")
    print(annonserdf.loc[annonserdf['sum']==0.0])
    ax = annonserdf.plot(x='date', y='sum', marker='*')
    fig = ax.get_figure()
    fig.savefig("/home/inlab4/Documents/AF/graphs/vacancies_daily.jpg")
elif group_unit=='month':
    group_sum = online_ads.groupby(['year', 'month'])['number_of_vacancies'].sum()
    #print(group_sum)
    #convert a panda group multiindex into a list
    annonserdf= pd.DataFrame({'date':group_sum.index.tolist(), 'sum': group_sum.values})
    annonserdf['date']=annonserdf['date'].apply(lambda x: datetime.datetime.strptime("-".join([x[0], x[1]]), "%Y-%m"))
    annonserdf = annonserdf.sort_values(['date'], ascending=True)
    print(annonserdf.head())
    print(annonserdf.describe())
    print("---check if some days have no vacancies---")
    print(annonserdf.loc[annonserdf['sum']==0.0])
    ax = annonserdf.plot(x='date', y='sum', marker='*')
    fig = ax.get_figure()
    fig.savefig("/home/inlab4/Documents/AF/graphs/vacancies_monthly.jpg")
else:
    print("Please give a group unit in 'month' or 'day'")



In [None]:
description_text = []
ansok_url = []
import re
urlpattern = re.compile('\[http\S\]')
for item in data:
    if item.get('description') != None:
        if item['description'].get('text') != None:
            description_text.append(item['description']['text'])
            
            url = urlpattern.findall(item['description']['text'])
                break
            if url:
                ansok_url.append(url[0])
            else:
                ansok_url.append('NaN')
    else:
        description_text.append('NaN')
        ansok_url.append('NaN')

print(len(description_text))
print(description_text[100])
print(ansok_url)
s = "39053
KVALIFIKATIONER 

• Sjuksköterskeexamen med minst 1års yrkeserfarenhet som skolsköterska.
 • VUB inom område
 • Mycket goda kunskaper i Svenska, både i tal och skrift.
 • Meriterandemed erfarenhet avvaccination 

ANSÖKAN 

Du är varmt välkommen att skicka in din ansökan till oss. Ansök genom att klicka på knappen ”Ansök här”. Om du som sökande har frågor om den utannonserade tjänsten var vänlig och kontakta kontaktpersonen för denna annons. På vårhemsida [https://www.dedicare.se/yrkesroll/sjukskoterska/]presenterar vi alla våra lediga uppdrag. Urval och intervjuer sker löpande, vi tar tacksamt emot din ansökan snarast. "

In [None]:
concept_id = []
label= []
legacy_ams_taxonomy_id = []

for item in data1[0:3]:
    employment_type = item.get('employment_type')
    if isinstance(employment_type, dict):
        concept_id.append(employment_type.get('concept_id'))
        label.append(employment_type.get('label'))
        legacy_ams_taxonomy_id.append(employment_type.get('legacy_ams_taxonomy_id'))

    else:
        concept_id.append(np.nan)
        label.append(np.nan)
        legacy_ams_taxonomy_id.append(np.nan)
        
print(concept_id)
print(label)
print(legacy_ams_taxonomy_id)
        

In [None]:
group_sum = data_left.groupby(['year', 'month'])['number_of_vacancies'].sum()
#print(group_sum)
#convert a panda group multiindex into a list
annonserdf= pd.DataFrame({'date':group_sum.index.tolist(), 'sum': group_sum.values})
annonserdf['date']=annonserdf['date'].apply(lambda x: datetime.datetime.strptime("-".join([x[0], x[1]]), "%Y-%m"))
annonserdf = annonserdf.sort_values(['date'], ascending=True)
print(annonserdf.head())
print(annonserdf.describe())
print("---check if some days have no vacancies---")
print(annonserdf.loc[annonserdf['sum']==0.0])
ax = annonserdf.plot(x='date', y='sum', marker='*')

In [None]:
date1 = 3
date2 = 7
for i in range(4):
    print(i)

In [None]:
import requests

html = requests.get("http://www.coopans.com", timeout=5, headers={})
if html.status_code == 200:
    f = open('/home/inlab/CISNLP/Dan/coopans.html', 'w+')
    f.write(page_html.text)
    f.close()
    print("created file")
else:
    print("error")
    
    

In [None]:
df_test = pd.DataFrame({'Year': ['2014', '2015'], 'quarter': ['q1', 'q2']})
df_test['period'] = df['Year'], 'quarter']].apply(lambda x: ''.join(x), axis=0)
print(df_test)


In [None]:
date = datetime.datetime.strptime('2020-03-31T23:59:59', '%Y-%m-%dT%H:%M:%S')
print(date)
print(type(date))
print(date.strftime("%Y"))

In [None]:
s1 = "hi, i am david"

print(s1[4: len(s1)])