In [11]:
import os
import json
import pandas as pd
from pandas2arff import pandas2arff
from sklearn.preprocessing import LabelEncoder

def printDF(title, df):
    """Print a short textual summary of a DataFrame.

    The summary consists of a banner containing ``title``, the frame's
    ``shape`` and the number of null values in every column.

    Parameters
    ----------
    title : str
        Text shown inside the banner.
    df : pandas.DataFrame
        Frame to summarise.
    """
    # Every print call takes exactly one argument, so the output is the same
    # whether print is the Python 2 statement or the Python 3 function.
    print("##############################\n    " + title + "    \n##############################\n")
    print("## Shape: ##")
    print(df.shape)
    print("\n## Missing Values per Column: ##")
    print(df.isnull().sum())
    print("############################## \n\n")

def createDF(file_name):
    """Load a JSON file into a flat DataFrame.

    The parsed JSON is passed through pandas' ``json_normalize`` so nested
    objects (e.g. the client data) become ordinary columns. The dots that
    ``json_normalize`` puts into flattened column names are replaced with
    underscores, so a column can be accessed as
    ``data_frame.client_reviews_count``.

    Parameters
    ----------
    file_name : str
        Path of the JSON file to read.

    Returns
    -------
    pandas.DataFrame
        The normalized data with underscore-separated column names.
    """
    with open(file_name, "r") as f:
        data = json.load(f)

    # ``pd.json_normalize`` is the public entry point in newer pandas releases;
    # older releases only expose it as ``pd.io.json.json_normalize``.
    normalize = getattr(pd, "json_normalize", None)
    if normalize is None:
        normalize = pd.io.json.json_normalize

    df = normalize(data)
    df.columns = [c.replace('.', '_') for c in df.columns]
    return df

In [158]:
# Load the job postings and show the shape / null counts before any cleaning.
data_frame = createDF(file_name="found_jobs_4K.json")

printDF(title="Before changing data", df=data_frame)

##############################
    Before changing data    
##############################

## Shape: ##
(4779, 19)

## Missing Values per Column: ##
budget                                 588
category2                                0
client_country                           0
client_feedback                          0
client_jobs_posted                       0
client_past_hires                        0
client_payment_verification_status    3106
client_reviews_count                     0
date_created                             0
duration                              2730
id                                       0
job_status                               0
job_type                                 0
skills                                   0
snippet                                  0
subcategory2                             0
title                                    0
url                                      0
workload                               997
dtype: int64
####################

## Remove attributes which have as many values as there are examples

![Useless attributes](images/useless_attributes.jpg)

Removing the `id` attribute could, however, be problematic, since it could be useful later when trying to find similar jobs.

In [159]:
# Columns removed here are listed once so the selection is easy to adjust.
unnecessary_columns = ["id", "category2", "job_status", "url"]
data_frame = data_frame.drop(unnecessary_columns, axis=1)

## Remove attributes which don't add much value but which have too many missing values

In [160]:
# Columns dropped because of their large number of missing values.
bad_columns = ["client_payment_verification_status"]
data_frame = data_frame.drop(bad_columns, axis=1)

## Remove examples which contribute missing values to attributes with < 30 missing values

In [161]:
max_few_missing = 30
# Select every column whose null count is below the threshold; this also
# includes columns that have no nulls at all.
columns_few_missing = data_frame.columns[
    (data_frame.isnull().sum() < max_few_missing).values
].tolist()
# Drop each row that has a null in at least one of the selected columns.
data_frame = data_frame.dropna(how='any', subset=columns_few_missing)

## Set feedback to None on examples where the client has not yet received a review

In [162]:
# Rows with a review count of 0 get their feedback value replaced by a missing value.
data_frame.loc[data_frame['client_reviews_count'].eq(0), 'client_feedback'] = None

## Set budget to None if it is 0 (jobs without a budget cannot exist)

In [163]:
# A budget of exactly 0 is treated as "not given" and replaced by a missing value.
data_frame.loc[data_frame['budget'].eq(0), 'budget'] = None

## Problems with `duration` and `workload`

Both attributes `duration` and `workload` have too many missing values, which makes imputing the missing values nearly impossible. As soon as we drop all examples where `budget` is missing, we get the following statistics for `budget`, `duration` and `workload`:

![Statistics for budget, duration and workload](images/budget_duration_workload.jpg)

After removing the examples where `workload` is missing, we get the following statistics:

![Statistics without missing values for workload](images/no_missing_workloads.jpg)

The problem here is that, even though no examples with a missing workload remain, the workload of every remaining example is "30 hrs/ week".

If we drop the examples where `duration` is missing instead of `workload`, the resulting dataset contains no examples.

This leads to the conclusion that both `duration` and `workload` should be dropped entirely.

In [164]:
# Snapshot of shape and null counts right before rows/columns are removed.
printDF(title="Before dropping workload and duration", df=data_frame)

##############################
    Before dropping workload and duration    
##############################

## Shape: ##
(4779, 14)

## Missing Values per Column: ##
budget                  2049
client_country             0
client_feedback          133
client_jobs_posted         0
client_past_hires          0
client_reviews_count       0
date_created               0
duration                2730
job_type                   0
skills                     0
snippet                    0
subcategory2               0
title                      0
workload                 997
dtype: int64
############################## 




In [165]:
# Keep only rows that have both a budget and a client feedback value, then
# remove the duration and workload columns.
data_frame = (
    data_frame
    .dropna(subset=['budget', 'client_feedback'], how='any')
    .drop(['duration', 'workload'], axis=1)
)

## Convert nominal attributes to numerical

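This is done to make them available to all regression models, especially models that do not accept nominal attributes. The LabelEncoder cannot handle missing values; after the cast to `str` used below, a missing value would simply be assigned one more numerical value. That is not an issue here: `duration` and `workload` have already been dropped, and the attributes encoded below (`client_country`, `job_type`, `subcategory2`) contain no missing values. Storing the missing values before encoding and setting them back to None afterwards, as would otherwise be needed for `duration` and `workload`, is therefore not required.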

The following attributes seem interesting for this conversion:
![Nominal attributes to be converted](images/nominal_to_numeric.jpg)

In [166]:
# One encoder per nominal column; each stays available under its own name
# after fitting.
clientCountryEncoder = LabelEncoder()
jobTypeEncoder = LabelEncoder()
subcategory2Encoder = LabelEncoder()

for column_name, encoder in (
    ('client_country', clientCountryEncoder),
    ('job_type', jobTypeEncoder),
    ('subcategory2', subcategory2Encoder),
):
    # Values are cast to str before fitting, then replaced by their integer labels.
    data_frame[column_name] = encoder.fit_transform(data_frame[column_name].astype('str'))

## Convert the `date_created` attribute into the number of days since the Unix epoch and rename it to `timestamp`

In [167]:
# Express each creation date as whole days since the Unix epoch (1970-01-01 UTC).
# The epoch is subtracted explicitly instead of passing datetimes to
# pd.to_timedelta, which recent pandas versions reject with a TypeError.
created = pd.to_datetime(data_frame['date_created'], utc=True)
if created.dt.tz is not None:
    # Convert to naive UTC so the subtraction below works whether or not
    # to_datetime returned timezone-aware values.
    created = created.dt.tz_convert(None)
# .dt.days is already numeric, so no further numeric conversion is applied.
data_frame['date_created'] = (created - pd.Timestamp('1970-01-01')).dt.days
data_frame = data_frame.rename(columns={'date_created': 'timestamp'})

In [168]:
# Show the converted value of the first five rows.
for i, row in data_frame.head(5).iterrows():
    # Single-argument print(...) gives the same output on Python 2 and Python 3.
    print(row['timestamp'])

17295
17292
17290
17290
17290


In [169]:
# Final shape and null counts after all cleaning steps.
printDF(title="After changing data", df=data_frame)

##############################
    After changing data    
##############################

## Shape: ##
(2642, 12)

## Missing Values per Column: ##
budget                  0
client_country          0
client_feedback         0
client_jobs_posted      0
client_past_hires       0
client_reviews_count    0
timestamp               0
job_type                0
skills                  0
snippet                 0
subcategory2            0
title                   0
dtype: int64
############################## 




In [145]:
# Write the cleaned frame to "jobs.arff" through the imported pandas2arff helper;
# all arguments are passed exactly as before.
pandas2arff(
    data_frame,
    "jobs.arff",
    wekaname="jobs",
    cleanstringdata=True,
    cleannan=True
)

True