In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('data/model_data_test.csv')
data = data.drop('Unnamed: 0', axis=1)

In [3]:
data.columns

Index(['closes', 'contract', 'description', 'duration_ad_days', 'employer',
       'enhanced', 'extra_location', 'hours', 'in_uk', 'invalid_code',
       'job_ref', 'job_title', 'jobid', 'location', 'not_student', 'placed_on',
       'prediction', 'prediction_proba', 'run_tag', 'salary', 'salary_max',
       'salary_median', 'salary_min', 'subject_area', 'tags', 'type_role',
       'uk_university'],
      dtype='object')

In [4]:
df = data['tags'].str.replace("'", "").str.replace(']', '').str.replace('[', '').str.replace(' ', '').str.split(',', expand=True)
df.columns = ['tags_1', 'tags_2', 'tags_3']
training = pd.concat([data, df], axis=1, sort=False)
training.drop('tags', axis=1, inplace=True)
#transform_values = {'No': 0, 'Some': 1, 'Most':2, 'InsufficientEvidence': None}
#training.replace(transform_values, inplace=True)


In [5]:

def calculate_score(row):
    """
    This function aggregate the different tags given by people to a job ads into an integrate one, computed with the
    mean and some other calculation to reflect the different possibilities given.
    The different possibilities were:
        1. Insufficient evidence: no enough information to see if the jobs ads requires software development or not
        2. None: no software development required
        3. Some: some software development required
        4. Most: mainly software development
    For the rest it transform the different category into numerical value
        . None: 0
        . Some: 1
        . Most: 2
    From there, the mean is calculated. and from the result the three category are recreated
        . [0, 0.33, 0.5]: None
        . [0.6, 1]: Some
        . > 1: Most
    If there is one insufficient information among the tags, it negative the mean. If there is a consensus of Insufficient
    Information it gives a -10
    
    :params:
        df: containing the columns to compute
    :return:
        a panda Series with the computated mean
    """
    list_values = list()
    insufficient = 0
    for r in row:
        if r == 'No':
            list_values.append(0)
        elif r == 'Some':
            list_values.append(1)
        elif r == 'Most':
            list_values.append(2)
        elif r == 'InsufficientEvidence':
            insufficient +=1
        else:
            pass
    if insufficient > 1:
        return -10
    elif insufficient == 1:
        return -(sum(list_values) / float(len(list_values)))
    else:
        return (sum(list_values) / float(len(list_values)))

        
    
    

In [6]:
training['agg_tag'] = df.loc[:, ['tags_1', 'tags_2', 'tags_3']].apply(calculate_score, axis=1)
training['agg_tag'].value_counts()

 0.000000     104
 1.000000      13
 1.500000      12
 0.666667      10
 0.333333       9
 2.000000       7
-1.000000       5
-2.000000       3
-0.500000       3
-10.000000      2
 0.500000       1
-1.500000       1
Name: agg_tag, dtype: int64

In [7]:
training['agg_val'] = training.loc[:, ['tags_1', 'tags_2', 'tags_3']].mean(axis=1)
sorted(list(training['agg_val'].unique()))

[nan]

In [8]:
training[['run_tag', 'tags_1', 'tags_2', 'tags_3', 'agg_tag', 'prediction', 'prediction_proba']]

Unnamed: 0,run_tag,tags_1,tags_2,tags_3,agg_tag,prediction,prediction_proba
0,first_run,Some,Most,,1.500000,1,0.239954
1,first_run,No,No,,0.000000,0,0.883162
2,first_run,No,No,Some,0.333333,0,0.961392
3,first_run,No,No,,0.000000,0,0.974781
4,first_run,No,No,,0.000000,0,0.974781
5,first_run,No,No,InsufficientEvidence,-0.000000,0,0.963319
6,first_run,No,No,,0.000000,0,0.974781
7,first_run,No,No,,0.000000,0,0.974781
8,first_run,No,No,,0.000000,0,0.971647
9,first_run,No,No,,0.000000,0,0.920206
