In [1]:
import re
import glob
import itertools
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.gridspec as gridspec
from sklearn import metrics

from include import features
from common.textClean import textClean

In [2]:
#df = pd.read_csv('data/model_data_test.csv')
df = pd.read_pickle('./data/training_set/training_set.pkl')
#df = df.drop('Unnamed: 0', axis=1)
len(df)

1447

In [3]:
# Splitting the lists into separated columns

# tags
tags = pd.DataFrame(df['tags'].values.tolist())
tags.columns = ['tags_{}'.format(str(int(x)+1)) for x in tags.columns]

In [4]:
#df.drop('tags', axis=1, inplace=True)
# subject area
#subjects = df['subject_area'].str.replace("'", "").str.replace(']', '').str.replace('[', '').str.replace(' ', '').str.split(',', expand=True)
subjects = pd.DataFrame(df['subject_area'].values.tolist())

subjects.columns = ['subject_{}'.format(str(int(x)+1)) for x in subjects.columns]

In [5]:
df = pd.concat([df, tags, subjects], axis=1, sort=False)

# Create a columns with the number of existing tags
df['tag_count'] = df[tags.columns].count(axis=1)
# Drop rows that have only one tag
df = df[df['tag_count'] > 1]

In [6]:
# The prediction are in the way to predict if the job is a NOT software job. Inverse the score to make it easier to read
col_proba = [x for x in df.columns if x[-6:] == '_proba']
for col in col_proba:
    df[col_proba] = df[col_proba].apply(lambda x: 1-x)

# Creating different tags from the training set

In [7]:
def previous_tags(row):
    insuff = 0
    no = 0
    some = 0
    most = 0
    if row[-1] != 'third_run':
        for r in row[:-1]:
            if r == 'No':
                no +=1
            elif r == 'Some':
                some +=1
            elif r == 'Most':
                most +=1
            elif r == 'Insufficient Evidence':
                insuff +=1
            else:
                pass
        if most >= 2:
            return 1
        elif some >=2:
            return 1
        elif no >=2:
            return 0
        elif insuff >=2:
            return 'Insufficient Evidence'
        else:
            return 'Ambiguous'
    else:
        return 'third_run'

In [8]:
def new_tags(row):
    insuff = 0
    no = 0
    some = 0
    most = 0
    for r in row:
        if r == 'No':
            no +=1
        elif r == 'Some':
            some +=1
        elif r == 'Most':
            most +=1
        elif r == 'Insufficient Evidence':
            insuff +=1
        else:
            pass
    if insuff >= 2:
        return 'Insufficient Evidence'
    elif no >=2:
        return 'No'
    elif some >=2:
        return 'Some'
    elif most >= 2:
        return 'Most'
    elif no == 1 and some ==1 and most == 0:
        return 'No'
    elif no ==1 and most ==1 and some == 0:
        return  'Some'
    elif some == 1 and most ==1 and no == 0:
        return 'Some'
    elif no == 1 and some == 1 and most == 1:
        return 'Insufficient Evidence'
    elif insuff ==1 and some ==1 and most ==1:
        return 'Some'
    elif insuff ==1 and no ==1 and most == 1:
        return 'Insufficient Evidence'
    elif insuff ==1 and some ==1 and no ==1:
        return 'No'

In [9]:
def calculate_score(row):
    """
    This function aggregate the different tags given by people to a job ads into an integrate one, computed with the
    mean and some other calculation to reflect the different possibilities given.
    The different possibilities were:
        1. Insufficient evidence: no enough information to see if the jobs ads requires software development or not
        2. None: no software development required
        3. Some: some software development required
        4. Most: mainly software development
    For the rest it transform the different category into numerical value
        . None: 0
        . Some: 1
        . Most: 2
    From there, the mean is calculated. and from the result the three category are recreated
        . [0, 0.33, 0.5]: None
        . [0.6, 1]: Some
        . > 1: Most
    If there is one insufficient information among the tags, it negative the mean. If there is a consensus of Insufficient
    Information it gives a -10
    
    :params:
        df: containing the columns to compute
    :return:
        a panda Series with the computated mean
    """
    list_values = list()
    insufficient = 0
    for r in row:
        if r == 'No':
            list_values.append(0)
        elif r == 'Some':
            list_values.append(1)
        elif r == 'Most':
            list_values.append(2)
        elif r == 'Insufficient Evidence':
            insufficient +=1
        else:
            pass
    if insufficient > 1:
        return -10
    elif insufficient == 1:
        if len(list_values) == 1:
            return -0
        elif len(list_values) == 0:
            return -10
        else:
            return (-(sum(list_values) / float(len(list_values))))/2
    else:
        if len(list_values) == 1:
            return 0
        else:
            return (sum(list_values) / float(len(list_values)))/2

In [10]:
df['agg_tag'] = df.loc[:, ['tags_1', 'tags_2', 'tags_3']].apply(calculate_score, axis=1)
df['agg_tag'].value_counts()

 0.000000     726
 1.000000     112
 0.166667     100
 0.750000      99
 0.500000      97
-0.250000      49
-10.000000     46
 0.333333      45
-0.500000      35
 0.250000      20
-0.750000      12
 0.666667       7
-1.000000       7
Name: agg_tag, dtype: int64

In [11]:
df['agg_created_tag'] = df['agg_tag'].apply(lambda x: 1 if abs(x)>=.5 and abs(x) < 10 else 0)
df['agg_created_tag'].value_counts()

0    986
1    369
Name: agg_created_tag, dtype: int64

In [12]:
# Create a new tag with more category than before, adding insufficient, Some. 
# at the end, there are 4 categories
# 1. Insufficient evidence: score == -10
# 2. No: abs(0.2) < score < abs(0.6)
# 3. Some: 
# 4. Most: abs(0.6) < score <=1

def new_tag_agg(score):
    if score == -10:
        return 'Insufficient'
    elif abs(score) >=0 and abs(score) < 0.2:
        return 'No'
    elif abs(score) >= 0.2 and abs(score) < 0.6:
        return 'Some'
    elif abs(score) > 0.6 and abs(score) <= 1:
        return 'Most'

df['new_tag_agg'] = df.loc[:, 'agg_tag'].apply(new_tag_agg)
df['new_tag_agg'].value_counts()

No              826
Some            246
Most            237
Insufficient     46
Name: new_tag_agg, dtype: int64

In [13]:
df['previous_tags'] = df.loc[:, ['tags_1', 'tags_2', 'tags_3', 'run_tag']].apply(previous_tags, axis=1)
df['previous_tags'].value_counts()

0                        795
1                        215
Ambiguous                200
third_run                100
Insufficient Evidence     45
Name: previous_tags, dtype: int64

In [14]:
df['new_tags'] = df.loc[:, ['tags_1', 'tags_2', 'tags_3']].apply(new_tags, axis=1)
df['new_tags'].value_counts()

No                       893
Some                     253
Most                     126
Insufficient Evidence     71
Name: new_tags, dtype: int64

In [15]:
def corresponding_prev_train(col):
    for c in col:
        try:
            if int(col[0]) == int(col[1]):
                return 1
            else:
                return 0
        except ValueError:
            return col[0]

df['original_tags'] = df.loc[:, ['previous_tags', 'prediction']].apply(corresponding_prev_train, axis=1)

df['original_tags'].value_counts()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


0                        795
1                        215
Ambiguous                200
third_run                100
Insufficient Evidence     45
Name: original_tags, dtype: int64

In [16]:
## Create new tags
df['final_bool_tags'] = np.where(df['agg_tag']>=0.5, 1, 0)
df['final_bool_tags'].value_counts()

0    1040
1     315
Name: final_bool_tags, dtype: int64

# Save the training_set

In [17]:
df.to_pickle('./data/training_set/training_set_mod.pkl')