Load Libraries

In [1]:
import numpy as np
import pandas as pd
import os.path as op
import spacy
# Load the English pipeline by its full package name. The bare 'en' shortcut
# link only exists in spaCy 2.x; spaCy 3 removed shortcut links, so
# spacy.load('en') fails there.
# NOTE(review): assumes 'en' was linked to en_core_web_sm (what
# `python -m spacy download en` set up) -- confirm for this environment.
nlp = spacy.load('en_core_web_sm')

Load Data

In [2]:
# Directory holding the processed CSV files, relative to this notebook's location.
DATA_PROCESSED = '../data/processed'

In [3]:
# Read both processed tables in one pass: the hand-labeled metaphor fragments
# first, then the full set of cancer projects.
labeled, all_projects = (
    pd.read_csv(op.join(DATA_PROCESSED, csv_name))
    for csv_name in ('labeled_projects.csv', 'cancer_projects_full.csv')
)

In [4]:
# Build a dataframe with one row per labeled project: the project ID plus a
# productivity (keyword diversity) measure for the battle and journey metaphors.


def _simpson_diversity(counts):
    """Return Simpson's Index of Diversity: 1 - sum(n * (n - 1)) / (N * (N - 1)).

    `counts` holds the number of observations in each category; N is their sum.
    Returns 0.0 when N * (N - 1) is zero (fewer than two observations), where
    the index is undefined, instead of letting a 0 / 0 division produce NaN.
    """
    counts = np.asarray(counts)
    total = counts.sum()
    denominator = total * (total - 1)
    if denominator == 0:
        return 0.0
    return 1 - (counts * (counts - 1)).sum() / denominator


def _metaphor_productivity(fragments, metaphor_type, n_words):
    """Return the keyword diversity of one metaphor type within one project.

    `fragments` are the labeled rows of a single project, `metaphor_type` is a
    value of the 'type' column, and `n_words` is the project's total word count.
    """
    keyword_counts = (
        fragments.loc[fragments['type'] == metaphor_type, 'keyword']
        .value_counts()
        .values
    )
    # The project's total word count is appended as one extra category.
    # NOTE(review): that category dwarfs the keyword counts, so the index mostly
    # tracks the keyword share of the text -- presumably why it correlates
    # almost perfectly with salience. Confirm this is the intended measure.
    return _simpson_diversity(np.append(keyword_counts, n_words))


productivities = []

# group by id
for project_id, fragments in labeled.groupby('project_id'):
    # get the number of words total for the project text
    all_words = all_projects.loc[all_projects['id'] == project_id, 'text_length_words']

    # some (~20) projects don't have data on text body size... potential bug
    if all_words.size > 0 and all_words.values[0] > 0:
        n_words = all_words.values[0]
        battle_div = _metaphor_productivity(fragments, 'battle', n_words)
        journey_div = _metaphor_productivity(fragments, 'journey', n_words)
    else:
        # no usable word count: fall back to zero productivity for both metaphors
        battle_div, journey_div = 0.0, 0.0

    productivities.append([project_id, battle_div, journey_div])

productivities = pd.DataFrame(productivities, columns=['id', 'battle_prod', 'journey_prod'])

Top projects by journey productivity. After looking at the text of these projects, the measure seems like an accurate representation of diversity in metaphor instantiation.

In [5]:
# Show the ten projects with the highest journey productivity.
productivities.nlargest(n=10, columns='journey_prod')

Unnamed: 0,id,battle_prod,journey_prod
118,512019652,0.0,0.063784
377,1746035032,0.0,0.054054
391,1792294790,0.0,0.019417
332,1570902301,0.0,0.016667
201,973883863,0.0,0.016427
311,1470430678,0.0,0.012434
261,1252992481,0.010256,0.010256
55,247063397,0.009901,0.009901
221,1071447964,0.0,0.008511
38,164834653,0.0,0.008306


An example project:

In [6]:
# Display the full text of project 1252992481 (first matching row).
is_example_project = all_projects['id'] == 1252992481
all_projects.loc[is_example_project, 'text'].values[0]

"\xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0 \xa0Shouldn't I be dead by now I am a 68 year old male, who lives on an Australian old age pension because my retirement funds were swallowed up trying to stay alive for the last seven years,with this cancer. I think my story is something that needs to be told, I do not claim to have the cure for cancer as I believe there is no 'one size fits all' in beating cancer. This is one of many mistakes that our medical system makes. But I truly do believe I have found a way and perspective that could very well lead to an end of all cancer. My project is to write the story of my journey through seven years with cancer and how I rejected the Western medical system and go on to discover the true problem of what cancer is and how we get it.\xa0 It is a total change of perspective to the way people are treated, who suffer cancer. After the book is published I want to do lecture tours to explain in greater depth and to answer the many questions I imagine

In [7]:
# Display every labeled metaphor fragment recorded for project 1252992481.
labeled[labeled['project_id'] == 1252992481]

Unnamed: 0,project_id,fragment,char_location,type,keyword
406,1252992481,laim to have the cure for cancer as I believe ...,367.0,battle,beating
407,1252992481,d very well lead to an end of all cancer. My p...,591.0,journey,journey


Create a copy (so as not to affect the original) and merge the productivity measures into it.

In [8]:
# Work on an independent copy of the projects table; copy() is deep by default.
all_projects_c = all_projects.copy()

In [9]:
# Attach the productivity measures to every project (left join on 'id').
# Merge from the untouched `all_projects` rather than from `all_projects_c`
# itself: merge() returns a new frame, so the first run gives the same result,
# and re-running this cell no longer merges onto its own previous output
# (which would create battle_prod_x / battle_prod_y style duplicate columns).
all_projects_c = all_projects.merge(productivities, how='left', on='id', validate='one_to_one')

It looks like our measures of diversity are highly correlated with salience:

In [10]:
# Pairwise correlation matrix of the productivity and salience measures;
# rowvar=False makes each column (not each row) a variable.
measure_columns = ['battle_prod', 'journey_prod', 'battle_salience', 'journey_salience']
np.corrcoef(all_projects_c[measure_columns], rowvar=False)

array([[1.        , 0.00524885, 0.99994957, 0.00394083],
       [0.00524885, 1.        , 0.00547719, 0.99979795],
       [0.99994957, 0.00547719, 1.        , 0.00417079],
       [0.00394083, 0.99979795, 0.00417079, 1.        ]])