In [None]:
# Mount Google Drive (Colab only) so the dataset paths under /content/drive used below are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip3 install transformers

Collecting transformers
  Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 227 kB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 40.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 29.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 31.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting 

In [None]:
import pandas as pd
import ast 
from collections import defaultdict, Counter
from transformers import pipeline
from tqdm import tqdm 
import numpy as np

# Task 2: Occupation mapping



## Context

Given how noisy the data is, splitting it into categories may give better results. One piece of information that lets us do so is each speaker's jobs and occupations, which we can find in Wikidata.<br>
In this task, we first categorize speakers by their Wikidata occupations. Because this data is extremely sparse, we then create a smaller pool of occupation fields and assign each original occupation to one of them, in the same way topic matching is done.

## Mapping

We start off by looking for each speaker's occupations.

In [None]:
# Read wikidata obtained from the TAs: a bz2-compressed CSV indexed by Wikidata QID.
# The `Label` and `Description` columns of this table are used in the cells below.
wikidata_labels = pd.read_csv('/content/drive/MyDrive/Shared_ADA/M3/data/Project datasets/wikidata_labels_descriptions_quotebank.csv.bz2', compression='bz2', index_col='QID')

In [None]:
# Read bitcoin quotes. The `qids` column stores each quote's speaker ids as a Python
# literal in text form; it is parsed with ast.literal_eval in the next cell.
quotes = pd.read_csv("/content/drive/MyDrive/Shared_ADA/M3/data/bitcoin_data.csv")

In [None]:
# Collect the distinct speaker ids attached to the bitcoin quotes.
# Each `qids` cell is a literal in text form, so it is parsed before its ids are added.
all_speaker_ids = {
    speaker_id
    for raw_ids in quotes['qids']
    for speaker_id in ast.literal_eval(raw_ids)
}
print("We have {} speakers in total".format(len(all_speaker_ids)))

We have 9477 speakers in total


In [None]:
# Read speaker data: the speaker attribute table (parquet, loaded through pyarrow).
# Its `id` and `occupation` columns are the ones used by the following cells.
speakers = pd.read_parquet('/content/drive/MyDrive/Shared_ADA/M3/data/Project datasets/speaker_attributes.parquet', engine='pyarrow')

In [None]:
# Keep data of speakers having quotes about bitcoin.
# `.copy()` makes the filtered frame independent of the full table, so the columns added
# to it later in the notebook do not trigger pandas' SettingWithCopyWarning.
speakers = speakers[speakers['id'].isin(all_speaker_ids)].copy()

In [None]:
# Preview the first rows of the filtered speaker table
speakers.head()

Unnamed: 0,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,id,label,candidacy,type,religion
18,"[Namo, Modi, Narendra Bhai, Narendra Damodarda...",[+1950-09-17T00:00:00Z],[Q668],[Q6581097],1395415052,"[Q1282294, Q6889284]",,"[Q82955, Q36180, Q7019111, Q10429346]",[Q10230],,Q1058,Narendra Modi,"[Q63988227, Q63988919, Q16841231, Q16251961, Q...",item,[Q9089]
222,[Stephen Gerald Breyer],[+1938-08-15T00:00:00Z],[Q30],[Q6581097],1393110898,,,"[Q185351, Q16533, Q40348, Q1622272, Q82955]",[Q29552],,Q11124,Stephen Breyer,,item,[Q9268]
347,[Rebecca Renee Black],[+1997-06-21T00:00:00Z],[Q30],[Q6581072],1391856171,,,"[Q177220, Q33999, Q55960555, Q2405480, Q171252...",,,Q18804,Rebecca Black,,item,
371,"[Steven Paul Jobs, Steven Jobs]",[+1955-02-24T00:00:00Z],[Q30],[Q6581097],1382097321,,,"[Q131524, Q205375, Q81096, Q5322166, Q82594, Q...",[Q29552],,Q19837,Steve Jobs,,item,"[Q748, Q7953]"
400,"[Jimmy E. Carter, James Earl Carter Jr., James...",[+1924-10-01T00:00:00Z],[Q30],[Q6581097],1393268206,,,"[Q10669499, Q193391, Q6625963, Q82955, Q131512...",[Q29552],,Q23685,Jimmy Carter,"[Q699693, Q7892708]",item,[Q93191]


In [None]:
# Gather every distinct occupation QID among speakers whose `occupation` field is not null
speakers_having_occupation = speakers.dropna(subset=['occupation'])
occupation_set = {
    occupation_id
    for occupation_ids in speakers_having_occupation['occupation']
    for occupation_id in occupation_ids.tolist()
}
print("We have {} occupations in total".format(len(occupation_set)))


We have 892 occupations in total


As you can see, we have 892 occupations for 9477 speakers. With a number of occupations that is nearly 10% of the number of speakers, each occupation covers too few speakers to yield meaningful results, so we apply the occupation pool matching described above.

We choose the pool of occupations based on https://www.indeed.com/career-advice/finding-a-job/careers-by-field, which lists the principal career fields. As you will see later, this matching does not always work, and we handle those cases separately.

In [None]:
# Target pool of career fields that every occupation is mapped to, taken from
# https://www.indeed.com/career-advice/finding-a-job/careers-by-field
labels = [
    "Architecture and engineering",
    "Arts, culture and entertainment",
    "Business, management and administration",
    "Communications",
    "Community and social services",
    "Education",
    "Science and technology",
    "Government",
    "Health and medicine",
    "Law and public policy",
    "Sport",
]

We use the model below to match the initial occupations with the chosen pool of occupations.

In [None]:
# Sentence template into which each candidate field is inserted for the classifier
hypothesis_template = 'This text is about {}.'

# Zero-shot classification pipeline backed by the facebook/bart-large-mnli checkpoint
classifier = pipeline(
    'zero-shot-classification',
    model='facebook/bart-large-mnli',
)

Downloading:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [None]:
# Get the occupations appearing in the bitcoin data.
# `sorted` gives a reproducible row order (iterating a set of strings can differ between
# runs), and `.copy()` yields an independent frame that is safe to add columns to later.
wikidata_occupation_labels = wikidata_labels.loc[sorted(occupation_set)].copy()
wikidata_occupation_labels.head()

Unnamed: 0_level_0,Label,Description
QID,Unnamed: 1_level_1,Unnamed: 2_level_1
Q12013238,ophthalmologist,physician who practices ophthalmology
Q1797162,artistic director,artistic leader of a cultural institution
Q16323111,peace activist,activist focused on avoiding war
Q26237722,rugby sevens player,
Q50214236,Australian rules football coach,


In [None]:
# Map every occupation description to its best-matching field from `labels`.
# `fields[i]` / `scores[i]` line up with row i of `wikidata_occupation_labels`.
fields = []
scores = []
for description in tqdm(wikidata_occupation_labels['Description']):
    # Some ids don't have descriptions (missing/NaN or blank): there is nothing to
    # classify, so record a placeholder instead of hiding every error behind `except:`.
    if not isinstance(description, str) or not description.strip():
        fields.append('None')
        scores.append(0)
        continue

    # Input is the description of occupation, list of possible output.
    # `multi_label` is the current name of the deprecated `multi_class` argument.
    res = classifier(description, labels,
                     hypothesis_template=hypothesis_template,
                     multi_label=True)

    # res['labels'] and res['scores'] are 2 parallel lists; keep the top-scoring pair
    best_field, best_score = max(zip(res['labels'], res['scores']),
                                 key=lambda pair: pair[1])
    fields.append(best_field)
    scores.append(best_score)

  0%|          | 0/892 [00:00<?, ?it/s]The `multi_class` argument has been deprecated and renamed to `multi_label`. `multi_class` will be removed in a future version of Transformers.
  0%|          | 1/892 [00:05<1:18:46,  5.30s/it]The `multi_class` argument has been deprecated and renamed to `multi_label`. `multi_class` will be removed in a future version of Transformers.
  0%|          | 2/892 [00:08<1:04:25,  4.34s/it]The `multi_class` argument has been deprecated and renamed to `multi_label`. `multi_class` will be removed in a future version of Transformers.
  0%|          | 3/892 [00:13<1:02:53,  4.24s/it]The `multi_class` argument has been deprecated and renamed to `multi_label`. `multi_class` will be removed in a future version of Transformers.
  0%|          | 4/892 [00:18<1:08:37,  4.64s/it]The `multi_class` argument has been deprecated and renamed to `multi_label`. `multi_class` will be removed in a future version of Transformers.
  1%|          | 5/892 [00:21<1:01:56,  4.19s

Note: because this model is slow and we lacked the time to rerun everything from scratch, we directly use the results saved from a previous run. However, you can reproduce them by running the notebook.

In [None]:
# Attach the predicted field and its score to each occupation row.
# `fields` and `scores` were filled one entry per row by the classification loop above.
wikidata_occupation_labels['Field'] = fields 
wikidata_occupation_labels['Score'] = scores 

In [None]:
# Preview the occupations together with their predicted field and score
wikidata_occupation_labels.head()

Unnamed: 0_level_0,Label,Description,Field,Score
QID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Q11774202,essayist,person who writes essays,"Business, management and administration",0.291987
Q15462162,cultural historian,humanist who is engaged in cultural history,"Arts, culture and entertainment",0.079204
Q733786,monk,member of a monastic religious order,Community and social services,0.493978
Q978044,executive,higher level corporate position generally char...,"Business, management and administration",0.884265
Q20850090,harmonicist,person who plays a harmonica,"Arts, culture and entertainment",0.68879


In [None]:
# Save the classified occupations to Drive, keeping the QID index as a column of the CSV
wikidata_occupation_labels.to_csv('/content/drive/MyDrive/Shared_ADA/M3/data/occupation_wikidata_labels_descriptions_quotebank.csv', index=True)

After this matching, there are some occupations whose mapping score is lower than 0.3. We assumed that such a score is low enough to make our data very noisy, and since there were no more than 200 data points with a score below 0.3, we filtered these predictions and manually verified them (changing them where necessary).

In [None]:
# Load the occupation table from the Excel workbook, discard rows without a QID and
# index the rest by QID. The `Label` and `Refined Field` columns are read from it below.
# NOTE(review): presumably this workbook is the manually verified version of the CSV
# saved above - confirm.
wikidata_occupation_labels = (
    pd.read_excel('/content/drive/MyDrive/Shared_ADA/M3/data/occupation_wikidata_labels_descriptions_quotebank.xlsx')
    .dropna(subset=['QID'])
    .set_index('QID')
)

In [None]:
def _lookup_occupation_values(occupation_ids, column):
    """Return the distinct non-null `column` values of the given occupation ids.

    Values are read from the module-level `wikidata_occupation_labels` table, which is
    indexed by QID.

    Args:
        occupation_ids: iterable of occupation QIDs.
        column: name of the `wikidata_occupation_labels` column to read.

    Returns:
        list: the values found, without duplicates, in first-seen order.

    Raises:
        KeyError: if `column` is not a column of the lookup table (only reached when at
            least one id is present in the table).
    """
    found = []
    for occupation_id in occupation_ids:
        # Ids that are absent from the lookup table are skipped on purpose
        if occupation_id not in wikidata_occupation_labels.index:
            continue
        value = wikidata_occupation_labels.loc[occupation_id, column]
        # A QID present on several rows yields a Series instead of a scalar
        candidates = value.tolist() if isinstance(value, pd.Series) else [value]
        found.extend(candidate for candidate in candidates if not pd.isna(candidate))
    # dict.fromkeys removes duplicates while keeping a deterministic (first-seen) order
    return list(dict.fromkeys(found))


def convert_fields(occupation_ids):
    """Convert an array of occupation QIDs to the list of their distinct refined fields.

    Args:
        occupation_ids: numpy array of occupation QIDs. Any other value (e.g. None for a
            speaker without occupation) is returned unchanged.

    Returns:
        list: distinct non-null 'Refined Field' values of the ids found in
        `wikidata_occupation_labels`, or the input itself when it is not an ndarray.
    """
    if not isinstance(occupation_ids, np.ndarray):
        return occupation_ids
    return _lookup_occupation_values(occupation_ids, 'Refined Field')


def convert_occupations(occupation_ids):
    """Convert an array of occupation QIDs to the list of their distinct labels.

    Args:
        occupation_ids: numpy array of occupation QIDs. Any other value (e.g. None for a
            speaker without occupation) is returned unchanged.

    Returns:
        list: distinct non-null 'Label' values of the ids found in
        `wikidata_occupation_labels`, or the input itself when it is not an ndarray.
    """
    if not isinstance(occupation_ids, np.ndarray):
        return occupation_ids
    return _lookup_occupation_values(occupation_ids, 'Label')

In [None]:
# Add the mapped columns with `assign`, which returns a new frame instead of writing into
# a filtered slice of the original speaker table.
speakers = speakers.assign(
    fields=speakers['occupation'].apply(convert_fields),
    occupations=speakers['occupation'].apply(convert_occupations),
)
# Index by speaker id. The guard keeps the cell re-runnable: once `id` is the index it is
# no longer a column and `set_index('id')` would raise KeyError.
if 'id' in speakers.columns:
    speakers = speakers.set_index('id')

In [None]:
# Save the speaker table with its `fields` and `occupations` columns to Drive.
# NOTE(review): CSV stores list/array cells as text, so they presumably need to be parsed
# again when this file is read back - confirm in the consuming notebook.
speakers.to_csv('/content/drive/MyDrive/Shared_ADA/M3/data/speakers.csv')

In [None]:
# Preview the speaker table with the new `fields` and `occupations` columns
speakers.head()

Unnamed: 0,id,aliases,date_of_birth,nationality,gender,lastrevid,ethnic_group,US_congress_bio_ID,occupation,party,academic_degree,label,candidacy,type,religion,fields,occupations
0,Q1058,['Namo' 'Modi' 'Narendra Bhai' 'Narendra Damod...,['+1950-09-17T00:00:00Z'],['Q668'],['Q6581097'],1395415052,['Q1282294' 'Q6889284'],,['Q82955' 'Q36180' 'Q7019111' 'Q10429346'],['Q10230'],,Narendra Modi,['Q63988227' 'Q63988919' 'Q16841231' 'Q1625196...,item,['Q9089'],"['Government', 'Communications', 'Community an...","['social worker', 'politician', 'bibliographer..."
1,Q11124,['Stephen Gerald Breyer'],['+1938-08-15T00:00:00Z'],['Q30'],['Q6581097'],1393110898,,,['Q185351' 'Q16533' 'Q40348' 'Q1622272' 'Q82955'],['Q29552'],,Stephen Breyer,,item,['Q9268'],"['Education', 'Government', 'Law and public po...","['politician', 'jurist', 'judge', 'university ..."
2,Q18804,['Rebecca Renee Black'],['+1997-06-21T00:00:00Z'],['Q30'],['Q6581072'],1391856171,,,['Q177220' 'Q33999' 'Q55960555' 'Q2405480' 'Q1...,,,Rebecca Black,,item,,"['Arts, culture and entertainment']","['recording artist', 'actor', 'YouTuber', 'sin..."
3,Q19837,['Steven Paul Jobs' 'Steven Jobs'],['+1955-02-24T00:00:00Z'],['Q30'],['Q6581097'],1382097321,,,['Q131524' 'Q205375' 'Q81096' 'Q5322166' 'Q825...,['Q29552'],,Steve Jobs,,item,['Q748' 'Q7953'],"['Arts, culture and entertainment', 'Science a...","['designer', 'computer scientist', 'engineer',..."
4,Q23685,['Jimmy E. Carter' 'James Earl Carter Jr.' 'Ja...,['+1924-10-01T00:00:00Z'],['Q30'],['Q6581097'],1393268206,,,['Q10669499' 'Q193391' 'Q6625963' 'Q82955' 'Q1...,['Q29552'],,Jimmy Carter,['Q699693' 'Q7892708'],item,['Q93191'],"['Science and technology', 'Arts, culture and ...","['human rights activist', 'writer', 'diplomat'..."


Now that we're done with occupation matching, we can start leveraging sentiment analysis algorithms with less noise than in the 1st task.