In [1]:
#
# The task is to develop an algorithm to disambiguate people contributing to various scientific events (e.g. talks, presentations, sessions).
#
# For this, you have to create one profile per person. This profile will have all the contribution from that person (100% recall) and no other contributions from anyone else (100% precision).
#
# Please avoid unnecessary duplicates as well as mixing contributions from different scientists despite similar names/focus-areas.

# Note this is less a word sense disambigation task
# just entity linking

# https://link.springer.com/article/10.1007/s11192-021-03951-w
# Some sort of larger approach with knowledge base would help
# espciallu some knowledge base approaches


In [2]:
import random
import string

import matplotlib.pyplot as plt
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
# from spacy.pipeline.entity_linker import DEFAULT_NEL_MODEL
import spacy
import xgboost as xgb
from sklearn import ensemble, metrics, model_selection, naive_bayes
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score,mean_squared_error
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.model_selection import RandomizedSearchCV


In [3]:
%matplotlib inline
%pylab inline
import matplotlib.pyplot as plt


%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  warn("pylab import has clobbered these variables: %s"  % clobbered +


In [4]:
# !pip install spacy==2.2.4
# !python -m spacy download en_core_web_lg

In [5]:
df = pd.read_json("data.json")
# 1. data.json: List of 5086 various contributions, described by several attributes (features), e.g. names, information about the workplace of the author, its geolocation,
# and focus areas (key topics covered in contribution)
df_ground_truth = pd.read_json("ground-truth.json")
# 2. ground_truth.json: "Ground truth" - actual group s of contributions from the data file
# #(each contribution is assigned to a person)
df_person = pd.read_json("persons.json")  # do i need this to map back?
# 3. persons.json: The list of unique people.


In [6]:
# label df with ground_truth for easier work
df =df.join(df_ground_truth.set_index('contributionId'), on='contribution_id')

In [7]:
df['str_focus_areas'] = [','.join(map(str, l)) for l in df['focus_areas']]
df['str_gpes'] = [','.join(map(str, l)) for l in df['focus_areas']]
df['str_orgs'] = [' '.join(map(str, l)) for l in df['focus_areas']]

df["features"] = df[["first_name","middle_name","last_name","workplace","str_focus_areas","str_gpes","str_orgs"]].agg(' '.join, axis=1)


In [8]:
# contribution distribution
#df_ground_truth.groupby("personId").count().sort_values(by=['contributionId']).plot()


In [9]:
# With this data I don't have free text so remember entity linkers are going to fail more
# This maybe could benefit from a knowledge graph problem.
# Treating like wiki disambiguation 

In [10]:
#  focus only on one prior topics for now
# this does drop 1000 items
#df["focus_areas"] = df[df["focus_areas"].map(lambda d: len(d)) > 0]


In [11]:
# Let's just use names see what is missed
#nlp = spacy.load("en_core_web_lg")


In [12]:
df['cm_full_name'] = df[["first_name","middle_name","last_name"]].agg(' '.join, axis=1)


In [13]:
from sklearn.model_selection import train_test_split

author_mapping_dict = { label:idx for idx,label in enumerate(df["personId"])}
id_author_mapping_dict = dict((v, k) for k, v in author_mapping_dict.items())

d_labels = df["personId"].map(author_mapping_dict)

X_train, X_test, y_train, y_test = train_test_split(df["features"], d_labels, test_size=0.33, random_state=42)



In [14]:
# Unique persons
n_persons = len(df_person)

In [15]:
## Fit transform the tfidf vectorizer 
tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
full_tfidf = tfidf_vec.fit_transform(X_train.values.tolist() + X_test.values.tolist())
train_tfidf = tfidf_vec.transform(X_train.values.tolist())
test_tfidf = tfidf_vec.transform(X_test.values.tolist())


In [16]:
# raw tokens
# We do lose any order with those feature columns with below method however has no effect anyway.
vectorizer = CountVectorizer()
full_vec = vectorizer.fit_transform(X_train.values.tolist() + X_test.values.tolist())
train_vec = vectorizer.transform(X_train.values.tolist())
test_vec = vectorizer.transform(X_test.values.tolist())


In [17]:
# This won't give better performance as the features have no textual semantics in them and are just text labels.
# model = SentenceTransformer('distilbert-base-uncased')
# train_embeddings = model.encode(X_train.values.tolist())
# test_embeddings = model.encode(X_test.values.tolist())


In [18]:

               
# params to avoid overfitting
# model = RandomForestClassifier()
# model.fit(train_tfidf, y_train)

# #%%
# y_pred = model.predict(test_tfidf)
# print(f1_score(y_test, y_pred, average='macro'))
# print(accuracy_score(y_test, y_pred))

In [19]:
model = RandomForestClassifier(n_estimators=50, min_samples_leaf=25)

In [20]:
model.fit(train_vec, y_train)



RandomForestClassifier(min_samples_leaf=25, n_estimators=50)

In [21]:
y_pred = model.predict(test_vec)
print("RF")

print(f1_score(y_test, y_pred, average='macro'))
print(accuracy_score(y_test, y_pred))


RF
0.11687880468069527
0.48392857142857143


In [22]:
# Eval fit - how much overfitting? not really standardise labels. O
# mse_train = mean_squared_error(y_train, model.predict(train_vec))
# mse_test = mean_squared_error(y_test, y_pred)
# print("RF with full trees, Train MSE: {} Test MSE: {}".format(mse_train, mse_test))


In [23]:
model = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
# Found with testing adnom forest est approach and most likely because of the closely similar trees
n_estimators = [16,32,64,128]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf_random = RandomizedSearchCV(estimator = model,param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)


In [24]:
rf_random.fit(train_vec, y_train)
model = rf_random

Fitting 3 folds for each of 100 candidates, totalling 300 fits




[CV] END bootstrap=False, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=32; total time=   0.5s
[CV] END bootstrap=False, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=32; total time=   0.5s
[CV] END bootstrap=True, max_depth=90, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=128; total time=   1.5s
[CV] END bootstrap=False, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=32; total time=   0.6s
[CV] END bootstrap=True, max_depth=90, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=128; total time=   1.6s
[CV] END bootstrap=True, max_depth=90, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=128; total time=   1.6s
[CV] END bootstrap=True, max_depth=90, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=16; total time=   0.2s
[CV] END bootstrap=True, max_depth=90, max_fea

In [None]:
y_pred = model.predict(test_vec)
print("Random search + RF")
print(f1_score(y_test, y_pred, average='macro'))
print(accuracy_score(y_test, y_pred))


In [None]:
# do on original data for output reasns + sharing
d_vec = vectorizer.transform(df["features"].values.tolist())

np.argmax(model.predict_proba(d_vec),axis=1)

# res_prob = model.predict_proba(d_vec)


y_res = [id_author_mapping_dict[i] for i in d_labels]
# different metrics but might be able to get a conf out of a randomforrest
# y_conf = [res_prob[idx][m_i] for idx, m_i in enumerate([np.argmax(model.predict_proba(d_vec),axis=1)])][0]


In [None]:
df["pred"] = y_res


In [None]:
df.to_csv("full_output_matches.tsv",sep ='\t', index=False)


In [None]:
df[["index","contribution_id","features","personId","pred"]].to_csv("output_matches.tsv",sep ='\t', index=False)


In [None]:
df[["index","contribution_id","features","personId","pred"]][:25].to_csv("sample_matches.tsv",sep ='\t', index=False)