In [1]:
from glob import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy.linalg import norm
from collections import defaultdict
from scipy import spatial
from IPython.display import HTML, display
import tabulate
import json
import pickle
import re
import nltk

# Part 1: Feature Preparation

# Dataset Prep

### 1. Test set 

In [2]:
df = pd.read_excel("InputTestSet-Reviews48_Ann_NEW.xlsx")

In [3]:
df.head()

Unnamed: 0,UID,PID,Dec,Sent,MComp,Cat,SubCat
0,0,2019_SJf_XhCqKm,Reject,The authors propose to use k-DPP to select a s...,0,,
1,1,2019_SJf_XhCqKm,Reject,"This paper covers the related work nicely, wit...",0,,
2,2,2019_SJf_XhCqKm,Reject,The rest of the paper are also clearly written.,0,,
3,3,2019_SJf_XhCqKm,Reject,"However, I have some concerns about the propos...",0,,
4,4,2019_SJf_XhCqKm,Reject,"- It is not clear how to define the kernel, th...",0,,


In [4]:
df.shape

(1505, 7)

In [5]:
gt_dict = {}

for i in range(0, df.shape[0]):
    pid = df.loc[i]["PID"]
    if not pid in gt_dict:
        gt_dict[pid] = {"dec": df.loc[i]["Dec"], "mcomp": set(), "not_mcomp": set()}
    if df.loc[i]["MComp"] == 1:
        gt_dict[pid]["mcomp"].add(df.loc[i]["UID"])
    else:
        gt_dict[pid]["not_mcomp"].add(df.loc[i]["UID"])

In [6]:
stats_dict = {"Accept": [0, 0], "Reject": [0, 0]}

for k, v in gt_dict.items():
    #print(len(v["mcomp"]), len(v["not_mcomp"]), v["dec"])
    stats_dict[v["dec"]][0] += len(v["mcomp"])
    stats_dict[v["dec"]][1] += len(v["not_mcomp"])
    
print(stats_dict)

{'Accept': [49, 643], 'Reject': [68, 745]}


In [7]:
test_set = list(gt_dict.keys())
print("TestSet length: %d\n"%len(test_set), test_set)

TestSet length: 32
 ['2019_SJf_XhCqKm', '2017_Bk0MRI5lg', '2020_SyevYxHtDB', '2018_rJBiunlAW', '2020_rkltE0VKwH', '2018_Hki-ZlbA-', '2019_BJx0sjC5FX', '2020_r1e_FpNFDr', '2020_B1lsXREYvr', '2018_SkZxCk-0Z', '2019_rJzoujRct7', '2018_HkfXMz-Ab', '2017_BJ9fZNqle', '2019_SyxZJn05YX', '2017_B1ckMDqlg', '2017_HJ0NvFzxl', '2017_S1_pAu9xl', '2018_SyYYPdg0-', '2017_BJAA4wKxg', '2019_HyVxPsC9tm', '2019_HylTBhA5tQ', '2019_B1l08oAct7', '2018_H135uzZ0-', '2017_H1oyRlYgg', '2017_r1y1aawlg', '2020_r1eX1yrKwB', '2020_Byg79h4tvB', '2019_H1lFZnR5YX', '2020_BkeWw6VFwr', '2018_HyHmGyZCZ', '2018_HyUNwulC-', '2020_HkgsPhNYPS']


In [8]:
for k in test_set:
    print('{:20}{}'.format(k, gt_dict[k]["mcomp"]))

2019_SJf_XhCqKm     {39, 17, 27, 28, 30}
2017_Bk0MRI5lg      {57}
2020_SyevYxHtDB     {76, 87}
2018_rJBiunlAW      {108, 110, 112, 113, 124, 126}
2020_rkltE0VKwH     {160, 184, 155, 157, 159}
2018_Hki-ZlbA-      {267, 235, 236, 271}
2019_BJx0sjC5FX     {292, 287}
2020_r1e_FpNFDr     {312, 322, 315, 308}
2020_B1lsXREYvr     {376, 401}
2018_SkZxCk-0Z      {449, 443, 445, 486}
2019_rJzoujRct7     {518, 519}
2018_HkfXMz-Ab      {573, 566}
2017_BJ9fZNqle      {627, 623, 615}
2019_SyxZJn05YX     {672, 673, 657, 669, 671}
2017_B1ckMDqlg      {714, 707}
2017_HJ0NvFzxl      {739}
2017_S1_pAu9xl      {806, 808, 809, 810, 792}
2018_SyYYPdg0-      {834, 867, 868, 869, 870, 872, 873, 844, 830}
2017_BJAA4wKxg      {884}
2019_HyVxPsC9tm     {931, 933, 905, 909, 912, 913, 919, 926}
2019_HylTBhA5tQ     {972, 950}
2019_B1l08oAct7     {994, 996, 1064, 1004, 1007, 1044, 1047, 1048, 1055}
2018_H135uzZ0-      {1072, 1079}
2017_H1oyRlYgg      set()
2017_r1y1aawlg      {1125, 1162, 1100, 1102, 1168}
2020_r1eX

In [9]:
sents_for_test = defaultdict(list)

for i in range(0, df.shape[0]):
    pid = df.loc[i]["PID"]
    test_sent_raw = str(df.loc[i]["Sent"])
    
    # Replace URLs with [URL]
    test_sent_raw = re.sub(r'http[s]?://[a-zA-z\.\-/0-9~]*', "[URL]", test_sent_raw)
    test_sent_raw = re.sub(r'papers.nips.cc/paper/[a-zA-z\.\-/0-9~]*', "[URL]", test_sent_raw)
    test_sent_raw = re.sub(r'arxiv.org/[a-zA-z\.\-/0-9~]*', "[URL]", test_sent_raw)
    
    sents_for_test[pid].append((df.loc[i]["UID"], test_sent_raw))

### 2. Train Set

In [10]:
df_train = pd.read_excel("InputTrainSet-Reviews7_Ann.xlsx")
df_train.head()

Unnamed: 0,UID,PID,Dec,Sent,MComp,Cat,SubCat
0,243,2020_ryen_CEFwr,Reject,It extends this approach by introducing an add...,0,,
1,179,2018_H1LAqMbRW,Reject,"Experimentally, the results are rather weak co...",0,,
2,157,2017_HyTqHL5xg,Accept,The experiments are interesting but I'm still ...,0,,
3,146,2017_HyTqHL5xg,Accept,Section 2.2 says they do the latter in the int...,0,,
4,90,2017_ByToKu9ll,Reject,4)This paper proposed an improved version of t...,0,,


In [11]:
train_sets = {"mcomp": [], "non_mcomp": []}

In [12]:
for i in range(0, df_train.shape[0]):
    pid = df_train.loc[i]["PID"]
    train_sent_raw = str(df_train.loc[i]["Sent"])
    
    type_comp = df_train.loc[i]["MComp"]
    
    if type_comp == 1:
        train_sets["mcomp"].append(train_sent_raw)
    else:
        train_sets["non_mcomp"].append(train_sent_raw)

In [13]:
len(train_sets["mcomp"]), len(train_sets["non_mcomp"])

(26, 270)

## Load entities

In [14]:
with open("entities_dict_smaller", "r") as f:
    entity_dict = json.load(f)

In [15]:
set(entity_dict.values())

{'Material', 'Method', 'Metric', 'Task'}

In [16]:
list(entity_dict.items())[0:20]

[('convolutional neural networks', 'Method'),
 ('convnets', 'Method'),
 ('recognition', 'Task'),
 ('visual recognition tasks', 'Task'),
 ('age estimation', 'Task'),
 ('head pose estimation', 'Task'),
 ('multi - label classification', 'Task'),
 ('semantic segmentation', 'Task'),
 ('classification', 'Task'),
 ('deep convnets', 'Method'),
 ('dldl', 'Method'),
 ('feature learning', 'Task'),
 ('deep learning', 'Method'),
 ('image classification', 'Task'),
 ('deep learning methods', 'Method'),
 ('image classification tasks', 'Task'),
 ('human pose estimation', 'Task'),
 ('convnet', 'Method'),
 ('recognition tasks', 'Task'),
 ('ensemble', 'Method')]

In [17]:
entity_key_map = {}
for i in entity_dict:
    s = re.sub('[^0-9a-zA-Z,:;.?!\- ]+', '', i)
    while s.find("  ") > -1:
        s = s.replace("  ", " ")
    if len(s) > 2:
        cl = re.sub('[^0-9a-zA-Z ]+', '', i)
        while cl.find("  ") > -1:
            cl = cl.replace("  ", " ")
        entity_key_map[cl.strip()] = i
print(len(entity_key_map))

1784


In [18]:
coun = 0
for i in entity_dict:
    if len(i) < 5:
        coun +=1
#         print(i)
print(coun)

212


In [19]:
list(entity_key_map.items())[0:5]

[('convolutional neural networks', 'convolutional neural networks'),
 ('convnets', 'convnets'),
 ('recognition', 'recognition'),
 ('visual recognition tasks', 'visual recognition tasks'),
 ('age estimation', 'age estimation')]

In [20]:
from collections import Counter
c = Counter(entity_dict.values())
c

Counter({'Material': 165, 'Method': 1191, 'Metric': 158, 'Task': 289})

In [21]:
# dir(c)
reverse_map = defaultdict(list)

for k, v in entity_dict.items():
    reverse_map[v].append(k)

In [22]:
# for i in reverse_map["Task"]:
#     print(i)

In [23]:
"MNIST" in entity_key_map, "mnist" in entity_key_map

(False, True)

## A. RoBERTa trained on SciLit

In [24]:
import spacy
import torch

In [26]:
!pip3 list | grep -E 'transformers|tokenizers'

tokenizers (0.7.0)
transformers (2.10.0)


In [27]:
from transformers import AutoTokenizer, AutoModel

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [29]:
tokenizer = AutoTokenizer.from_pretrained("./trained_lm/MaskedRoBERTa/")
model = AutoModel.from_pretrained("./trained_lm/MaskedRoBERTa/")

In [30]:
import transformers
print(transformers.__version__)

2.10.0


In [31]:
def embed_text_using_roberta(text):
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)  # Batch size 1
    outputs = model(input_ids)
    last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    return last_hidden_states

In [32]:
def mask_entities(sentence, replace_with_dataset=True):
    cleaned_sent = re.sub('[^0-9a-zA-Z,:;.?!\- ]+', ' ', sentence)
    while cleaned_sent.find("  ") > -1:
        cleaned_sent = cleaned_sent.replace("  ", " ")
    
    entity_key_map_keys = list(entity_key_map.keys()) # As we will be dunamically adding entries to this dict an dthat will throw an error.
    entities_found = []
    for i in entity_key_map_keys:
        if cleaned_sent.find(" " + i + " ") > -1:
            entities_found.append(i)
        elif cleaned_sent.lower().find(" " + i + " ") > -1:
            found_idx = cleaned_sent.lower().find(" " + i + " ")
            entity_dict[cleaned_sent[found_idx:found_idx+len(" " + i + " ")]] = entity_dict[i]
            entity_key_map[cleaned_sent[found_idx:found_idx+len(" " + i + " ")]] = entity_key_map[i]
    
    entities_found.sort(key=lambda s: len(s))
    len_sorted_entities = entities_found.copy()
    
    subset_entities = []
    # Remove subset entities (eg: Among cnn and 3-layer-cnn, prefer the latter)
    for fe in len_sorted_entities:
        for other_ent in len_sorted_entities:
            if fe != other_ent and other_ent.find(fe) > -1:
                subset_entities.append(fe)
                break
    for se in subset_entities:
        len_sorted_entities.remove(se)
    for maxents in len_sorted_entities:
        mask_name = " " + entity_dict[entity_key_map[i]].lower() + " "
        if replace_with_dataset:
            if mask_name == " material ":
                mask_name = " dataset "
        cleaned_sent = cleaned_sent.replace(" " + maxents + " ", mask_name)
    words_cleaned = nltk.word_tokenize(cleaned_sent)
    dups_removed = [v for i, v in enumerate(words_cleaned) if i == 0 or v != words_cleaned[i-1]]
    new_dup_removed_sent = " ".join(dups_removed)
    return new_dup_removed_sent.strip()

#     #print(cleaned_sent)
#     for i in entity_key_map:
#         if cleaned_sent.find(" " + i + " ") > -1:
#             #print("Substituting ent: {} with mask: {}".format(i, entity_dict[entity_key_map[i]].lower()))
#             cleaned_sent = cleaned_sent.replace(i, entity_dict[entity_key_map[i]].lower())
#     return cleaned_sent

In [33]:
nlp = spacy.load('en_core_web_sm')
sp_toks = ["result", "method", "task", "dataset", "metric", "baseline", "fair", "unfair"]

In [34]:
def extract_chunks_using_spacy_dp(conssentence, replace_with_dataset=True):
    
    conssentence = mask_entities(conssentence, replace_with_dataset)
#     print(conssentence)
    doc = nlp(conssentence)
    verb_subtree = []

    for s in doc.sents:
#         find_special_tokens = {"compar": [], "result": [], "method": [], "technique": [], "task": [], "dataset": [], "material": [], "metric": []}
        find_special_tokens = {"compar": [], "result": [], "method": [], "baseline": [], "task": [], 
                               "dataset": [],  "metric": [], "unfair": [], "fair": []}

        for tok in s:

            if tok.text.lower().startswith("compar"):
                find_special_tokens["compar"].append(tok)
            else:
                for k in sp_toks:
                    if tok.text.lower().startswith(k):
                        find_special_tokens[k].append(tok)
                        break

        verb_tokens = []
        if find_special_tokens["compar"]:
            for t in find_special_tokens["compar"]:
#                     verb_subtree.append(t.subtree)
                if t == s.root:
                    simplified_sent = ""
                    for chh in t.lefts:
                        simplified_sent = simplified_sent + " " + chh.text
                    simplified_sent = simplified_sent + " " + t.text
                    for chh in t.rights:
                        simplified_sent = simplified_sent + " " + chh.text
#                         print("SIMP: ", simplified_sent)
                    verb_subtree.append(simplified_sent)
                else:
                    verb_subtree.append(t.subtree)
        else:
            for k in sp_toks:
                for i in find_special_tokens[k]:
                    local_vt = []
                    for j in i.ancestors:
                        if j.pos_ == "NOUN":
                            local_vt.append(j)
                    if not local_vt:
                        for j in i.ancestors:
                            if j.pos_ == "VERB":
                                local_vt.append(j)
                    verb_tokens = verb_tokens + local_vt


            for i in verb_tokens:
                verb_subtree.append(i.subtree)

    eecc = []
    for i in verb_subtree:
        if type(i) == str:
            eecc.append(i)
        else:
            local_chunk = ""
            for lcaltok in i:
                local_chunk = local_chunk + " " + lcaltok.text
            eecc.append(local_chunk)
#     if not eecc:
#         print(conssentence)
    return list(set(eecc))


### Computing vectors of the initial training pool of sentences

In [35]:
train_pool_roberta_vecs = {"mcomp": [], "non_mcomp": []}
single_train_pool_roberta_vecs = {"mcomp": [], "non_mcomp": []}
train_pool_uid_vecs = defaultdict(list)
mc_nmc_fake = {}

In [36]:
fake_idx = 0

for i in train_sets["mcomp"]:
    mcomp_chunks_from_sent = extract_chunks_using_spacy_dp(i)
    if mcomp_chunks_from_sent:
        final_chunks = mcomp_chunks_from_sent
    else:
        final_chunks = [i]
    
    mc_nmc_fake[fake_idx] = 1
    for single_chunk in final_chunks:
        vec = embed_text_using_roberta(single_chunk.strip()).mean(1).detach().numpy()
        train_pool_uid_vecs[fake_idx].append(vec / norm(vec))
        train_pool_roberta_vecs["mcomp"].append(vec/norm(vec))
    
    collated_chunk = " ".join(final_chunks)
    vec = embed_text_using_roberta(collated_chunk.strip()).mean(1).detach().numpy()
    single_train_pool_roberta_vecs["mcomp"].append(vec/norm(vec))
    fake_idx += 1


for i in train_sets["non_mcomp"]:
    mcomp_chunks_from_sent = extract_chunks_using_spacy_dp(i)
    if mcomp_chunks_from_sent:
        final_chunks = mcomp_chunks_from_sent
    else:
        final_chunks = [i]
    
    mc_nmc_fake[fake_idx] = 0
    for single_chunk in final_chunks:
        vec = embed_text_using_roberta(single_chunk.strip()).mean(1).detach().numpy()
        train_pool_uid_vecs[fake_idx].append(vec / norm(vec))
        train_pool_roberta_vecs["non_mcomp"].append(vec/norm(vec))
    
    collated_chunk = " ".join(final_chunks)
    vec = embed_text_using_roberta(collated_chunk.strip()).mean(1).detach().numpy()
    single_train_pool_roberta_vecs["non_mcomp"].append(vec/norm(vec))
    fake_idx +=1

In [37]:
roberta_vectors = defaultdict(dict)

skip_uids = []

for pid in gt_dict:
    roberta_vectors[pid] = {}
    
    for mcs in gt_dict[pid]["mcomp"]:
        try:
            mcomp_chunks_from_sent = extract_chunks_using_spacy_dp(df.loc[mcs]["Sent"])
            if mcomp_chunks_from_sent:
                final_chunks = mcomp_chunks_from_sent
            else:
                final_chunks = [df.loc[mcs]["Sent"]]
            
            roberta_vectors[pid][mcs] = []
            for single_chunk in final_chunks:
                vec = embed_text_using_roberta(single_chunk.strip()).mean(1).detach().numpy()
                roberta_vectors[pid][mcs].append(vec / norm(vec))
        except Exception as ex:
            print(pid, mcs, df.loc[mcs]["Sent"])
            skip_uids.append(mcs)
    
    for mcs in gt_dict[pid]["not_mcomp"]:
        try:
            mcomp_chunks_from_sent = extract_chunks_using_spacy_dp(df.loc[mcs]["Sent"])
            if mcomp_chunks_from_sent:
                final_chunks = mcomp_chunks_from_sent
            else:
                final_chunks = [df.loc[mcs]["Sent"]]
            
            roberta_vectors[pid][mcs] = []
            for single_chunk in final_chunks:
                vec = embed_text_using_roberta(single_chunk.strip()).mean(1).detach().numpy()
                roberta_vectors[pid][mcs].append(vec / norm(vec))
        except Exception as ex:
            print(pid, mcs, df.loc[mcs]["Sent"])
            skip_uids.append(mcs)

2019_HyVxPsC9tm 938 2
2019_HyVxPsC9tm 940 3
2020_Byg79h4tvB 1272 [1] Conditional adversarial domain adaptation, Long et.al, in NeurIPS 2018
[2] Towards Accurate Model Selection in Deep Unsupervised Domain Adaptation, You et.al, in ICML 2019
2018_HyHmGyZCZ 1425 2


In [38]:
mcomp_sentences = {}
not_mcomp_sentences = {}

for pid in gt_dict:
    for mcs in gt_dict[pid]["mcomp"]:
        if not mcs in skip_uids:
            mcomp_sentences[mcs] = pid
    for mcs in gt_dict[pid]["not_mcomp"]:
        if not mcs in skip_uids:
            not_mcomp_sentences[mcs] = pid
print(len(mcomp_sentences), len(not_mcomp_sentences))

117 1384


In [39]:
testdf = df.copy()

xtest = testdf.drop(columns=["PID", "Dec", "MComp", "Sent", "Cat", "SubCat"])
ytest = testdf.drop(columns=["PID", "Dec", "Sent", "Cat", "SubCat"])
# print(xtest.head())
# print(ytest.head())

for i in range(1, 769):
    xtest[i] = np.nan

for pid in gt_dict:
    for mcs in gt_dict[pid]["mcomp"]:
        if not mcs in skip_uids:
            xtest.iloc[mcs] = [mcs] + list(np.mean(roberta_vectors[pid][mcs], axis=0)[0])
        else:
            xtest.iloc[mcs] = [mcs] + list(np.zeros(768))
    for mcs in gt_dict[pid]["not_mcomp"]:
        if not mcs in skip_uids:
            xtest.iloc[mcs] = [mcs] + list(np.mean(roberta_vectors[pid][mcs], axis=0)[0])
        else:
            xtest.iloc[mcs] = [mcs] + list(np.zeros(768))
print(xtest.shape)

(1505, 769)


In [40]:
xtest.head(2)

Unnamed: 0,UID,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,768
0,0,-0.044279,-0.010189,-0.064058,-0.032506,0.027601,-0.064169,0.014854,-0.023475,-0.013241,...,0.034727,-0.000336,0.027505,-0.018115,-0.007931,-0.00222,-0.002881,0.052994,-0.018027,-0.030845
1,1,0.028705,-0.027478,-0.013696,-0.000318,0.057166,-0.04013,0.010131,-0.084298,-0.021699,...,0.024411,-0.001754,0.025493,0.012802,-0.002789,-0.048744,-0.01441,0.047568,0.048159,0.003012


In [41]:
# xtrain = pd.DataFrame(columns=["UID"]+[str(x) for x in range(1,769)])
# ytrain = pd.DataFrame(columns=["UID", "MComp"])
xtlist = []
ytlist = []

for i in train_pool_uid_vecs:
    xtlist.append([i] + list(np.mean(train_pool_uid_vecs[i], axis=0)[0]))
    ytlist.append([i, mc_nmc_fake[i]])

xtrain = pd.DataFrame(xtlist)
ytrain = pd.DataFrame(ytlist)

xtrain = xtrain.rename(columns={0: 'UID'})
ytrain = ytrain.rename(columns={0: 'UID', 1: 'MComp'})

print(xtrain.shape)
print(ytrain.shape)

(296, 769)
(296, 2)


In [42]:
xtrain.head(2)

Unnamed: 0,UID,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,768
0,0,-0.029635,-0.021564,0.010732,0.001167,0.037215,-0.058719,0.030839,-0.039269,-0.05062,...,0.013205,0.000783,0.059013,0.002711,-0.026276,0.01793,0.014949,0.030483,0.00098,0.015209
1,1,0.025604,-0.034454,0.010054,0.001458,0.009373,-0.05894,0.070672,-0.067199,-0.111141,...,0.031087,0.023948,0.01481,0.003388,0.003812,-0.022218,-0.029899,0.052838,0.003724,0.015341


In [43]:
ytrain.head(2)

Unnamed: 0,UID,MComp
0,0,1
1,1,1


## ======================================================================

In [45]:
# Preprocessing & results----------------
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# nlp preprocessing
import spacy

# Models-------------------------
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
import sklearn.gaussian_process.kernels as kls
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier

# for visualizing ---------------
from sklearn import tree
from six import StringIO 
from IPython.display import Image, display
import seaborn as sns
import graphviz
import matplotlib.pyplot as plt

# General purpose
import re
import pandas as pd
import pickle
import numpy as np
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [46]:
np.random.seed(42)

In [47]:
clf_dict = {
    'DecisionTree': {"model": DecisionTreeClassifier(random_state=42), "params": {'max_depth': list(range(10, 250, 20))}},
    'RandomForest': {"model": RandomForestClassifier(random_state=42),
                     "params": {'n_estimators': list(range(5, 100, 5)), 'max_depth': list(range(10, 250, 20))}},
    'LogisticR_L1': {"model": LogisticRegression(random_state=42, max_iter=1000),
                     "params": {'penalty': ['l1'], 'solver': ['liblinear', 'saga']}},
    'LogisticR_L2': {"model": LogisticRegression(random_state=42, max_iter=1000),
                     "params": {'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}},
    'LogisticR': {"model": LogisticRegression(random_state=42, max_iter=1000),
                  "params": {'penalty': ['none'], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']}},
    'RidgeClf': {"model": RidgeClassifier(max_iter=1000), "params": {}},
    'SVC_linear': {"model": SVC(random_state=42), "params": {'kernel': ['linear'], 
                                                             'C': [0.5, 1.0, 1.5, 2.0, 2.5]}},
    'SVC_poly': {"model": SVC(random_state=42),
                 "params": {'kernel': ['poly'], 'degree': [3, 4, 5], 'gamma': ['scale', 'auto'], 
                            'C': [0.5, 1.0, 1.5, 2.0, 2.5]}},
    'SVC_others': {"model": SVC(random_state=42), "params": {'kernel': ['rbf', 'sigmoid'], 
                                                             'gamma': ['scale', 'auto'], 
                                                             'C': [0.5, 1.0, 1.5, 2.0, 2.5]}},
    'GussianNB': {"model": GaussianNB(), "params": {}},
    'KNN': {"model": KNeighborsClassifier(), "params": {'n_neighbors': list(range(1, 20))}},
    'GaussianProcessClf': {"model": GaussianProcessClassifier(random_state=42, kernel=kls.RBF()), "params": {}},
    'Bagging_SVC': {"model": BaggingClassifier(random_state=42), "params": {'n_estimators': list(range(5, 100, 5)),
                                                                            'base_estimator': [SVC(kernel='linear'),
                                                                                               SVC(kernel='poly',
                                                                                                   degree=3,
                                                                                                   gamma='scale')]}},
    'BaggingDT': {"model": BaggingClassifier(random_state=42), "params": {'n_estimators': list(range(5, 100, 5)),
                                                                          'base_estimator': [
                                                                              DecisionTreeClassifier(random_state=42,
                                                                                                     max_depth=10),
                                                                              DecisionTreeClassifier(random_state=42,
                                                                                                     max_depth=50),
                                                                              DecisionTreeClassifier(random_state=42,
                                                                                                     max_depth=100)]}},
    'AdaBoost': {"model": AdaBoostClassifier(random_state=42), "params": {'n_estimators': list(range(5, 100, 5)),
                                                                          'base_estimator': [DecisionTreeClassifier(
                                                                                                 random_state=42,
                                                                                                 max_depth=10),
                                                                                             DecisionTreeClassifier(
                                                                                                 random_state=42,
                                                                                                 max_depth=50),
                                                                                             DecisionTreeClassifier(
                                                                                                 random_state=42,
                                                                                                 max_depth=100)]}},
    'ExtraTrees': {"model": ExtraTreesClassifier(random_state=42), "params": {'n_estimators': list(range(5, 105, 5)), 
                                                                              'max_depth': [10, 50, 100, 250, 400]}},
    'MLP_l1': {"model": MLPClassifier(random_state=42), "params": {'hidden_layer_sizes': [(x,) for x in 
                                                                                          range(50, 600, 100)], 
                                                                  'activation': ['logistic', 'tanh', 'relu'],
                                                                  'solver': ['adam', 'sgd'], 'early_stopping': 
                                                                   [True]}},
    'MLP_l2': {"model": MLPClassifier(random_state=42), "params": {'hidden_layer_sizes': [(x, y) for x in 
                                                                                          range(50, 600, 100) 
                                                                                          for y in range(50, 360, 100)], 
                                                                  'activation': ['logistic', 'tanh', 'relu'],
                                                                  'solver': ['adam', 'sgd'], 'early_stopping': 
                                                                                               [True]}},
#     'MLP_l3': {"model": MLPClassifier(random_state=42), "params": {'hidden_layer_sizes': [(x, y, z) for x in 
#                                                                                           range(50, 600, 100) 
#                                                                                           for y in range(50, 600, 100)
#                                                                                           for z in range(50, 360, 100)], 
#                                                                   'activation': ['logistic', 'tanh', 'relu'],
#                                                                   'solver': ['adam', 'sgd'], 'early_stopping': 
#                                                                                                [True]}},
    }


In [48]:
# model_results = pd.DataFrame()
# model_results['Train_Accuracy'] = None
# model_results['Test_Accuracy'] = None
# model_results['best_params'] = None

# # X_train_final = X_train_normalized.drop(columns=["ref_latest"])
# # X_test_normalized_remgsdata = X_test_normalized.drop(columns=["ref_latest"])
# # X_train_normalized_remgsdata = X_train_normalized.copy()
# # X_test_normalized_remgsdata = X_test_normalized.copy()

# xtrain_final = xtrain.drop(columns=["UID"])
# ytrain_final = ytrain.drop(columns=["UID"])

# xtest_final = xtest.drop(columns=["UID"])
# ytest_final = ytest.drop(columns=["UID"])


# best_clf_ours = None
# best_clf_val = 0

# for clf_name, clf in clf_dict.items():
#     classifier = GridSearchCV(clf['model'], clf['params'], n_jobs=5)
#     classifier.fit(xtrain_final, ytrain_final)
#     best_model = classifier.best_estimator_
#     print(clf_name, classifier.best_score_, classifier.best_params_)
    
#     y_predicted = best_model.predict(xtest_final)
#     test_acc = accuracy_score(ytest_final, y_predicted)
    
#     if test_acc > best_clf_val:
#         best_clf_val = test_acc
#         best_clf_ours = best_model
    
#     model_results.loc[clf_name, ['Train_Accuracy', 'Test_Accuracy', 'best_params']] = [classifier.best_score_, test_acc, classifier.best_params_]
#     clsr = classification_report(ytest_final, y_predicted)

# print("================================================================================")
# print(best_clf_ours)
# best_y_hat = best_clf_ours.predict(xtest_final)
# clsr = classification_report(ytest_final, best_y_hat)
# print(clsr)
# test_acc = accuracy_score(ytest_final, best_y_hat)
# print("Test acc:", test_acc )
# print("Weighted F1 score: ", f1_score(ytest_final, best_y_hat, average='weighted'))

In [49]:
# test_acc

In [50]:
# precision_recall_fscore_support(ytest_final, best_y_hat, average='macro')[2]

In [51]:
from sklearn.metrics import precision_recall_fscore_support

In [52]:
model_results = pd.DataFrame()
model_results['Train_Accuracy'] = None
model_results['Test_Accuracy'] = None
model_results['best_params'] = None

# X_train_final = X_train_normalized.drop(columns=["ref_latest"])
# X_test_normalized_remgsdata = X_test_normalized.drop(columns=["ref_latest"])
# X_train_normalized_remgsdata = X_train_normalized.copy()
# X_test_normalized_remgsdata = X_test_normalized.copy()

xtrain_final = xtrain.drop(columns=["UID"])
ytrain_final = ytrain.drop(columns=["UID"])

xtest_final = xtest.drop(columns=["UID"])
ytest_final = ytest.drop(columns=["UID"])


best_clf_ours = None
best_clf_val = 0

for clf_name, clf in clf_dict.items():
    classifier = GridSearchCV(clf['model'], clf['params'], n_jobs=5)
    classifier.fit(xtrain_final, ytrain_final)
    best_model = classifier.best_estimator_
    print(clf_name, classifier.best_score_, classifier.best_params_)
    
    y_predicted = best_model.predict(xtest_final)
    test_acc_macro = precision_recall_fscore_support(ytest_final, y_predicted, average='macro')[2]#accuracy_score(ytest_final, y_predicted)
    
    if test_acc_macro > best_clf_val:
        best_clf_val = test_acc_macro
        best_clf_ours = best_model
    
    model_results.loc[clf_name, ['Train_Accuracy', 'Test_Accuracy', 'best_params']] = [classifier.best_score_, test_acc_macro, classifier.best_params_]
    clsr = classification_report(ytest_final, y_predicted)

print("================================================================================")
print(best_clf_ours)
best_y_hat = best_clf_ours.predict(xtest_final)
clsr = classification_report(ytest_final, best_y_hat)
print(clsr)
test_acc = accuracy_score(ytest_final, best_y_hat)
print("Test acc:", test_acc )
print("Weighted F1 score: ", f1_score(ytest_final, best_y_hat, average='weighted'))

DecisionTree 0.8547297297297297 {'max_depth': 10}
RandomForest 0.918918918918919 {'max_depth': 30, 'n_estimators': 45}
LogisticR_L1 0.9121621621621622 {'penalty': 'l1', 'solver': 'liblinear'}
LogisticR_L2 0.9121621621621622 {'penalty': 'l2', 'solver': 'newton-cg'}
LogisticR 0.9087837837837838 {'penalty': 'none', 'solver': 'lbfgs'}
RidgeClf 0.9155405405405406 {}
SVC_linear 0.9121621621621622 {'C': 0.5, 'kernel': 'linear'}
SVC_poly 0.9222972972972973 {'C': 2.0, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
SVC_others 0.9222972972972973 {'C': 2.5, 'gamma': 'scale', 'kernel': 'rbf'}
GussianNB 0.9054054054054054 {}
KNN 0.9222972972972973 {'n_neighbors': 3}
GaussianProcessClf 0.9121621621621622 {}
Bagging_SVC 0.9155405405405406 {'base_estimator': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False), 