In [1]:
%autosave 60
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
import sklearn

from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE

import pickle
import os
import csv
import pandas as pd
import pickle
import itertools

Autosaving every 60 seconds
Automatically created module for IPython interactive environment


In [2]:
# Notebook configuration: split fractions, feature type, and input/output file locations.

# Fractions of the rated samples held out for the test and dev splits.
TEST_SPLIT = 0.10
DEV_SPLIT = 0.10
FEATURE_TYPE = 'deepmag' #['raw', 'deepmag']

# Output files for the pairwise ranking datasets; they are joined with INPUT_DIR when saved.
TRAIN_DATASET_FOR_RANKING = "data/task2_ranking_train_dataset.pkl"
TEST_DATASET_FOR_RANKING = "data/task2_ranking_test_dataset.pkl"
DEV_DATASET_FOR_RANKING = "data/task2_ranking_dev_dataset.pkl"
FULL_DATASET_FOR_RANKING = "data/task2_ranking_all_dataset.pkl"

if FEATURE_TYPE == 'raw':
    # NOTE(review): this branch does not define INPUT_DIR, survey_file, feature_file,
    # superpd_survey_file or rating_file, which the reader functions of this notebook use.
    # Confirm whether the 'raw' setting is still supported.
    DATA_DIR = "C:/Wasif/PDMotorFeatureExtraction/TASK2_FEATURES_04_21/"
    X_file = "x_repeat_removed_raw_pixels.npy"
    y_file = "y_repeat_removed_raw_pixels.npy"
    X_index_file = "index_repeat_removed.pickle"

elif FEATURE_TYPE == 'deepmag':
    #INPUT_DIR = "E:/Wasif/PDMotorFeatureExtraction/"
    # Root directory for all input CSV/pickle files; machine-specific absolute path.
    INPUT_DIR = "E:/Saiful/park_motor/"
    # Fail early if the input directory is not present on this machine.
    assert(os.path.exists(INPUT_DIR))
    
    # Input files, given relative to INPUT_DIR.
    survey_file = "data/survey_april 9th 2021.csv"
    feature_file = "DeepMAGFeatures/deepMAG_features.pickle"
    superpd_survey_file = "data/SUPERPD_DATA_2020-03-24_1110.csv"
    
    #DATA_DIR = "E:/Wasif/PDMotorFeatureExtraction/Task2_features_deepMAG/"
    DATA_DIR = "E:/Saiful/park_motor/Task2_features_deepMAG/"
    X_file = "x_repeat_removed_deepmag.npy"
    y_file = "y_repeat_removed_deepmag.npy"
    X_index_file = "index_repeat_removed.pickle"
    
    rating_file = "data/task2_dr_saloni_rating.csv"
    
# NOTE(review): IMG_HEIGHT / IMG_WIDTH are not referenced in the visible part of this notebook.
IMG_HEIGHT = 492
IMG_WIDTH = 492

In [3]:
#Rating CSV file from Dr. Saloni.
def read_rating_data():
    '''
    Load the rating CSV located at INPUT_DIR/rating_file.

    Returns the file contents as a pandas DataFrame, unmodified.
    '''
    return pd.read_csv(os.path.join(INPUT_DIR, rating_file))

#PARK users' self reported survey: PD/Non-PD, diagnosis time, medication
def read_survey(list_columns):
    '''
    Load the PARK survey CSV located at INPUT_DIR/survey_file.

    list_columns: column names to keep, in the order they should appear.
    Returns a DataFrame restricted to those columns.
    '''
    csv_path = os.path.join(INPUT_DIR, survey_file)
    return pd.read_csv(csv_path)[list_columns]

#SuperPARK users' profile, obtained from clinic. 
def read_superpd_survey(list_columns):
    '''
    Load the SuperPARK survey CSV located at INPUT_DIR/superpd_survey_file.

    list_columns: column names to keep, in the order they should appear.
    Returns a DataFrame restricted to those columns.
    '''
    csv_path = os.path.join(INPUT_DIR, superpd_survey_file)
    return pd.read_csv(csv_path)[list_columns]

#Read extracted features (DeepMAG frequency features)
def read_features():
    '''
    Unpickle and return the feature object stored at INPUT_DIR/feature_file.

    NOTE: pickle.load can execute arbitrary code, so only point this at
    feature files from a trusted source.
    '''
    with open(os.path.join(INPUT_DIR, feature_file), 'rb') as pickled:
        return pickle.load(pickled)

## Process SuperPD Survey

In [4]:
def mins_to_hour(row):
    '''
    Convert the "last-medication" value of a row from minutes to hours.

    row: mapping (e.g. a DataFrame row) with a numeric "last-medication" entry.
    Returns the value divided by 60.0, or the NaN itself when it is missing.
    '''
    minutes = row["last-medication"]
    return minutes if np.isnan(minutes) else minutes / 60.0
    
def num_to_yesno(row):
    '''
    Translate the numeric "diagnosed" value of a row into "yes"/"no".

    row: mapping (e.g. a DataFrame row) with a numeric "diagnosed" entry.
    Returns "yes" when the value equals 1.0, "no" for any other number,
    and the NaN itself when the value is missing.
    '''
    flag = row["diagnosed"]
    if np.isnan(flag):
        return flag
    return "yes" if flag == 1.0 else "no"

def concat_for_unique_id(row):
    '''
    Build the per-visit id "<guid>-task2-<visitdate>" for a row.

    row: mapping (e.g. a DataFrame row) with "guid" and "visitdate" entries;
         both are converted with str() before joining.
    '''
    parts = (str(row['guid']), str(row['visitdate']))
    return "-task2-".join(parts)

#Read SuperPARK users clinical record
# Load the SuperPARK clinical record, keeping only the columns needed below.
superpd_data = read_superpd_survey(['guid', 'visitdate', 'redcap_event_name', 
                                    'cohort', 'mdsupdrsptntprknsnmedind_2', 'mdsupdrsptclinstateprknsnm_2', 
                                    'mdsupdrsptntuseldopaind_2', 'mdsupdrslstldopadosetm_2'])
# Give the MDS-UPDRS columns readable names.
superpd_data = superpd_data.rename(columns={"mdsupdrsptntprknsnmedind_2": "on_pd_medication", "mdsupdrsptclinstateprknsnm_2": "on_off_state", "mdsupdrsptntuseldopaind_2": "on_levodopa", "mdsupdrslstldopadosetm_2":"last-medication"})
# Column reference:
#   guid = unique user id
#   cohort = diagnosis result (1: PD / 0: Non-PD)
#   redcap_event_name = screening (first visit), baseline (second visit), month6_visit (third visit)
#   mdsupdrsptntprknsnmedind_2 --> Is the patient on medication for treating the symptoms of Parkinson's Disease?
#   mdsupdrsptclinstateprknsnm_2 --> If the patient is receiving medication for treating the symptoms of
#       Parkinson's Disease, the patient's clinical state using the following definitions:
#         0, ON (On is the typical functional state when patients are receiving medication and have a good response)
#         1, OFF (Off is the typical functional state when patients have a poor response in spite of taking medications.)
#   mdsupdrsptntuseldopaind_2 --> Is the patient on Levodopa?
#   mdsupdrslstldopadosetm_2 --> If the patient is on Levodopa, minutes since last levodopa dose

# Non-numeric entries become NaN.
superpd_data["last-medication"] = pd.to_numeric(superpd_data["last-medication"], errors='coerce')
superpd_data["on_off_state"] = pd.to_numeric(superpd_data["on_off_state"], errors='coerce')
# The SuperPD survey records this value in minutes, the normal PD survey in hours; convert to hours.
superpd_data["last-medication"] = superpd_data.apply(mins_to_hour, axis=1)

# Some users' cohort is N/A during the baseline visit but valid in the screening visit.
# If a user is assigned PD/Non-PD in any visit, assign that status to all of the user's visits:
# missing values are replaced by -1.0 and the per-guid maximum is joined back as 'cohort_corrected'.
superpd_data['cohort']=superpd_data['cohort'].fillna(-1.0)
superpd_data = superpd_data.join(superpd_data.groupby('guid')['cohort'].max(), on='guid', rsuffix='_corrected')
# One user is not assigned a status. Just ignore.
# NOTE(review): such a user keeps -1.0 here, which num_to_yesno below maps to "no" rather than NaN.
# Confirm this is intended.
#superpd_data[superpd_data['cohort_corrected']==-1]
superpd_data = superpd_data.rename(columns={"cohort_corrected": "diagnosed"})

# Convert to yes/no to match the normal PARK survey data.
superpd_data["diagnosed"] = superpd_data.apply(num_to_yesno, axis=1)

# Make an id similar to the normal PD survey: "<guid>-task2-<visitdate>".
superpd_data["id"] = superpd_data.apply(concat_for_unique_id, axis=1)

# Drop the columns that were only needed to derive 'diagnosed' and 'id'.
superpd_data = superpd_data.drop(columns = ['cohort', 'on_pd_medication', 'on_levodopa', 'redcap_event_name', 'guid', 'visitdate'])

# Count instances where participants are in the 'on' state and under the effect of medication
# (last dose between 0.75 and 3.0 hours earlier):
#superpd_data[(superpd_data["on_off_state"]==0) & (superpd_data["diagnosed"]=='yes') & ((superpd_data['last-medication']>=0.75) & (superpd_data['last-medication']<=3.0))].count()
superpd_data

Unnamed: 0,on_off_state,last-medication,diagnosed,id
0,,,yes,NIHAF261TBMPV-task2-2019-09-17
1,0.0,0.750000,yes,NIHAF261TBMPV-task2-2019-10-08
2,,,no,NIHAG749MLBRQ-task2-2020-01-13
3,,,no,NIHAG749MLBRQ-task2-2020-02-05
4,,,yes,NIHAV871KZCVE-task2-2019-08-02
...,...,...,...,...
157,,,no,NIHYW557MLDFE-task2-2020-03-20
158,,,no,NIHZT156UUPLX-task2-2020-01-21
159,,,no,NIHZT156UUPLX-task2-2020-02-14
160,,,yes,NIHZY217YWJA8-task2-2020-02-18


### Findings

1. There are 38 instances where participants are in the 'on' state and under the effect of medication (last dose taken 0.75–3 hours earlier).

## Process Normal Users Survey

In [5]:
# Read the survey CSV of normal PARK users.
# 'repeat' means the user has already performed the test before. We are removing such cases.
survey_data = read_survey(['id','diagnosed','repeat', 'year-of-diagnosis', 'last-medication'])
survey_data = survey_data[survey_data["repeat"]!='yes']
# The 'repeat' column is no longer needed.
survey_data = survey_data.drop(columns = ["repeat"])
# Make the ID consistent with the feature_extraction pipeline by appending the task suffix.
survey_data["id"] = survey_data["id"] + "-task2"
# Non-numeric entries become NaN.
survey_data["year-of-diagnosis"] = pd.to_numeric(survey_data["year-of-diagnosis"], errors='coerce')
survey_data["last-medication"] = pd.to_numeric(survey_data["last-medication"], errors='coerce')
survey_data

Unnamed: 0,id,diagnosed,year-of-diagnosis,last-medication
0,2018-10-24T19-20-17-709Z8-task2,no,,
1,2018-09-19T22-50-04-770Z13-task2,yes,2015.0,4.0
2,2018-10-30T20-09-29-976Z87-task2,no,,
3,2018-09-11T16-58-31-717Z92-task2,yes,2016.0,
4,2018-11-01T20-59-19-606Z70-task2,no,,
...,...,...,...,...
1083,2017-10-13T05-38-01-106Z41-task2,yes,2010.0,
1084,2020-09-03T18-01-57-790Z80-task2,no,,
1085,2020-02-23T22-29-06-220Z56-task2,no,,
1086,2018-09-18T01-16-29-987Z51-task2,yes,2008.0,4.0


## Combine SuperPARK and PARK survey

In [6]:
# Stack the PARK and SuperPARK survey rows. join='outer' keeps the union of columns,
# so columns present in only one source are NaN for rows of the other.
# The original row labels of both frames are kept (ignore_index is not set).
combined_survey_data = pd.concat([survey_data, superpd_data], join='outer')
combined_survey_data

Unnamed: 0,id,diagnosed,year-of-diagnosis,last-medication,on_off_state
0,2018-10-24T19-20-17-709Z8-task2,no,,,
1,2018-09-19T22-50-04-770Z13-task2,yes,2015.0,4.000000,
2,2018-10-30T20-09-29-976Z87-task2,no,,,
3,2018-09-11T16-58-31-717Z92-task2,yes,2016.0,,
4,2018-11-01T20-59-19-606Z70-task2,no,,,
...,...,...,...,...,...
157,NIHYW557MLDFE-task2-2020-03-20,no,,,
158,NIHZT156UUPLX-task2-2020-01-21,no,,,
159,NIHZT156UUPLX-task2-2020-02-14,no,,,
160,NIHZY217YWJA8-task2-2020-02-18,yes,,,


## Process Ratings from Dr. Saloni Sharma

In [7]:
rating_data = read_rating_data()
# Strip the trailing 5 characters ('.webm') so 'File_name' lines up with the survey 'id'.
rating_data["File_name"] = rating_data["File_name"].str[:-5]
# Some cells hold a comment from Dr. Saloni instead of a rating; coerce those to NaN.
for side in ('Right', 'Left'):
    rating_data[side] = pd.to_numeric(rating_data[side], errors='coerce')
# The 'Rating' column was never populated with ratings; it contains only comments.
rating_data = rating_data.drop(columns = ['Rating'])
# Check how many ratings are available
#rating_data[(rating_data["Right"]>=0.0) & (rating_data["Right"]<=4.0)].shape #(186, 3)
#rating_data[(rating_data["Left"]>=0.0) & (rating_data["Left"]<=4.0)].shape #(187,3)

# Keep only the videos that have both a Left and a Right rating.
has_both_ratings = rating_data["Left"].notna() & rating_data["Right"].notna()
rating_data = rating_data[has_both_ratings]
rating_data

Unnamed: 0,File_name,Right,Left
0,2017-10-13T17-22-56-936Z44-task2,0.0,1.0
1,2017-10-15T14-47-50-369Z6-task2,1.0,0.0
2,2017-10-15T15-22-00-517Z19-task2,1.0,1.0
4,2017-11-15T22-04-09-657Z62-task2,0.0,0.0
5,2017-11-17T19-41-40-058Z57-task2,0.0,0.0
...,...,...,...
195,NIHYA889LELYV-task2-2019-08-22T13-21-34-659Z-,1.0,0.0
196,NIHYA889LELYV-task2-2020-02-17T14-30-55-008Z-,0.0,0.0
197,NIHYM875FLXFF-task2-2020-03-02T18-39-31-044Z-,1.0,0.0
198,NIHYT60IGVTH5-task2-2019-10-28T14-58-20-600Z-,1.0,0.0


## Read Extracted Features (DeepMAG Frequency Features)

In [8]:
features = read_features()
# One row per key of the loaded features; the row index is named 'id'.
features_df = pd.DataFrame.from_dict(features, orient='index')
features_df = features_df.rename_axis('id')
# Only the frequency components are used downstream.
features_df = features_df[["frequency_components"]]
features_df

Unnamed: 0_level_0,frequency_components
id,Unnamed: 1_level_1
2017-08-18T14-59-52-530Z49-task2,"[4795.407543734973, 87.44582005686378, -21.629..."
2017-08-18T15-24-14-004Z53-task2,"[4840.8609913082555, 72.6660925449257, 98.4037..."
2017-08-22T02-01-21-948Z87-task2,"[6229.424923436179, 132.90561812700153, 82.124..."
2017-09-22T18-38-44-872Z33-task2,"[3657.462895487711, -136.14476289189147, -65.1..."
2017-09-28T14-17-07-280Z18-task2,"[6328.233577346078, -835.0499234501573, -264.2..."
...,...
NIHYA889LELYV-task2-2019-08-22T13-21-34-659Z-,"[3099.800019829519, -73.68227785872443, -52.86..."
NIHYA889LELYV-task2-2020-02-17T14-30-55-008Z-,"[3995.0728278472234, -130.80468725127903, -33...."
NIHYM875FLXFF-task2-2020-03-02T18-39-31-044Z-,"[1941.9081372418823, 28.7353869608559, 27.6723..."
NIHYT60IGVTH5-task2-2019-10-28T14-58-20-600Z-,"[3119.464755877225, -3.717710592178981, 1.7810..."


## Combine the Dataframes to Build the Final Dataset

In [9]:
def match_superpd_id(row):
    '''
    Match the ID as in SuperPARK survey.
    For example, NIHYM875FLXFF-task2-2020-03-02T18-39-31-044Z- --> NIHYM875FLXFF-task2-2020-03-02

    row: mapping (e.g. a DataFrame row) with a string "id" entry.
    Returns the id cut right after the date that follows '-task2-' when the
    id contains 'NIH'; every other id is returned unchanged. An id that
    contains 'NIH' but no '-task2-' marker is also returned unchanged.
    '''
    task_marker = '-task2-'
    date_length = len('2020-03-02')  # YYYY-MM-DD

    id_str = row['id']
    if 'NIH' not in id_str:
        return id_str

    idx = id_str.find(task_marker)
    if idx == -1:
        # Without the marker there is no date to cut at; slicing with the -1
        # sentinel would silently return a meaningless prefix.
        return id_str
    return id_str[:idx + len(task_marker) + date_length]

# All videos that passed the pipeline have features, but may not be rated. Doing a left outer join to combine.
# The guard keeps this cell re-runnable: once 'File_name' is already the index,
# calling set_index('File_name') again would raise a KeyError.
if rating_data.index.name != 'File_name':
    rating_data = rating_data.set_index('File_name')
features_and_rating_data = pd.merge(features_df, rating_data, left_index=True, right_index=True, how='left')
features_and_rating_data = features_and_rating_data.reset_index()

# Rename the IDs to match SuperPARK survey IDs before the join
features_and_rating_data["id"] = features_and_rating_data.apply(match_superpd_id, axis=1)
#features_and_rating_data

# Inner join of the concatenated survey and the feature dataframes on "id"
dataset = pd.merge(features_and_rating_data, combined_survey_data, on="id")
dataset

Unnamed: 0,id,frequency_components,Right,Left,diagnosed,year-of-diagnosis,last-medication,on_off_state
0,2017-08-18T14-59-52-530Z49-task2,"[4795.407543734973, 87.44582005686378, -21.629...",,,yes,2007.0,,
1,2017-08-18T15-24-14-004Z53-task2,"[4840.8609913082555, 72.6660925449257, 98.4037...",,,no,1.0,,
2,2017-08-22T02-01-21-948Z87-task2,"[6229.424923436179, 132.90561812700153, 82.124...",,,yes,2012.0,,
3,2017-09-22T18-38-44-872Z33-task2,"[3657.462895487711, -136.14476289189147, -65.1...",,,yes,2014.0,,
4,2017-09-28T14-17-07-280Z18-task2,"[6328.233577346078, -835.0499234501573, -264.2...",,,yes,2011.0,,
...,...,...,...,...,...,...,...,...
819,NIHXZ891UYEBU-task2-2020-02-05,"[2412.1357409610614, 2.214172041482511, 23.503...",1.0,0.0,no,,,
820,NIHYA889LELYV-task2-2019-08-22,"[3099.800019829519, -73.68227785872443, -52.86...",1.0,0.0,yes,,1.000000,0.0
821,NIHYA889LELYV-task2-2020-02-17,"[3995.0728278472234, -130.80468725127903, -33....",0.0,0.0,yes,,0.833333,0.0
822,NIHYM875FLXFF-task2-2020-03-02,"[1941.9081372418823, 28.7353869608559, 27.6723...",1.0,0.0,no,,,


In [10]:
# Write the ids of all rows in `dataset`, one per line, to a text file.
idlist = list(dataset["id"])
# The context manager closes the file even if a write fails part-way through.
with open("videos_with_survey_and_features.txt", "w") as textfile:
    for element in idlist:
        textfile.write(element + "\n")

## Train, Test, Dev Split

In [11]:
#1. Keep only the rows for which a rating from Dr. Saloni is available (Right rating within 0-4)
rated_dataset = dataset[(dataset["Right"]>=0.0) & (dataset["Right"]<=4.0)]
data = rated_dataset.drop(columns=["diagnosed", "year-of-diagnosis", "last-medication", "on_off_state"])

# Shuffle once with a fixed seed, then cut by position into train / dev / test.
# Plain iloc slices are used instead of np.split, which handles a DataFrame through
# the deprecated DataFrame.swapaxes; the resulting pieces are the same.
shuffled = data.sample(frac=1, random_state=42)
dev_start = int((1.0 - (TEST_SPLIT+DEV_SPLIT))*len(data))
test_start = int((1.0 - TEST_SPLIT)*len(data))

# Each split gets a fresh 0..n-1 index; the shuffled row labels are discarded.
train = shuffled.iloc[:dev_start].reset_index(drop=True)
dev = shuffled.iloc[dev_start:test_start].reset_index(drop=True)
test = shuffled.iloc[test_start:].reset_index(drop=True)

In [12]:
# Preview the training split.
train

Unnamed: 0,id,frequency_components,Right,Left
0,2018-11-01T17-52-14-880Z1-task2,"[6673.038689492701, -33.66161771982238, -302.4...",1.0,0.0
1,2018-01-21T15-51-40-944Z57-task2,"[1148.1889734395265, -7.7179280163882815, -3.2...",0.0,0.0
2,2018-10-25T15-02-49-153Z42-task2,"[4853.153327218894, -141.9896274130897, -0.463...",1.0,0.0
3,2020-08-25T12-25-54-209Z95-task2,"[3013.381976116398, -81.43337499069044, -40.36...",0.0,0.0
4,2018-10-31T14-24-03-805Z44-task2,"[6669.977701489325, -180.45410911395123, -168....",0.0,0.0
...,...,...,...,...
138,NIHMR963TPLWF-task2-2019-10-24,"[2860.7051413399663, -11.209734972796312, -7.1...",2.0,2.0
139,NIHER370FCDFN-task2-2020-02-12,"[462.12689619272913, 68.40624158895007, 47.954...",0.0,1.0
140,2018-02-05T16-34-48-797Z22-task2,"[3753.777586368224, -68.29599372466008, -26.69...",1.0,0.0
141,2018-10-28T14-00-14-346Z29-task2,"[1006.2032492784236, -33.081772763938, 102.077...",0.0,0.0


In [13]:
# Preview the test split.
test

Unnamed: 0,id,frequency_components,Right,Left
0,2017-10-15T14-47-50-369Z6-task2,"[4821.5990523168675, 166.03696077753133, 112.6...",1.0,0.0
1,2018-09-17T15-47-49-907Z54-task2,"[8955.262988300547, -372.7728126581389, -127.9...",1.0,1.0
2,NIHLW568LJGVV-task2-2020-01-29,"[2355.7972120321806, 22.70601090479409, 11.247...",0.0,0.0
3,NIHER370FCDFN-task2-2019-08-07,"[2909.5905986295334, -33.484235186281595, -2.3...",0.0,0.0
4,NIHMF399WYNH5-task2-2019-09-10,"[3180.10051336301, 34.241722629779055, -1.3137...",1.0,1.0
5,2019-10-23T23-15-31-022Z72-task2,"[3437.3380331922717, -20.812428843353274, -10....",0.0,1.0
6,2019-10-23T13-10-30-336Z83-task2,"[3237.9362895762893, -42.8346567143972, 81.881...",0.0,0.0
7,2020-09-05T02-10-55-470Z89-task2,"[6745.655011346611, 15.621256372602677, 118.85...",1.0,1.0
8,2019-10-22T17-18-47-406Z28-task2,"[2554.649857062529, -475.90835543423077, -196....",0.0,0.0
9,2018-10-30T19-25-00-928Z32-task2,"[339.2956160354221, 173.49427246032974, 139.43...",0.0,0.0


In [14]:
# Preview the dev split.
dev

Unnamed: 0,id,frequency_components,Right,Left
0,NIHFD867WLWA5-task2-2020-01-22,"[1346.6190346354774, 65.50465726551339, 3.7852...",1.0,1.0
1,NIHRT727YLPFL-task2-2019-09-16,"[2539.202724624061, -20.409600892735483, -8.07...",0.0,0.0
2,NIHTK278VZHYL-task2-2019-10-03,"[3131.9903042172714, -9.695320429581066, 0.989...",3.0,2.0
3,2018-10-24T22-11-34-814Z12-task2,"[4690.720250291839, -426.23136860992656, -417....",1.0,0.0
4,2018-10-13T18-33-39-530Z2-task2,"[4071.679869786457, 328.9241979823008, 207.639...",0.0,0.0
5,2019-10-24T04-31-25-752Z37-task2,"[8072.539372507683, 13.754574232024368, 157.38...",0.0,0.0
6,2018-09-17T04-13-21-831Z76-task2,"[8260.129272974498, 9.300111162143187, -78.172...",0.0,1.0
7,NIHYA889LELYV-task2-2020-02-17,"[3995.0728278472234, -130.80468725127903, -33....",0.0,0.0
8,2018-10-24T18-59-40-527Z11-task2,"[2715.4993362636556, -137.03913279462597, -48....",0.0,0.0
9,2018-09-13T00-35-01-889Z24-task2,"[1133.8753387533818, -54.96579415212774, -16.5...",1.0,1.0


## Generate Pairwise Comparison Data

In [15]:
#Generate Pairwise Comparison Data
def rating_comparison(row1, row2):
    '''Compare the ratings of two samples and return a comparison label.

    row1, row2: mappings (e.g. DataFrame rows) with numeric "Left" and "Right" ratings.

    Returns
        0.5: row1==row2
        1.0: row1>row2
        0.0: row1<row2
        None: no comparison holds (e.g. one of the ratings is NaN)

    One row is greater than the other when it is at least as high on both
    sides and strictly higher on one. When each row is higher on a different
    side, the sums Left+Right are compared instead.
    '''
    ROW1_GT_ROW2 = 1.0
    ROW1_EQ_ROW2 = 0.5
    ROW1_LT_ROW2 = 0.0

    l1, r1 = (row1["Left"], row1["Right"])
    l2, r2 = (row2["Left"], row2["Right"])

    # Identical ratings on both sides.
    if l1==l2 and r1==r2:
        return ROW1_EQ_ROW2
    # One row dominates the other on both sides.
    if (l1>=l2 and r1>r2) or (l1>l2 and r1>=r2):
        return ROW1_GT_ROW2
    if (l1<=l2 and r1<r2) or (l1<l2 and r1<=r2):
        return ROW1_LT_ROW2

    # Mixed case: fall back to the total of both sides.
    total1, total2 = l1+r1, l2+r2
    if total1>total2:
        return ROW1_GT_ROW2
    if total1<total2:
        return ROW1_LT_ROW2
    if total1==total2:
        return ROW1_EQ_ROW2
    # Reached only when the comparisons above are all False (NaN ratings).
    return None

def generate_pairwise_dataset(split_name):
    '''
    Build the pairwise comparison table for one split of the rated data.

    split_name: "train", "test" or "dev"; selects the module-level DataFrame
        of the same name.
    Returns a DataFrame with one row per unordered pair of samples and the
    columns id1, id2, features1, features2 and label, where label is
    rating_comparison(sample 1, sample 2). Sample 1 is always the row that
    comes later in the split.
    '''
    samples = {"train": train, "test": test, "dev": dev}[split_name]
    n_rows = samples.shape[0]

    records = []
    for later in range(n_rows):
        first = samples.iloc[later]
        for earlier in range(later):
            second = samples.iloc[earlier]
            records.append({
                "id1": first["id"],
                "id2": second["id"],
                "features1": first["frequency_components"],
                "features2": second["frequency_components"],
                "label": rating_comparison(first, second),
            })

    pairwise_dataset = pd.DataFrame(records)
    # Sanity check: n samples must yield exactly n*(n-1)/2 pairs.
    assert(len(records)==((n_rows*(n_rows-1))/2))
    return pairwise_dataset

# Build the pairwise comparison table for each split.
pairwise_dataset_train, pairwise_dataset_test, pairwise_dataset_dev = [
    generate_pairwise_dataset(split) for split in ("train", "test", "dev")
]

#Save individually as pickle
for split_frame, relative_path in (
    (pairwise_dataset_train, TRAIN_DATASET_FOR_RANKING),
    (pairwise_dataset_test, TEST_DATASET_FOR_RANKING),
    (pairwise_dataset_dev, DEV_DATASET_FOR_RANKING),
):
    split_frame.to_pickle(os.path.join(INPUT_DIR, relative_path))

#Save in one file
# NOTE: from here on `dataset` is a dict of the three pairwise splits,
# no longer the merged DataFrame built earlier in the notebook.
dataset = {
    "train": pairwise_dataset_train,
    "test": pairwise_dataset_test,
    "dev": pairwise_dataset_dev,
}

full_path = os.path.join(INPUT_DIR, FULL_DATASET_FOR_RANKING)
with open(full_path, 'wb') as handle:
    pickle.dump(dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [39]:
# `dataset` is the dict of pairwise splits assembled in the previous cell; show its test split.
dataset["test"]

Unnamed: 0,id1,id2,features1,features2,label
0,2018-09-17T15-47-49-907Z54-task2,2017-10-15T14-47-50-369Z6-task2,"[8955.262988300547, -372.7728126581389, -127.9...","[4821.5990523168675, 166.03696077753133, 112.6...",1.0
1,NIHLW568LJGVV-task2-2020-01-29,2017-10-15T14-47-50-369Z6-task2,"[2355.7972120321806, 22.70601090479409, 11.247...","[4821.5990523168675, 166.03696077753133, 112.6...",0.0
2,NIHLW568LJGVV-task2-2020-01-29,2018-09-17T15-47-49-907Z54-task2,"[2355.7972120321806, 22.70601090479409, 11.247...","[8955.262988300547, -372.7728126581389, -127.9...",0.0
3,NIHER370FCDFN-task2-2019-08-07,2017-10-15T14-47-50-369Z6-task2,"[2909.5905986295334, -33.484235186281595, -2.3...","[4821.5990523168675, 166.03696077753133, 112.6...",0.0
4,NIHER370FCDFN-task2-2019-08-07,2018-09-17T15-47-49-907Z54-task2,"[2909.5905986295334, -33.484235186281595, -2.3...","[8955.262988300547, -372.7728126581389, -127.9...",0.0
...,...,...,...,...,...
148,2019-10-23T16-56-58-249Z86-task2,2018-02-17T06-26-29-296Z17-task2,"[3877.3605947186493, 54.52508593881577, -13.83...","[3559.0869108995867, -731.0575320950912, 227.2...",0.5
149,2019-10-23T16-56-58-249Z86-task2,2018-10-28T12-51-36-399Z32-task2,"[3877.3605947186493, 54.52508593881577, -13.83...","[4332.1457603499775, 111.34482954515173, -71.9...",0.0
150,2019-10-23T16-56-58-249Z86-task2,2019-10-24T03-41-06-904Z64-task2,"[3877.3605947186493, 54.52508593881577, -13.83...","[2790.508640965477, -129.73671910251073, -284....",0.5
151,2019-10-23T16-56-58-249Z86-task2,2018-01-05T22-43-12-113Z13-task2,"[3877.3605947186493, 54.52508593881577, -13.83...","[3053.8969253310534, -190.62540546934835, -108...",0.5


### Observation
    Total datapoints rated: 179

## Select unrated files from participants who have had PD for a long time

The following code was used to send data points to Dr. Sharma for rating. It is not relevant to dataset generation.

## Test Codes

In [16]:
# Scratch cell: reload a pickled pairwise dataset and check that it converts to tensors.
# NOTE(review): this cell rebinds TEST_SPLIT, DEV_SPLIT, data, train, dev and test,
# which earlier cells of this notebook also define, so running it changes what those cells see on re-run.
import numpy as np
import pandas as pd
import torch
from sklearn import preprocessing

TEST_SPLIT = 0.05
DEV_SPLIT = 0.05

# NOTE(review): this file name differs from the four *_DATASET_FOR_RANKING paths written above —
# confirm which cell or script produces it.
filename = "E:/Saiful/park_motor/data/task2_ranking_dataset.pkl"
data = pd.read_pickle(filename)
# Shuffle with a fixed seed and cut into train / dev / test at the configured fractions.
train, dev, test = np.split(data.sample(frac=1, random_state=42), [int((1.0 - (TEST_SPLIT+DEV_SPLIT))*len(data)), int((1.0 - TEST_SPLIT)*len(data))])

# reset_index() without drop=True keeps the shuffled row labels as an "index" column.
train = train.reset_index()
test = test.reset_index()
dev = dev.reset_index()
#print(dev["features1"][4380])

# Convert the "features1" column of the dev split into a float tensor.
all_features1 = torch.tensor(dev["features1"], dtype=torch.float)

In [17]:
# Preview the dev slice of the reloaded pairwise dataset; the "index" column comes from reset_index().
dev

Unnamed: 0,index,id1,id2,features1,features2,label
0,28141,NIHPV408RRRZH-task2-2019-10-10,2018-09-09T01-35-40-167Z44-task2,"[1288.8391623702205, -1.9664640675756089, -1.8...","[1735.3945113689051, 133.01058871676787, -132....",0.0
1,17563,2019-10-23T03-36-53-604Z67-task2,2018-02-28T15-08-41-257Z30-task2,"[3453.0784379336446, 429.34362408730163, -67.8...","[6974.460761065883, -235.43784389744076, -42.0...",0.0
2,5448,2018-06-13T18-08-07-732Z93-task2,2018-11-01T17-52-14-880Z1-task2,"[5684.3767818979895, -262.9831476185142, -128....","[6673.038689492701, -33.66161771982238, -302.4...",1.0
3,28820,NIHPX181CAEY6-task2-2020-02-21,2017-10-15T14-47-50-369Z6-task2,"[2655.455013274696, -12.360331419476928, 10.15...","[4821.5990523168675, 166.03696077753133, 112.6...",0.0
4,12168,2018-10-25T18-57-16-478Z24-task2,NIHYA889LELYV-task2-2019-08-22,"[2537.656340196596, -123.78128127816001, -52.5...","[3099.800019829519, -73.68227785872443, -52.86...",0.0
...,...,...,...,...,...,...
1597,29674,NIHTN717JDEYY-task2-2020-02-14,NIHGA312KVEC2-task2-2019-08-26,"[3140.146519102532, 107.16869146495193, 29.635...","[1742.122879348685, -16.76346193683742, -12.71...",0.5
1598,28734,NIHPX181CAEY6-task2-2019-08-29,2019-10-23T00-50-01-757Z51-task2,"[2108.915036243875, 8.763086925215338, -4.5309...","[1422.4460638509265, -226.23985050331655, 54.9...",0.5
1599,4380,2018-03-13T18-00-04-749Z5-task2,2018-11-11T21-28-42-685Z99-task2,"[4741.381136118218, -106.62655865068888, 36.76...","[1562.0747212858112, 988.3793488855815, -59.19...",1.0
1600,22704,NIHDL154KMBMU-task2-2020-02-20,NIHME200JKXX8-task2-2019-09-09,"[3301.8097224977273, -41.440212869299444, 35.0...","[1553.603508438505, 12.317204804452762, -2.688...",1.0


In [19]:
# Convert the "features1" column of the full pairwise dataset into one tensor and report its shape.
all_features1 = torch.tensor(data["features1"])
print(all_features1.shape)

torch.Size([32041, 128])


In [31]:
le = preprocessing.LabelEncoder()
# Row-wise concat of the two single-column frames keeps "id1" and "id2" as separate columns,
# so every row has a value in only one of them and NaN in the other.
concat_data = pd.concat([data[["id1"]], data[["id2"]]])
concat_data = concat_data.reset_index()
concat_data

Unnamed: 0,index,id1,id2
0,0,2017-10-13T17-22-56-936Z44-task2,
1,1,2017-10-13T17-22-56-936Z44-task2,
2,2,2017-10-13T17-22-56-936Z44-task2,
3,3,2017-10-13T17-22-56-936Z44-task2,
4,4,2017-10-13T17-22-56-936Z44-task2,
...,...,...,...
64077,32036,,NIHXZ891UYEBU-task2-2020-02-05
64078,32037,,NIHYA889LELYV-task2-2019-08-22
64079,32038,,NIHYA889LELYV-task2-2020-02-17
64080,32039,,NIHYM875FLXFF-task2-2020-03-02


In [37]:
# Fit one label encoder on the distinct ids of both columns, then encode each column as a tensor.
le = preprocessing.LabelEncoder()
# De-duplicate the ids from "id1" and "id2" combined.
concat_data = list(set(list(data["id1"]) + list(data["id2"])))
le.fit(concat_data)
all_id1 = torch.as_tensor(le.transform(data["id1"]))
all_id2 = torch.as_tensor(le.transform(data["id2"]))

In [31]:
import torch
from torch.nn import MarginRankingLoss

# Margin used by rankingLoss.
args_margin = 0.05

def rankingLoss(y1_preds, y2_preds, labels, verbose=True):
    '''
    Mean pairwise ranking loss with a margin, where label 0 marks a tie.

    Per-pair error:
        label == 0: max(0, |y1_pred - y2_pred| - args_margin)
        label != 0: max(0, -label * (y1_pred - y2_pred) + args_margin)

    y1_preds, y2_preds, labels: tensors of the same shape (1-D in the example below).
    verbose: when True (the default) print the two partial sums and the mean.
    Returns the sum of all per-pair errors divided by len(labels).
    '''
    ties = labels == 0.0
    ranked = labels != 0.0

    # reduction='sum' gives the summed error directly. Taking the mean and
    # multiplying by the count yields NaN when no label is non-zero, because
    # the mean of an empty tensor is NaN.
    loss_function = MarginRankingLoss(margin=args_margin, reduction='sum')
    L1 = loss_function(y1_preds[ranked], y2_preds[ranked], labels[ranked])

    # For ties, only a prediction gap beyond the margin is penalised.
    tie_gaps = torch.abs(y1_preds[ties] - y2_preds[ties])
    L2 = torch.sum(torch.clamp(tie_gaps - args_margin, min=0.0))

    L = (L1+L2)/len(labels)
    if verbose:
        print("Training Loss: \n")
        print(L1)
        print(L2)
        print(L)
        print("--"*10)
    return L

# Small worked example.
y1_pred = torch.tensor([0.5, 0.1, 0.995, 0.443], dtype=torch.float64)
y2_pred = torch.tensor([0.56, 0.98, 0.21, 1.0], dtype=torch.float64)
labels = torch.tensor([1, 0, -1, 1], dtype=torch.float64)

loss = rankingLoss(y1_pred, y2_pred, labels)
print(loss)

Training Loss: 

tensor(1.5520, dtype=torch.float64)
tensor(0.8300, dtype=torch.float64)
tensor(0.5955, dtype=torch.float64)
--------------------
tensor(0.5955, dtype=torch.float64)
