# Explore and preprocess validation data

- results in translation of PerMIT to RKZ variables: `data/discovery_validation_variables_translation.json`
- align variables to the German myaReg to enable validation of the model

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
# Global seaborn look for every figure in this notebook.
sns.set_theme(context="paper", style="whitegrid", font_scale=1.3)

from sklearn.preprocessing import MinMaxScaler

# Project root used to build every data path below. It can be overridden with
# the MYAREG_PATH_BASE environment variable so the notebook runs on machines
# other than the original author's; the default is the original location.
PATH_base = os.environ.get(
    "MYAREG_PATH_BASE",
    "/home/WUR/katz001/PROJECTS/myaReg-genderDifferences",
)

In [4]:
# Load the validation cohort: semicolon-separated file with decimal commas;
# the first column is used as the row index.
data = pd.read_csv(
    f"{PATH_base}/data/validation/Dutch MG patients V2.csv",
    sep=";",
    decimal=",",
    index_col=0,
)

# Show (rows, columns) as a quick sanity check.
data.shape

(419, 38)

## Recode gender (1=female, 0=male)

Original: 1=male, 2=female

- `gender`: 
    - 1: male
    - 2: female

In [4]:
# Recode gender from the source coding (1=male, 2=female) to 0=male, 1=female.
# NOTE: this cell is not idempotent. Running it a second time on the already
# recoded column maps 1 to 0 again, so every value ends up as 0.
dic_gender = dict(zip((1, 2), (0, 1)))
data["gender"] = data["gender"].replace(to_replace=dic_gender)


# Create new achrak subgroup from `MGsubgroup_inclPATreported`

- new binary variable `MGsubgroup_inclPATreported_achrak`:
    - -1: 0
    - 1: 1
    - 2: 0
    - 3: 0
    - 8: np.nan


- `MGsubgroup_inclPATreported`:
    - -1: MG SN
    - 1: MG achrak
    - 2: MG MUSK
    - 3: MG LRP4
    - 8: MG unknown AB

In [5]:
# Mapping from MGsubgroup_inclPATreported codes to the binary achrak flag:
# only code 1 maps to 1; codes -1, 2 and 3 map to 0; code 8 becomes missing.
dic_achrak = dict(
    zip(
        (-1, 1, 2, 3, 8),
        (0, 1, 0, 0, np.nan),
    )
)

In [6]:
# Frequency of each original subgroup code before recoding.
data.loc[:, 'MGsubgroup_inclPATreported'].value_counts()

MGsubgroup_inclPATreported
 1    304
-1     54
 8     46
 2     14
 3      1
Name: count, dtype: int64

In [7]:
# Derive the binary achrak indicator from the original subgroup codes.
# The recoded Series is assigned back to the frame explicitly. Calling
# `replace(..., inplace=True)` on `data[col]` is a chained in-place operation:
# with pandas Copy-on-Write it only changes a temporary Series and leaves
# `data` untouched, so the new column would silently keep the raw codes.
data['MGsubgroup_inclPATreported_achrak'] = data['MGsubgroup_inclPATreported'].replace(dic_achrak)

# Check the resulting class balance (code 8 became NaN and is not counted).
data['MGsubgroup_inclPATreported_achrak'].value_counts()


MGsubgroup_inclPATreported_achrak
1.0    304
0.0     69
Name: count, dtype: int64

# Recode missing values (-999 and blank strings) as `np.nan`

In [8]:
# Turn both missing-value markers into NaN: first the numeric sentinel -999,
# then cells that contain a single blank character.
data = data.replace(-999, np.nan).replace(" ", np.nan)

# New depression score for validation: HADSd + HADSa

In [9]:
# Combined HADS score: element-wise sum of the HADSd and HADSa columns.
hads_d = data['HADSd']
hads_a = data['HADSa']
data['HADS_comb'] = hads_d.add(hads_a)

# Normalisation of Chronic Fatigue 

Min-max scaling to the range [0, 1] (max = 56; min = 8).


In [10]:
# Inspect the raw CISfatigue scores before scaling.
data.loc[:, "CISfatigue"]

IDAA
2193    48
2056    12
2211    56
2234    31
1986     9
        ..
1996    42
2098    43
2307    35
2358    40
2402    33
Name: CISfatigue, Length: 419, dtype: int64

In [11]:
# Scale CISfatigue to [0, 1]. The column is selected as a one-column frame
# because the scaler expects 2-D input.
# NOTE(review): the scaler learns min/max from the observed column; if the
# intended bounds are the fixed 8 and 56 given in the section text and the
# data do not span them, the scaling differs. Confirm.
scaler = MinMaxScaler(feature_range=(0, 1))
cis_frame = data[["CISfatigue"]]
cis_normalised = scaler.fit_transform(cis_frame)

# Persist the fitted scaler so the scaling can be inverted later.
scaler_file = f"{PATH_base}/data/validation/CF_minmaxScaler.pkl"
with open(scaler_file, "wb") as handle:
    pickle.dump(scaler, handle)

In [26]:
# Store the scaled scores as a new column; the scaler output has shape (n, 1),
# so it is flattened to one value per row.
data["CISfatigue_minmax"] = cis_normalised.ravel()

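# Update binary categories for:

- achr-ak (`QantiACHR`; original: 1=yes, 2=no, 3=uncertain)
- thymectomy (`Thymectomy`; original: 1=yes, 2=no)
- `QantiMUSK` (recoded with the same mapping)

New coding: 1=yes, 0=no; 3 (uncertain) becomes `np.nan`.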

In [27]:
# Frequency of each original QantiACHR code before recoding.
data.loc[:, "QantiACHR"].value_counts()

QantiACHR
1.0    178
3.0    165
2.0     56
Name: count, dtype: int64

In [28]:
# Shared recoding for the yes/no questionnaire items:
# 1 stays 1, 2 becomes 0 and 3 becomes missing.
dic_tmp = dict(zip((1, 2, 3), (1, 0, np.nan)))

# Apply the same mapping to each of the three columns in turn.
for column in ("QantiACHR", "Thymectomy", "QantiMUSK"):
    data[column] = data[column].replace(dic_tmp)

# Remove BMI  

In [29]:
# Remove the BMI column from the frame.
# A KeyError is raised if the column is absent, e.g. when the cell is re-run.
data = data.drop(columns=["BMI"])

# Save new dataset

In [30]:
# Write the recoded frame, index included, next to the original validation file.
data.to_csv(
    path_or_buf=f"{PATH_base}/data/validation/dutch_MG_patients_V2_recoded.csv",
)