In [1]:
import pandas as pd
import numpy as np

##### To understand our dataset even better, we built a Markov chain model of KLG (Kellgren–Lawrence grade) progression. A Markov chain is a stochastic model in which the probability of each future state depends only on the present state of the system, and not on the path by which the present state was reached.

### To build the model: 
- We grouped the dataset by subject ID and knee side (LATERALITY) so that all observations of one knee are gathered in one place.
- Then, within each group, we compared every KLG observation with the one before it and stored the difference in a new column named CHANGE. The value is positive if the KLG increased, negative if it decreased, and zero if it stayed the same; the first observation of each knee has no predecessor, so its CHANGE is empty.
- After that, we made a table of the KLG at each observation against its change from the previous observation.
- Finally, we constructed a transition matrix and calculated the probabilities to obtain the Markov chain model fit. 

### Depicting a change in KLG of patients over different timepoints

In [2]:
# Load the knee feature table, discard incomplete and duplicated rows, then
# order the rows so that every (ID, LATERALITY) knee is contiguous.
# NOTE(review): TIMEPOINT is not a sort key -- the order of visits inside a
# knee is inherited from the row index, so the diff below presumably relies on
# the CSV already being in chronological order; confirm against the data.
data = (
    pd.read_csv("data/knee_features.csv")
    .dropna()
    .drop_duplicates()
    .sort_index()
    .sort_values(by=['ID', 'LATERALITY'], kind='mergesort')
)

# CHANGE = KLG of this row minus KLG of the preceding row of the same knee;
# the first row of every knee has no predecessor and is left as NaN.
klg_by_knee = data.groupby(['ID', 'LATERALITY'])['KLG']
data['CHANGE'] = klg_by_knee.diff()
data

Unnamed: 0,ID,TIMEPOINT,LATERALITY,LATCOV,MEDCOV,LATEXTR,MEDEXTR,MTCVOL,LTCVOL,MM_AREA,RATIO_MM,LM_AREA,RATIO_LM,FC_VOLUME,TC_VOLUME,MM_VOL,LM_VOL,KLG,CHANGE
4706,9000099,v00,LEFT,0.493013,0.428040,-0.640030,1.32846,1473.83,1753.52,1521.47,0.92,2344.13,0.94,15118.46,3227.35,1741.98,2833.86,3.0,
23140,9000099,v24,LEFT,0.275416,0.406554,-1.410290,1.34173,1487.32,1735.19,1469.21,0.92,2786.97,0.76,15117.07,3222.51,1684.67,1772.59,4.0,1.0
35401,9000099,v48,LEFT,0.374558,0.434621,-2.390010,1.37768,1577.39,1712.30,1566.39,0.88,2550.24,0.98,14779.31,3289.69,1869.08,1835.03,4.0,0.0
0,9000099,v00,RIGHT,0.552836,0.240201,-1.288910,2.84262,1552.54,2408.74,1352.04,1.14,2300.80,0.77,14660.77,3961.29,1275.83,3122.21,2.0,
11642,9000099,v12,RIGHT,0.565426,0.276691,-1.302090,3.44997,1443.40,2270.20,1347.23,1.10,2249.33,0.82,14459.89,3713.60,1309.33,2894.06,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45935,9999878,v96,LEFT,0.768125,0.500186,-0.571762,1.54050,1295.00,1264.29,1905.72,0.75,2036.60,0.75,11974.11,2557.71,2671.78,2834.89,1.0,0.0
4705,9999878,v00,RIGHT,0.620649,0.501953,-1.959510,1.60023,1456.43,1606.04,2033.74,0.75,2108.96,0.75,13284.09,3062.47,2858.05,2933.98,2.0,
11641,9999878,v12,RIGHT,0.631068,0.497569,-2.110100,2.04829,1472.99,1490.30,2039.78,0.74,2113.66,0.76,13105.16,2963.29,2877.87,2909.41,2.0,0.0
26664,9999878,v36,RIGHT,0.618281,0.498660,-2.414310,3.60272,1442.47,1530.03,2116.02,0.72,2162.39,0.76,13022.16,2972.50,3061.73,2994.83,2.0,0.0


### Crosstab of the KLG at each observation (rows) against its change from the previous observation (columns)

In [3]:
# Rows: the KLG recorded on each row of `data`; columns: that row's CHANGE
# (its KLG minus the previous KLG of the same ID/LATERALITY group).
pd.crosstab(index=[data['KLG']], columns=data['CHANGE'])

CHANGE,-3.0,-2.0,-1.0,0.0,1.0,2.0,3.0,4.0
KLG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,0,19,65,13141,0,0,0,0
1.0,1,0,63,5845,364,0,0,0
2.0,0,0,41,7334,406,157,0,0
3.0,0,0,44,3671,447,94,45,0
4.0,0,0,0,953,326,16,5,4


#### The above table can be read as follows. Each row is the KLG observed at a timepoint, and each column is the change from the previous timepoint of the same knee. In the first row (KLG 0), 13141 observations had no change, i.e. the KLG was already 0 at the previous timepoint. In 65 instances, the KLG decreased from 1 to 0. Finally, in 19 of the observations, the KLG was previously 2 and became 0.

In [4]:
# Counts of KLG transitions between consecutive observations, laid out with
# one row per previous grade (OLD) and one column per following grade (NEW).
# NOTE(review): these numbers are typed in by hand from the crosstab output
# above, so they will not follow any change to the input data.
old_grade_labels = ['OLD KLG 0', 'OLD KLG 1', 'OLD KLG 2', 'OLD KLG 3', 'OLD KLG 4']
new_grade_labels = ['NEW KLG 0', 'NEW KLG 1', 'NEW KLG 2', 'NEW KLG 3', 'NEW KLG 4']

transition_counts = np.array([
    [13141,  364,  157,   45,   4],   # leaving KLG 0
    [   65, 5845,  406,   94,   5],   # leaving KLG 1
    [   19,   63, 7334,  447,  16],   # leaving KLG 2
    [    0,    0,   41, 3671, 326],   # leaving KLG 3
    [    0,    1,    0,   44, 953],   # leaving KLG 4
])

transition_matrix = pd.DataFrame(
    transition_counts,
    index=old_grade_labels,
    columns=new_grade_labels,
)
transition_matrix

Unnamed: 0,NEW KLG 0,NEW KLG 1,NEW KLG 2,NEW KLG 3,NEW KLG 4
OLD KLG 0,13141,364,157,45,4
OLD KLG 1,65,5845,406,94,5
OLD KLG 2,19,63,7334,447,16
OLD KLG 3,0,0,41,3671,326
OLD KLG 4,0,1,0,44,953


In [5]:
# The same transition counts as a plain frame with default integer row and
# column labels (0-4); this rebinds `transition_matrix`.
# NOTE(review): the counts are a hand-typed literal, not derived from the data.
klg_transition_rows = (
    (13141, 364, 157, 45, 4),
    (65, 5845, 406, 94, 5),
    (19, 63, 7334, 447, 16),
    (0, 0, 41, 3671, 326),
    (0, 1, 0, 44, 953),
)
transition_matrix = pd.DataFrame(np.array(klg_transition_rows))
transition_matrix

Unnamed: 0,0,1,2,3,4
0,13141,364,157,45,4
1,65,5845,406,94,5
2,19,63,7334,447,16
3,0,0,41,3671,326
4,0,1,0,44,953


### Calculating the Markov chain probabilities

In [6]:
# Turn the transition counts into Markov transition probabilities by dividing
# every row by its own total: entry (i, j) becomes the share of row i's
# observations that fall in column j, so each row with a non-zero total sums
# to 1 (a row whose total is 0 comes out as NaN).
#
# Done as one vectorised division rather than a hard-coded 5x5 element loop,
# so it works for any matrix size and keeps whatever row/column labels
# `transition_matrix` carries.

# Per-row totals, kept as a one-column frame (column label 0) under its
# original name for anything that still refers to `sum_matrix`.
sum_matrix = pd.DataFrame(transition_matrix.sum(axis=1))

# axis=0 aligns the totals with the row index, i.e. row i is divided by total i.
new_matrix = transition_matrix.div(sum_matrix[0], axis=0)
new_matrix

Unnamed: 0,0,1,2,3,4
0,0.958428,0.026548,0.011451,0.003282,0.000292
1,0.010133,0.911146,0.063289,0.014653,0.000779
2,0.002411,0.007996,0.930829,0.056733,0.002031
3,0.0,0.0,0.010154,0.909113,0.080733
4,0.0,0.001002,0.0,0.044088,0.95491


## A Visual Depiction of the Markov chain probabilities

![](data/markovchain.png)