In [14]:
import datetime
import pandas as pd 
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
# Load the raw CSV export into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook cannot be re-run on another machine until it is adjusted.
rawdata = pd.read_csv(
    r"C:\Users\smeng3\Desktop\Summer_Proj\Proctcae301.csv",
    low_memory=False,
)

In [15]:
def pre_processing(data, columns):
    """Coerce the listed columns to numbers, recode them, and slice one frame per item.

    For every name in ``columns`` (processed in order):

    * the column is converted with ``pd.to_numeric(..., errors='coerce')``, so
      anything non-numeric becomes NaN;
    * columns at position 2 and later are shifted down by 1;
    * where the first listed column equals 1 and the current column is
      missing, the current column is set to 0;
    * the value 4 is merged into 3.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "gapid", "Assessment" and every name in ``columns``.
        The frame is modified in place, so pass a copy to keep the original.
    columns : list of str
        The first entry is only used as the flag column; every other entry
        produces one output slice.

    Returns
    -------
    list of pandas.DataFrame
        One frame per column after the first, with the columns
        ``["gapid", <column>, "Assessment"]`` in that order.
    """
    data_slides = []
    for position, col in enumerate(columns):
        flag_col = columns[0]
        data[col] = pd.to_numeric(data[col], errors='coerce')

        # Only the third and later columns are shifted down by one.
        if position > 1:
            data[col] = data[col] - 1

        # Flag column says 1 and the item itself is blank -> record it as 0.
        flagged_and_missing = (data[flag_col] == 1) & data[col].isnull()
        data.loc[flagged_and_missing, col] = 0

        # Merge value 4 into 3.
        data.loc[data[col] == 4, col] = 3

        # The flag column itself is never exported as a slice.
        if position > 0:
            data_slides.append(data[["gapid", col, "Assessment"]])
    return data_slides

In [16]:
# Recode on a copy so that rawdata itself stays untouched. The first name is
# the flag column; each of the remaining three yields one slice.
data_slides = pre_processing(
    rawdata.copy(),
    ["PRONever3", "PRO3a", "PRO3b", "PRO3c"],
)

In [17]:
def tsp_time(data_slides):
    """Pivot each long slice into one row per gapid with a column per time point.

    Each input frame is expected to hold "gapid", one value column and, as its
    last column, "Assessment" (codes 1-4). The rows of assessment 1 form the
    base table; assessments 2, 3 and 4 are joined onto it by gapid using
    pandas' default left join, so only ids that have an assessment-1 row
    appear in the output.

    The four output columns end up suffixed "_Baseline", "_4_6_weeks",
    "_3_month" and "_6_month". pandas applies suffixes only to overlapping
    column names, so after the second join the 3-month column still carries
    its bare name and only receives "_3_month" during the third join, when it
    collides with the incoming 6-month column.

    Parameters
    ----------
    data_slides : list of pandas.DataFrame

    Returns
    -------
    list of pandas.DataFrame
        One wide frame per input slice, indexed by gapid.
    """
    suffixes = ["_Baseline", "_4_6_weeks", "_3_month", "_6_month"]

    def visit_rows(frame, code):
        # Rows of one assessment, minus the last column, indexed by gapid.
        selected = frame[frame["Assessment"] == code]
        return selected.iloc[:, :-1].set_index('gapid')

    results = []
    for frame in data_slides:
        wide = visit_rows(frame, 1)
        for step in (1, 2, 3):
            wide = wide.join(
                visit_rows(frame, step + 1),
                lsuffix=suffixes[step - 1],
                rsuffix=suffixes[step],
            )
        results.append(wide)
    return results

In [18]:
# Build one wide table (one row per gapid, one column per time point) for each slice.
results = tsp_time(data_slides)

In [19]:
def err_check(results):
    """Find rows with gaps in the time series and count clean drop-outs.

    The first four columns of every frame are read positionally as the four
    consecutive time points (baseline, 4-6 weeks, 3 month, 6 month).

    * A row is an *error* when a time point is missing although a later one
      is present (e.g. 4-6 weeks data without baseline data). The three gap
      patterns are collected one after another, so a row matching more than
      one pattern (e.g. baseline and 4-6 weeks missing, 3 month present)
      appears more than once in the error frame.
    * A row is a *drop-out* when the baseline is present and, from some time
      point onward, every later value is missing.

    The input frames are only read, never modified.

    Parameters
    ----------
    results : list of pandas.DataFrame
        Wide frames with at least four columns.

    Returns
    -------
    (list of pandas.DataFrame, list of pandas.DataFrame)
        ``err_results`` holds the offending rows per input frame;
        ``drop_out_summary`` holds, per input frame, a one-column table
        ("Number of patients") indexed by 'After Baseline',
        'After 4-6 Weeks' and 'After 3 Month'.
    """
    summary_labels = ['After Baseline', 'After 4-6 Weeks', 'After 3 Month']
    err_results, drop_out_summary = [], []
    for df in results:
        missing = [df.iloc[:, k].isnull() for k in range(4)]
        present = [~flag for flag in missing]

        # Gap patterns: a missing time point followed by any present one.
        err_1 = df[missing[0] & (present[1] | present[2] | present[3])]
        err_2 = df[missing[1] & (present[2] | present[3])]
        err_3 = df[missing[2] & present[3]]
        # DataFrame.append no longer exists in pandas 2.x; concat is the
        # equivalent and keeps the same row order and duplicates.
        err = pd.concat([err_1, err_2, err_3])

        # Drop-out patterns: present up to a time point, missing ever after.
        drop_out_counts = [
            int((present[0] & missing[1] & missing[2] & missing[3]).sum()),
            int((present[0] & present[1] & missing[2] & missing[3]).sum()),
            int((present[0] & present[1] & present[2] & missing[3]).sum()),
        ]
        drop_out_data = pd.DataFrame(
            {'Number of patients': drop_out_counts}, index=summary_labels
        )

        err_results.append(err)
        drop_out_summary.append(drop_out_data)
    return err_results, drop_out_summary

In [20]:
# Collect the inconsistent rows and the drop-out counts for every item.
# results.copy() copies only the list (the DataFrames are shared), which is
# sufficient here because err_check only reads the frames.
err_results,drop_out_summary = err_check(results.copy())

In [21]:
def drop_out_check(results):
    """Mark patient drop-outs with the code 4 and return the marked frames.

    The first four columns of each frame are read positionally as the four
    consecutive time points. For each follow-up time point in turn, rows
    where every earlier value is present and this and every later value is
    missing get the drop-out code in that follow-up column.

    The masks are re-evaluated after each assignment, so a code written at
    one time point counts as "present" on the next pass. The drop-out code
    therefore propagates to every later time point: a patient with baseline
    data only ends up with 4 in all three follow-up columns.

    The input frames are copied first. ``list.copy()`` alone would share the
    DataFrames with the caller and silently modify them.

    Parameters
    ----------
    results : list of pandas.DataFrame
        Wide frames with at least four columns.

    Returns
    -------
    list of pandas.DataFrame
        New frames with the drop-out code filled in; the inputs are untouched.
    """
    drop_out_code = 4
    new_result = [frame.copy() for frame in results]
    for each in new_result:
        for k in (1, 2, 3):
            seen_before = each.iloc[:, :k].notna().all(axis=1)
            gone_from_here = each.iloc[:, k:4].isna().all(axis=1)
            each.loc[seen_before & gone_from_here, each.columns[k]] = drop_out_code
    return new_result

In [22]:
# Fill in the drop-out code (4) and rebind results to the returned list.
results = drop_out_check(results)

In [23]:
def get_link(df, num):
    """Turn a wide frame into the node and link tables of a Sankey diagram.

    The first four columns of ``df`` are read as consecutive time points.
    For each pair of adjacent time points the rows are counted per
    (from value, to value) combination; rows with a missing value in either
    column do not form a group and are not counted.

    Values are mapped to flat node indices by adding an offset per time
    point: 0, ``num``, ``2*num + 1`` and ``3*num + 2``. That is, ``num``
    nodes at the first time point and ``num + 1`` at each later one.
    The node colour table below is hard-coded for 19 nodes, which matches
    ``num == 4`` (4 + 5 + 5 + 5).

    Link opacity: within each target node the values are dense-ranked, as
    are all values overall. ``rank = overall_rank - value_rank + 1`` and
    ``alpha = 1 / sqrt(rank)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame with at least four columns holding node values.
    num : int
        Number of nodes at the first time point.

    Returns
    -------
    tuple
        ``(source, target, value, node, link)``: three numpy arrays with one
        entry per link, the node colour table, and the link table (including
        the un-offset ``err_source`` / ``err_target`` values, the rank
        columns, ``alpha`` as a string, and ``link_color``).
    """
    cols = df.columns

    # One (from, to, counts) table per pair of adjacent time points.
    steps = [
        df.groupby([cols[k], cols[k + 1]]).size().reset_index(name='counts')
        for k in range(3)
    ]
    raw_from = [step.iloc[:, 0].values for step in steps]
    raw_to = [step.iloc[:, 1].values for step in steps]
    counts = [step.iloc[:, 2].values for step in steps]

    # Index of the first node of the 2nd, 3rd and 4th time point.
    second_start, third_start, fourth_start = num, num * 2 + 1, num * 3 + 2

    source = np.concatenate(
        [raw_from[0], raw_from[1] + second_start, raw_from[2] + third_start]
    )
    target = np.concatenate(
        [raw_to[0] + second_start, raw_to[1] + third_start, raw_to[2] + fourth_start]
    )
    value = np.concatenate(counts)

    # The same endpoints without node offsets, kept alongside for inspection.
    err_source = np.concatenate(raw_from)
    err_target = np.concatenate(raw_to)

    link = pd.DataFrame({
        'source': source,
        'target': target,
        "value": value,
        "err_source": err_source,
        "err_target": err_target,
    })

    # Opacity from the dense ranks (see docstring).
    link["value_rank"] = link.groupby(["target"])["value"].rank(method='dense').astype(int)
    link["overall_rank"] = link["value"].rank(method='dense').astype(int)
    link["rank"] = link["overall_rank"] - link["value_rank"] + 1
    link["alpha"] = 1 / ((link["rank"]) ** (1 / 2))

    # Four colours for the value nodes, plus one extra colour for the fifth
    # node that exists only at the later time points.
    paired = px.colors.colorbrewer.Paired
    level_colors = [paired[3], px.colors.colorbrewer.Set1[5], paired[7], paired[5]]
    later_colors = level_colors + [px.colors.carto.Prism[10]]
    node = pd.DataFrame({
        "source": list(range(19)),
        "node_color": level_colors + later_colors * 3,
    })

    # Each link takes the colour of its source node.
    link = link.join(node.set_index('source'), on='source')

    link["alpha"] = link["alpha"].astype(str)

    # Splice the alpha in: "rgb(r,g,b)" -> "rgba(r,g,b,alpha)".
    # Assumes the palette entries are "rgb(...)" strings - TODO confirm.
    base_color = link["node_color"]
    link["link_color"] = base_color.str[:3] + "a" + base_color.str[3:-1] + "," + link["alpha"] + ")"

    return source, target, value, node, link

In [24]:
# Remove the rows flagged by err_check from each of the three result frames.
for slot in range(3):
    results[slot] = results[slot].copy().drop(err_results[slot].index)

In [25]:
def ploting(label, title, link, node):
    """Render the Sankey diagram and show it.

    Parameters
    ----------
    label : list of str
        One label per node, in node-index order.
    title : str
        Figure title.
    link : pandas.DataFrame
        Needs the columns "source", "target", "value" and "link_color".
    node : pandas.DataFrame
        Needs the column "node_color".

    The figure is fixed at 1920x1080 with font size 20. The four time-point
    captions are placed below the plot in paper coordinates.
    """
    node_spec = dict(
        pad=50,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=label,
        color=node["node_color"],
    )
    link_spec = dict(
        source=link["source"],
        target=link["target"],
        value=link["value"],
        color=link["link_color"].values,
    )
    sankey = go.Sankey(
        visible=True,
        arrangement="perpendicular",
        valuesuffix=" Patients",
        node=node_spec,
        link=link_spec,
    )
    fig = go.Figure(data=[sankey])

    # (x position, caption) for each time point, left to right.
    captions = [
        (-0.02, "Baseline"),
        (0.30, "4-6 Weeks"),
        (0.74, "3 Month Follow-up"),
        (1.05, "6 Month Follow-up"),
    ]
    annotations = [
        go.layout.Annotation(x=x_pos, y=-0.15, showarrow=False, text=caption,
                             xref="paper", yref="paper")
        for x_pos, caption in captions
    ]
    fig.update_layout(
        title=title,
        height=1080,
        width=1920,
        font=dict(size=20),
        annotations=annotations,
    )
    fig.show()

In [26]:
# Node labels in the node order get_link uses with num=4: four response
# levels at baseline, then the same four levels plus a drop-out node at each
# of the three follow-ups (19 nodes in total). The drop-out label is spelled
# identically at every follow-up so the three nodes read as the same state.
response_levels = ["Never","Rarely","Occasionally","Frequently - Almost Constantly"]
label = response_levels + 3 * (response_levels + ["DropOut"])

# NOTE(review): the patient count in the title is hard-coded; confirm it
# still matches the plotted frame after the error rows have been dropped.
title = "Frequency of patient having headaches over time<br>GAP dataset (n:285)"

# NOTE(review): results[2] is the third slice (the last column passed to
# pre_processing); confirm that this is the item the "Frequency" title and
# the frequency labels are meant to describe.
source , target, value,node,link = get_link(results[2],4)
ploting(label,title,link,node)