In [1]:
class Session():
    def __init__(self, ip, port):
        self.ip = ip
        self.port = port
    def get_url(self):
        return "http://" + self.ip + ":" + self.port

session = Session("127.0.0.1", "8000")
#list_id_dataset = ["c75f663e-d9ee-4f1c-9458-79e92d1c126a"]
list_id_dataset = ["6fa7ffae-a85a-4621-8d4e-82b214b0c743"]



# The gdc_11k dataset

- 11K patients

- Various sites tumor sites

- Genetic + Clinical data


# 1. Reading the data

In [2]:
from smart_broker_api import data_api
from smart_broker_api import statistics_api
from smart_broker_api import visualization_api
from smart_broker_api import preprocessing_api
import plotly.graph_objects as go


tabular_dataset_id = data_api.read_dataset_csvv1(session, list_id_dataset)
data_frame_id = data_api.data_frame_tabular_select_data_frame(session, tabular_dataset_id, "data_frame_0")
print(data_frame_id)

871d501d-d326-43d7-85d9-d32ea90bfa9c


# 2. Grab series to work with in Statistics Lib

In [3]:
series_1_name = "birth_days_to"
series_2_name = "death_days_to"
series_3_name = "has_mutation_MTOR"
list_series_name = [series_1_name, series_2_name, series_3_name]
data_frame_id_2 = data_api.data_frame_select_data_frame(session, data_frame_id, list_series_name)
data_frame_no_nan_id = preprocessing_api.drop_na_data_frame(session, data_frame_id_2)
series_1_id = data_api.data_frame_select_series(session, data_frame_no_nan_id, series_1_name)
print(statistics_api.count(session,  series_1_id))

data_frame_no_nan_wild_id = preprocessing_api.query_data_frame(session, data_frame_no_nan_id, 'has_mutation_MTOR == "Mutated"')
data_frame_no_nan_mute_id = preprocessing_api.query_data_frame(session, data_frame_no_nan_id, 'has_mutation_MTOR == "Wildtype"')


age_days_wild = data_api.data_frame_select_series(session, data_frame_no_nan_wild_id, series_1_name)
survival_days_wild = data_api.data_frame_select_series(session, data_frame_no_nan_wild_id, series_2_name)

age_days_mute = data_api.data_frame_select_series(session, data_frame_no_nan_mute_id, series_1_name)
survival_days_mute = data_api.data_frame_select_series(session, data_frame_no_nan_mute_id, series_2_name)
print(statistics_api.count(session,  survival_days_wild))
print(statistics_api.count(session,  survival_days_mute))


{'count': 2071}
{'count': 64}
{'count': 2007}


# Statitical processes 

- Federated parametric equations

Varriance, Pearson product, T-Test, ect.

- Federated ranking 

Spearman, mann-whitney, kolmogorov, ect.

- Federated learning

Logistic regression, deep learning

# 3. Run Statistics Library

In [9]:
from smart_broker_api import statistics_api
from smart_broker_api import visualization_api
import plotly.graph_objects as go

type_distribution="normalunit"
type_ranking="cdf"
alternative = "greater"
print(statistics_api.count(session,  survival_days_wild))
print(statistics_api.variance(session,  survival_days_wild))
print(statistics_api.skewness(session,  survival_days_wild))
print(statistics_api.kurtosis(session,  survival_days_wild))
print(statistics_api.min_max(session,  survival_days_wild))
print(statistics_api.pearson(session,  age_days_wild, survival_days_wild, alternative))
print(statistics_api.spearman(session,  age_days_wild, survival_days_wild, alternative, type_ranking))

print(statistics_api.levene_test(session,  age_days_wild, survival_days_wild))

print(statistics_api.paired_t_test(session,  age_days_wild, survival_days_wild, alternative))
print(statistics_api.student_t_test(session,  age_days_wild, survival_days_wild, alternative))
print(statistics_api.welch_t_test(session,  age_days_wild, survival_days_wild, alternative))





{'count': 64}
{'variance': 762887.9997519841}
{'skewness': 1.0507502824263952}
{'kurtosis': 0.12217859344120452}
{'min': -276.390625, 'max': 16852.390625}
{'pearson': -0.22640009678695353, 'p_value': 0.9639859817490299}
{'spearman': -0.21305777806668982, 'p_value': 0.9545182691813194}
{'f_statistic': 54.20683596058111, 'p_value': 2.0603652872647306e-11}
{'t_statistic': 44.9964947128369, 'p_value': 0.0}
{'t_statistic': 47.06191456112596, 'p_value': 0.0}
{'t_statistic': 47.061914561125974, 'p_value': 0.0}


# 4. Do some actual research

In [6]:
# Here we engage with two research questions.
# 1. Is there an interaction between age and survival time?
# 2. Do patient with a mutated MTOR gene survive significantly shorter than those with the wildtype?

print("N wildtype")
print(statistics_api.count(session,  age_days_wild))
print("N mutated")
print(statistics_api.count(session,  age_days_mute))

print()
print("Mean survival time wildtype")
print(statistics_api.mean(session,  survival_days_wild))

print("Mean survival time mutated")
print(statistics_api.mean(session,  survival_days_mute))



print("Age impact wildtype")
print(statistics_api.pearson(session,  age_days_wild, survival_days_wild, alternative)["pearson"])

print("Age impact mutated")
print(statistics_api.pearson(session,  age_days_mute, survival_days_mute, alternative)["pearson"])


print("test on mean survival signigicance")
print(statistics_api.welch_t_test(session,  survival_days_wild, survival_days_mute, alternative))


dict_of_fig = visualization_api.kernel_density_estimation(session, survival_days_mute, 200)
fig = go.Figure(dict_of_fig["figure"])
fig.show()

dict_of_fig = visualization_api.kernel_density_estimation(session, survival_days_wild, 200)
fig = go.Figure(dict_of_fig["figure"])
fig.show()

N wildtype
{'count': 64}
N mutated
{'count': 2007}

Mean survival time wildtype
{'mean': 965.484375}
Mean survival time mutated
{'mean': 829.300946686597}
Age impact wildtype
-0.22640009678695353
Age impact mutated
-0.22221416383224207
test on mean survival signigicance
{'t_statistic': 1.22409137227908, 'p_value': 0.1125734754672818}
