In [1]:
# Author: Rute Ayalew
# Date: 11/5/23
# Collaborators: 

In [None]:
# Package Installations 
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install rds2py
!pip install kneed
!pip install yellowbrick
!pip install imblearn
!pip install statsmodels
!pip install dask
!pip install polars

In [3]:
# Package Import
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
import rds2py
import yellowbrick
import kneed
import imblearn
import os
import statsmodels
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from dask.dataframe import from_pandas
import polars as pl

# Self-Defined Module Import
import Rdata_understanding as du
import Rdata_prep as dp
import Rdata_model as dm
import Rmy_utils as mu


# temp imports
import requests
from io import BytesIO
import gzip


In [4]:
# User-Initialized Variables to guide Data Mining Process
target_attribute = 'disease' # attribute to predict; 'disease' is a column of the demographic table (df1)
task_type = 'classification' # kind of data-mining task
random_seed = 45 # seed meant for randomized steps (e.g. train/test splitting)
test_size = 0.3 # share of the rows meant to be held out for testing

# User-Initialized Variables to guide file uploads and downloads
input_dir = '../data' # path for user's pre-existing data files to read
output_dir = '../output' # base directory for CSV outputs; each task joins its own subdirectory onto it (e.g. 'task1')
read_type = 'url' # or 'disk'

url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE185nnn/GSE185948/suppl/GSE185948_metadata_RNA.csv.gz"
file_name1 = 'demographic_clinical_data.csv' # user-chosen file name for the data read from url
file_name2 = 'RDS_data.csv' # file name passed to get_data for the RDS read
file_path = "GSE185948_count_RNA.rds" # name of the RDS file on the user's disk

# For constructing RDS data into readable arrays/dataframes
dim_0 = 'gene_name' # rows
dim_1 = 'cell_name' # cols

In [5]:
# Read the demographic/clinical CSV from `url` into df1.
# get_data also receives input_dir and file_name1; per its log message it saves the
# processed data under that directory and name.
df1 = mu.get_data(input_dir, url = url, file_name = file_name1,read_type='url')

<<<<<<<<<< URL File Read Successfully >>>>>>>>>>
<<<<<<<<<< Processed data saved to ../data/demographic_clinical_data.csv >>>>>>>>>>


In [6]:
# Read the RDS file named by file_path from local disk into df2.
# file_name2 is passed as file_name — presumably the name for a CSV copy; confirm in Rmy_utils.get_data.
df2 = mu.get_data(input_dir, file_name = file_name2, file_path= file_path, read_type='disk')

<<<<<<<<<< RDS File Read Successfully >>>>>>>>>>


In [7]:
# Preview df1: per-cell metadata (name, barcode, patient, gender, disease, celltype, ...)
df1

name,barcode,patient,gender,disease,celltype,nCount_RNA,nFeature_RNA,UMAP_1,UMAP_2
str,str,str,str,str,str,f64,i64,f64,f64
"""PKD_ACACGCGGTA…","""ACACGCGGTATCGG…","""PKD1""","""female""","""PKD""","""TAL1""",1234.684629,1250,-7.41602,-6.008874
"""PKD_ACACGCGGTT…","""ACACGCGGTTTGGC…","""PKD1""","""female""","""PKD""","""PT2""",1865.542588,1650,2.499409,-6.587287
"""PKD_ACACGCGTCA…","""ACACGCGTCATGTC…","""PKD1""","""female""","""PKD""","""CNT_PC""",1812.700523,1419,-2.254505,8.526364
"""PKD_ACACTGAAGA…","""ACACTGAAGACCCT…","""PKD1""","""female""","""PKD""","""FIB""",978.772591,1089,9.949437,2.086737
"""PKD_ACACTGAAGC…","""ACACTGAAGCGACA…","""PKD1""","""female""","""PKD""","""PT1""",2361.558871,2013,7.886921,-8.587954
"""PKD_ACACTGAAGT…","""ACACTGAAGTTCGG…","""PKD1""","""female""","""PKD""","""PT1""",1022.824714,1016,7.670309,-7.718168
"""PKD_ACACTGACAA…","""ACACTGACAAGGTA…","""PKD1""","""female""","""PKD""","""DCT""",2482.698271,1877,-8.110249,5.433267
"""PKD_ACACTGACAC…","""ACACTGACACCGTC…","""PKD1""","""female""","""PKD""","""FIB""",1088.176821,1232,8.026856,1.465231
"""PKD_ACACTGACAC…","""ACACTGACACCTTC…","""PKD1""","""female""","""PKD""","""DCT""",2553.667284,2006,-9.598119,6.063181
"""PKD_ACACTGACAG…","""ACACTGACAGAGCC…","""PKD1""","""female""","""PKD""","""TAL1""",1515.820655,1499,-7.198015,-4.930681


In [8]:
# Preview df2: one column per cell name, numeric values only
df2

PKD_ACACGCGGTATCGGTT-1_1,PKD_ACACGCGGTTTGGCTA-1_1,PKD_ACACGCGTCATGTCTT-1_1,PKD_ACACTGAAGACCCTTA-1_1,PKD_ACACTGAAGCGACAGT-1_1,PKD_ACACTGAAGTTCGGTT-1_1,PKD_ACACTGACAAGGTACG-1_1,PKD_ACACTGACACCGTCTT-1_1,PKD_ACACTGACACCTTCGT-1_1,PKD_ACACTGACAGAGCCCT-1_1,PKD_ACACTGAGTACCTAAC-1_1,PKD_ACACTGATCACAGAGG-1_1,PKD_ACACTGATCACTTGGA-1_1,PKD_ACACTGATCGCCAGAC-1_1,PKD_ACAGAAAAGATCACTC-1_1,PKD_ACAGAAAAGGTTCTAC-1_1,PKD_ACAGAAAAGTACCATC-1_1,PKD_ACAGAAACAATCAAGA-1_1,PKD_ACAGAAACACAAACGG-1_1,PKD_ACAGAAACAGCTCCTT-1_1,PKD_ACAGAAAGTAAGCAAT-1_1,PKD_ACAGAAAGTACCGTCG-1_1,PKD_ACAGAAAGTCCGGTGT-1_1,PKD_ACAGAAAGTCTTTCAT-1_1,PKD_ACAGAAATCCGGACTG-1_1,PKD_ACAGAAATCTCGCAGG-1_1,PKD_ACAGCCGAGATAACGT-1_1,PKD_ACAGCCGAGCGACTAG-1_1,PKD_ACAGCCGAGCGCGTTC-1_1,PKD_ACAGCCGAGCTCCATA-1_1,PKD_ACAGCCGAGGGCAGTT-1_1,PKD_ACAGCCGCACACCGCA-1_1,PKD_ACAGCCGCACGGTGTC-1_1,PKD_ACAGCCGCAGATCATC-1_1,PKD_ACAGCCGGTACTGCGC-1_1,PKD_ACAGCCGGTTAGTCGT-1_1,PKD_ACAGCCGTCACCATGA-1_1,…,Cont_TTTGATCAGCTGGCCT-1_5,Cont_TTTGATCCAAGTCCAT-1_5,Cont_TTTGATCCAGCTGTTA-1_5,Cont_TTTGATCGTACTTGTG-1_5,Cont_TTTGATCGTATGCTAC-1_5,Cont_TTTGATCGTCTTCGAA-1_5,Cont_TTTGATCGTTGCGGAA-1_5,Cont_TTTGATCTCGCGTGCA-1_5,Cont_TTTGATCTCGGTTCAA-1_5,Cont_TTTGGAGAGCATAGGC-1_5,Cont_TTTGGAGAGTAACCTC-1_5,Cont_TTTGGAGAGTGTTGTC-1_5,Cont_TTTGGAGCAAATCGGG-1_5,Cont_TTTGGAGCACAGTCCG-1_5,Cont_TTTGGAGCACTCTAGA-1_5,Cont_TTTGGAGCAGAACTCT-1_5,Cont_TTTGGAGCATCGCTGG-1_5,Cont_TTTGGAGGTACCCAGC-1_5,Cont_TTTGGAGGTATCGTTG-1_5,Cont_TTTGGAGTCGAATCCA-1_5,Cont_TTTGGAGTCTCTATGT-1_5,Cont_TTTGGTTAGGTCCCGT-1_5,Cont_TTTGGTTAGTAAAGCT-1_5,Cont_TTTGGTTCAACCGCCA-1_5,Cont_TTTGGTTCAATCCTAG-1_5,Cont_TTTGGTTGTCCACACG-1_5,Cont_TTTGGTTGTTTCACAG-1_5,Cont_TTTGGTTTCCGCAACG-1_5,Cont_TTTGGTTTCTAGTCAG-1_5,Cont_TTTGTTGAGTCGTCTA-1_5,Cont_TTTGTTGCACAAGCCC-1_5,Cont_TTTGTTGCACGTTCGG-1_5,Cont_TTTGTTGCAGTTAAAG-1_5,Cont_TTTGTTGCATATTCGG-1_5,Cont_TTTGTTGGTACGATTC-1_5,Cont_TTTGTTGGTGCGTTTA-1_5,Cont_TTTGTTGTCTGCGGAC-1_5
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.680108,0.0,0.0,0.770796,0.0,0.0,0.482712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.61749,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.795865,0.0,0.0,0.0,0.771729,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.719765,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.875772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.346982,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# **TASK 1: BASELINE**
>### **Phase 2**
>Cleaning the RDS file and reformatting with mapping and aggregation

In [21]:
'''# mapping new attributes from other dataframes function    
def df_mapping(df1, df2, mapping_attribute = None, key_attribute = None):
    new_df = df2
    new_vals = {}
    count = 1
    unique_instances = df1[mapping_attribute].unique()
    mapping = dict(zip(df1[key_attribute], df1[mapping_attribute]))
    for i in df2.columns:
        new_vals[i] = str(mapping[i]+','+ str(count))
        count = count+1
    print(new_vals)
    new_df = new_df.rename(new_vals)
    #new_df.with_columns(pl.Series(name=mapping_attribute, values=map_values)) 
    #new_df[mapping_attribute] = map_values
    #if new_index == True:
        #new_index = ["{}_{}".format(key_attribute, mapping_attribute) for key_attribute, mapping_attribute in zip(transpose_df.index, map_values)]
        #new_df.index = new_index
    return new_df

def unique_group(df1, grouping_attribute, df2=None):

    # create dictionary of unique instances of passed attribute in first dataframe
    unique_instances = df1[grouping_attribute].unique() 
    grouped_columns = {grouping_attribute: [] for grouping_attribute in unique_instances}
    
    # fill dictionary with second dataframe's data
    if df2 is not None: 
        for col in df2.columns:
            for grouping_attribute in unique_instances:
                if grouping_attribute in col:
                    grouped_columns[grouping_attribute].append(col)
                    break
    # fill dictionary with first dataframe's data
    else:
        for col in df1.columns:
            for grouping_attribute in unique_instances:
                if grouping_attribute in col:
                    grouped_columns[grouping_attribute].append(col)
                    break
    return grouped_columns
    

def aggregate_by_groups(df, grouped_columns):
    # Create an empty Polars DataFrame to store the aggregated results
    aggregated_df = pd.DataFrame()
    
    for group_name, cols in grouped_columns.items():
        if cols:  # Check if any columns exist for that group
            # Calculate the mean for the selected columns
            #mean_values = df.select(*cols).mean(axis=1)
            mean_values = (df.select(cols).mean(axis=1)).to_pandas()
            aggregated_df[group_name] = mean_values

    return aggregated_df'''

In [None]:
# Task 1 output path: the 'task1' subdirectory of output_dir
output_path = os.path.join(output_dir, 'task1')

# Map df1's 'patient' attribute onto df2, using df1's 'name' column as the lookup key.
# Presumably this relabels df2's cell columns with their patient — confirm in Rdata_prep.df_mapping.
# (An earlier approach transposed df2 first: transpose_df = df2.T)
mapped_df2 = dp.df_mapping(df1,df2, mapping_attribute = 'patient', key_attribute = 'name')
mapped_df2

In [11]:
# Sanity check: (rows, columns) of the mapped frame
mapped_df2.shape

(27970, 102710)

In [None]:
# Group mapped_df2's columns by the unique values of df1's 'patient' attribute.
# Presumably a dict {patient: [column names]} — confirm in Rdata_prep.unique_group.
grouped = dp.unique_group(df1, 'patient', df2=mapped_df2)
grouped

In [13]:
!pip install pyarrow



In [23]:
# Aggregate mapped_df2 by patient group and hold the result as a pandas DataFrame.
# Presumably the per-row mean over each patient's columns — confirm in Rdata_prep.aggregate_by_groups.
aggregated_df2 = pd.DataFrame(dp.aggregate_by_groups(mapped_df2, grouped))
aggregated_df2

Unnamed: 0,PKD4,PKD8,control2,control5,control3,PKD6,control4,PKD3,PKD1,PKD7,control1,PKD2,PKD5
0,0.002954,0.004001,0.005165,0.002579,0.005716,0.002991,0.005334,0.001086,0.000753,0.000000,0.003540,0.001885,0.001941
1,0.007549,0.032600,0.011323,0.006374,0.012446,0.007005,0.015720,0.001668,0.004445,0.008997,0.014732,0.007223,0.010352
2,0.022174,0.112791,0.079155,0.032347,0.084424,0.068845,0.117683,0.044862,0.042604,0.068695,0.066192,0.056001,0.041855
3,0.018220,0.046928,0.023825,0.013867,0.020626,0.019986,0.067427,0.018115,0.006373,0.026209,0.031483,0.026931,0.021822
4,0.000547,0.001977,0.009599,0.003132,0.006503,0.003146,0.007133,0.000683,0.002645,0.001423,0.005242,0.001561,0.002424
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27965,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.003724,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
27966,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000559,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
27967,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.001627,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
27968,0.000000,0.000000,0.000000,0.000901,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [24]:
# Put the patient columns into sorted label order
aggregated_df2 = aggregated_df2.reindex(columns=sorted(aggregated_df2.columns))
aggregated_df2

Unnamed: 0,PKD1,PKD2,PKD3,PKD4,PKD5,PKD6,PKD7,PKD8,control1,control2,control3,control4,control5
0,0.000753,0.001885,0.001086,0.002954,0.001941,0.002991,0.000000,0.004001,0.003540,0.005165,0.005716,0.005334,0.002579
1,0.004445,0.007223,0.001668,0.007549,0.010352,0.007005,0.008997,0.032600,0.014732,0.011323,0.012446,0.015720,0.006374
2,0.042604,0.056001,0.044862,0.022174,0.041855,0.068845,0.068695,0.112791,0.066192,0.079155,0.084424,0.117683,0.032347
3,0.006373,0.026931,0.018115,0.018220,0.021822,0.019986,0.026209,0.046928,0.031483,0.023825,0.020626,0.067427,0.013867
4,0.002645,0.001561,0.000683,0.000547,0.002424,0.003146,0.001423,0.001977,0.005242,0.009599,0.006503,0.007133,0.003132
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27965,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.003724,0.000000
27966,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000559,0.000000
27967,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.001627,0.000000
27968,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000901


### Split Train & Test, then Pipeline for Pre-Phase 3

In [28]:
from sklearn.model_selection import train_test_split
def split_data(df, test_size=0.3, random_state=None):
    """Partition ``df`` into a training part and a test part.

    Thin wrapper around ``train_test_split``.

    Parameters
    ----------
    df : the DataFrame to partition.
    test_size : forwarded unchanged to ``train_test_split`` (default 0.3).
    random_state : forwarded unchanged to ``train_test_split`` (default None).

    Returns
    -------
    tuple
        ``(train_df, test_df)``, in that order.
    """
    return tuple(
        train_test_split(df, test_size=test_size, random_state=random_state)
    )

In [29]:
# Use the user-configured test_size and random_seed so the split is reproducible
# across Restart & Run All (without a seed every run yields a different partition).
# NOTE(review): aggregated_df2 has patients as columns, so this splits its rows
# rather than the patients — confirm that is the intended unit.
train_df, test_df = split_data(aggregated_df2, test_size=test_size, random_state=random_seed)
train_df

Unnamed: 0,PKD1,PKD2,PKD3,PKD4,PKD5,PKD6,PKD7,PKD8,control1,control2,control3,control4,control5
4682,0.058530,0.115151,0.049917,0.031766,0.099263,0.148055,0.060874,0.079229,0.181459,0.206016,0.137439,0.223114,0.089336
5478,0.005232,0.093129,0.010274,0.007166,0.008140,0.034626,0.077727,0.000000,0.000989,0.005326,0.001711,0.000400,0.000000
21732,0.001375,0.001636,0.000295,0.000000,0.001256,0.004061,0.000000,0.000000,0.005882,0.003810,0.003133,0.006256,0.001839
24868,0.000000,0.000243,0.000000,0.000737,0.000622,0.000701,0.000000,0.000000,0.000000,0.001361,0.000406,0.000996,0.000000
18199,0.018895,0.006694,0.022041,0.005042,0.003015,0.039180,0.015758,0.089860,0.002654,0.004407,0.002012,0.002453,0.001323
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16990,0.021699,0.070669,0.019897,0.016170,0.089503,0.083157,0.025825,0.063774,0.111850,0.144766,0.066968,0.145112,0.049720
24300,0.000000,0.000916,0.000000,0.000000,0.000542,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
14980,0.041159,0.048345,0.039836,0.021471,0.064940,0.032385,0.029096,0.030616,0.119178,0.161531,0.106662,0.160736,0.082489
18383,0.000514,0.001109,0.002086,0.000000,0.003125,0.002497,0.000000,0.000000,0.001009,0.002684,0.000000,0.001674,0.001158


In [41]:

def drop_duplicates(df):
    """Return ``df`` without duplicated rows and report the row counts.

    Prints the number of rows before and after the removal. The input
    frame is left untouched; a new DataFrame is returned.
    """
    print('\nDuplicate row removal:')
    print('Sample count before: ', df.shape[0])
    deduplicated = df.drop_duplicates()
    print('Sample count after: ', deduplicated.shape[0])

    return deduplicated
def drop_null(df):
    """Return ``df`` without rows that contain a null or a zero.

    Two filters are applied in sequence:
      1. rows with at least one null value are removed;
      2. of the remaining rows, those with at least one entry equal to 0
         are removed as well.

    Prints the number of rows before and after. The input frame is left
    untouched; a new DataFrame is returned.
    """
    complete_rows = df.dropna()

    # A row survives only if none of its entries equals zero
    contains_zero = (complete_rows == 0).any(axis=1)
    survivors = complete_rows[~contains_zero]

    print('\nNull row removal:')
    print('Sample count before: ', df.shape[0])
    print('Sample count after: ', survivors.shape[0])

    return survivors

def drop_out_of_domain(df, std = 2):
    """Drop rows whose mean over the numeric columns is unusually high.

    The mean of every row is taken over the numeric columns only. A row is
    flagged as out-of-domain when its mean exceeds
    ``mean(row_means) + std * stddev(row_means)``; only this upper bound is
    checked. Flagged rows are removed from ``df`` (all columns are kept) and
    a short report is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    std : number, default 2
        How many standard deviations above the average row mean a row mean
        may lie before the row is dropped.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the flagged rows.
    """
    # Get only numeric data to identify rows with out-of-domain properties
    df_numeric = numeric_only(df)

    # Calculate the mean and standard deviation of all row means
    print('\nOut-of-domain row removal:')
    all_rows_mean = df_numeric.mean(axis=1)
    threshold = all_rows_mean.mean() + std * all_rows_mean.std()
    print('Threshold =', std, ' standard deviations')

    # The row means are already in all_rows_mean, so select the offending
    # labels with a boolean mask instead of re-computing each mean in a
    # Python-level iterrows() loop.
    out_of_domain_indices = all_rows_mean[all_rows_mean > threshold].index

    df_reduced = df.drop(out_of_domain_indices)
    print('Number of rows with out-of-domain properties: ',len(out_of_domain_indices))
    print('\nSample count before: ', len(df.index))
    print('Sample count after: ', len(df_reduced.index))

    return df_reduced

def numeric_only(df):
    """
    Return an independent copy of ``df`` restricted to its numeric columns.
    """
    return df.select_dtypes(include='number').copy()

In [42]:
# Clean the training set: de-duplicate, drop null/zero rows, then trim out-of-domain rows
train_clean = (
    train_df
    .pipe(drop_duplicates)
    .pipe(drop_null)
    .pipe(drop_out_of_domain)
)


Duplicate row removal:
Sample count before:  19579
Sample count after:  19330

Null row removal:
Sample count before:  19330
Sample count after:  10762

Out-of-domain row removal:
Threshold = 2  standard deviations
Number of rows with out-of-domain properties:  20

Sample count before:  10762
Sample count after:  10742


In [43]:
# Preview the cleaned training set
train_clean

Unnamed: 0,PKD1,PKD2,PKD3,PKD4,PKD5,PKD6,PKD7,PKD8,control1,control2,control3,control4,control5
4682,0.058530,0.115151,0.049917,0.031766,0.099263,0.148055,0.060874,0.079229,0.181459,0.206016,0.137439,0.223114,0.089336
18199,0.018895,0.006694,0.022041,0.005042,0.003015,0.039180,0.015758,0.089860,0.002654,0.004407,0.002012,0.002453,0.001323
20389,0.027754,0.066401,0.034147,0.026015,0.059349,0.091447,0.043421,0.053518,0.108313,0.135050,0.097840,0.150244,0.060836
21797,0.132074,0.240895,0.186061,0.102726,0.282600,0.370735,0.207769,0.347986,0.306554,0.328283,0.252108,0.341737,0.159480
2087,0.001342,0.005579,0.005105,0.001497,0.003724,0.001917,0.003834,0.002363,0.002789,0.004038,0.002666,0.003522,0.001846
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8680,0.022339,0.151898,0.055156,0.033048,0.023833,0.041858,0.176863,0.023139,0.034133,0.123499,0.041477,0.197250,0.030462
3815,0.000868,0.014163,0.001706,0.010684,0.007468,0.003278,0.011897,0.007250,0.024857,0.016835,0.022873,0.014290,0.013376
9014,0.092936,0.277593,0.067824,0.096954,0.257435,0.239399,0.192993,0.362140,0.127688,0.165564,0.165822,0.177049,0.105570
16990,0.021699,0.070669,0.019897,0.016170,0.089503,0.083157,0.025825,0.063774,0.111850,0.144766,0.066968,0.145112,0.049720


In [None]:
# Transpose
#transposed_df2 = mapped_df2.transpose(include_header=True)
#transposed_df2

In [None]:
'''def unique_group(df1, grouping_attribute, df2=None):

    # create dictionary of unique instances of passed attribute in first dataframe
    unique_instances = df1[grouping_attribute].unique() 
    grouped_columns = {grouping_attribute: [] for grouping_attribute in unique_instances} 

    # fill dictionary with second dataframe's data
    if df2 is not None: 
        for col in df2.columns:
            for grouping_attribute in unique_instances:
                if grouping_attribute in col:
                    grouped_columns[grouping_attribute].append(col)
                    break
    # fill dictionary with first dataframe's data
    else:
        for col in df1.columns:
            for grouping_attribute in unique_instances:
                if grouping_attribute in col:
                    grouped_columns[grouping_attribute].append(col)
                    break
    return grouped_columns
    
def aggregate_by_group(df, grouping_attribute, grouped_columns):
    aggregated_df = pl.DataFrame()
    for grouping_attribute, cols in grouped_columns.items():
        if cols:  # Check if any columns exist for that patient
            aggregated_df[grouping_attribute] = df[cols].mean(axis=1)
    return aggregated_df'''

In [None]:
# group columns by unique instances of attribute in df1 and fill dictionary with df2 data
#grouped_columns = unique_group(df1, 'patient', df2=mapped_df2)
#aggregated_df2 = aggregate_by_group(df2, 'patient', grouped_columns)

In [None]:
'''# File 1: Read URL of the compressed CSV 
df1 = mu.read_url(url)
mu.save_to_csv(df1, input_dir, "demographic_clinical_data.csv")
matrix1 = df1.values

# File 2: Read rds from filepath in 'data' directory
rds_path = os.path.join(input_dir, rds) # create path to download existing rds file
matrix2, cell, gene = my_utils.read_rds(rds_path)
df2 = pd.DataFrame(matrix2, columns=cell, index = gene)
#my_utils.save_to_csv(df2, input_dir, "raw_molecular_data.csv") #save to 'data' directory
'''