In [None]:
import gzip
import os
import re
import tarfile

import numpy as np
import pandas as pd

## Build ETL Pipeline

In [None]:
def get_by_geoid(ids, age_in_months_ids=('GSE36064',)):
    """Fetch GEO series and collect age plus probe values for each sample.

    Every accession in ``ids`` is loaded with ``GEOparse.get_GEO`` (using the
    current directory as ``destdir``). A sample (GSM) contributes a record
    when its ``characteristics_ch1`` metadata has an entry containing "age"
    (case-insensitive) from which a number can be parsed.

    Parameters
    ----------
    ids : iterable of str
        GEO series accessions to load, e.g. ``['GSE20067']``.
    age_in_months_ids : collection of str, optional
        Accessions whose parsed age is divided by 12 (presumably because the
        series reports age in months -- confirm against the GEO record).
        Defaults to ``('GSE36064',)``.

    Returns
    -------
    dict
        ``{gsm_name: record}``. Each ``record`` is a dict with the keys
        ``'name'``, ``'dataset'`` and ``'age'`` (float), plus one key per
        ``ID_REF`` of the sample table mapped to that row's ``VALUE``.

    Raises
    ------
    ImportError
        If the ``GEOparse`` package is not installed.

    Notes
    -----
    * A series that cannot be loaded is reported with ``print`` and skipped.
    * A sample whose age or table cannot be processed is reported and skipped.
    * If the first sample of a series has no age entry, its characteristics
      are printed and the remaining samples of that series are not examined.
    * After each series the running number of collected samples is printed.
    """
    # Imported once, outside the per-series try/except, so that a missing
    # package raises ImportError instead of being printed and skipped per id.
    import GEOparse

    new_indiv = {}
    # Removes everything except digits and dots: 'age: 34.5 yrs' -> '34.5'.
    non_decimal = re.compile(r'[^\d.]+')

    for geo_id in ids:
        # Ignore corrupt GEO data
        try:
            gse = GEOparse.get_GEO(geo=geo_id, destdir="./")
        except Exception as e:
            print(e)
            continue

        # Logging: print the first individual's age entry once per series
        printed = False

        for gsm_name, gsm in gse.gsms.items():
            indiv_data = {'name': gsm_name, 'dataset': geo_id}

            # A sample without 'characteristics_ch1' simply has no age entry;
            # look the key up safely instead of raising KeyError.
            characteristics = gsm.metadata.get('characteristics_ch1', [])
            age_cols = [k for k in characteristics if 'age' in k.lower()]

            if age_cols:
                # Ignore individual if required data is unavailable
                try:
                    if not printed:
                        print(geo_id, True, age_cols[0])
                        printed = True

                    # Keep the raw string in the record first so it shows up
                    # in the diagnostic print if the conversion fails.
                    indiv_data['age'] = age_cols[0]
                    age = float(non_decimal.sub('', indiv_data['age']))
                    if geo_id in age_in_months_ids:
                        age /= 12
                    indiv_data['age'] = age

                    # Column-wise copy of the probe table; avoids building a
                    # Series per row the way iterrows() does.
                    table = gsm.table
                    indiv_data.update(zip(table['ID_REF'], table['VALUE']))

                    new_indiv[gsm_name] = indiv_data
                except Exception as e:
                    print(e)
                    print(indiv_data)

            if not printed:
                # First sample had no usable age entry: show what metadata it
                # does have and give up on the rest of this series.
                try:
                    print(gsm.metadata['characteristics_ch1'])
                except KeyError:
                    print(geo_id, False)
                break
        print(len(new_indiv))

    return new_indiv

## Get Healthy Patient Data

In [None]:
# GEO series accessions that make up the healthy cohort.
# NOTE(review): 'GSE20067' is also the only entry of `diabetes_ids` in the
# next cell, so its samples end up in both cohorts -- confirm this is intended.
healthy_ids = [
    'GSE20067',
    'GSE20236',
    'GSE20242',
    'GSE27097',
    'GSE27317',
    'GSE32149',
    'GSE34257',
    'GSE34869',
    'GSE36064',
    'GSE36642',
    'GSE37008',
    'GSE41169',
    'GSE53128',
    'GSE65638',
]

# Loads every series through get_by_geoid; result is {gsm_name: record}.
healthy_individuals = get_by_geoid(healthy_ids)

In [None]:
# Diabetes cohort: a single GEO series.
# NOTE(review): `diabetes_indivduals` is misspelled ("indivduals"), but the
# cell that builds `diabetes_df` uses the same spelling -- rename both together.
diabetes_ids = ['GSE20067']
diabetes_indivduals = get_by_geoid(diabetes_ids)

## Data to dataframe

In [None]:
# One row per sample (GSM id); columns are name/dataset/age plus one per probe.
# Uses `healthy_individuals`, the name assigned by the fetch cell above.
healthy_df = pd.DataFrame(healthy_individuals).transpose()

In [None]:
# One row per sample (GSM id); columns are name/dataset/age plus one per probe.
diabetes_df = pd.DataFrame(diabetes_indivduals).T

In [None]:
# Size before cleaning, as (rows, columns); rows are the GSM samples.
healthy_df.shape

Drop bad data: samples from datasets that don't actually give us the same CpG probes that all the other datasets give us, and then any column that still contains NaN. The probe `cg19761273` is used as the filter because a negative or missing value there quickly drops the bad GEO datasets from our dataset.

In [None]:
# Keep only the samples whose 'cg19761273' value is present and not negative,
# then discard every column that still contains a missing value.
healthy_df = (
    healthy_df
    .loc[lambda df: df['cg19761273'].notna() & ~(df['cg19761273'] < 0)]
    .dropna(axis=1)
)

In [None]:
# Size after dropping the bad rows and the NaN-containing columns.
healthy_df.shape

We will trim rows from our diabetes dataset after filtering for our methylation sites. The methylation sites will be selected from the healthy individuals.

In [None]:
# Size of the diabetes frame; no filtering has been applied to it yet.
diabetes_df.shape

## Export Data

In [None]:
# Write both frames as CSV into the working directory. With to_csv's default
# settings the index (the GSM sample id) is written as the first column.
healthy_df.to_csv('healthy_raw.csv')
diabetes_df.to_csv('diabetes_raw.csv')