# Cleaning data

**Import** data from "data/raw.csv"

**Export** to "data/cleaned.csv"

In [1]:
# Setup

import numpy as np
import pandas as pd

from pkg import utils
from pkg import clean

In [2]:
# Importing data

parent = 'data'
filename = 'raw.csv'

path = utils.get_path(parent,filename)

df = pd.read_csv(path, index_col='id_estudo')

In [3]:
y = df['bayley_3_t1']
X = df.iloc[:,:len(df.columns)-5]

print(y)
X.head()

id_estudo
7      100
8       75
14      95
24     115
26     105
      ... 
554    115
555    105
556    115
558    115
560    105
Name: bayley_3_t1, Length: 313, dtype: int64


Unnamed: 0_level_0,dominance_simpson,12DICHLORETHDEG-PWY,AEROBACTINSYN-PWY,ALLANTOINDEG-PWY,CRNFORCAT-PWY,DENITRIFICATION-PWY,DHGLUCONATE-PYR-CAT-PWY,DTDPRHAMSYN-PWY,METH-ACETATE-PWY,P108-PWY,...,delivery_mode,chaos_tot_t1,epds_2c_t1,bisq_3_mins_t1,bisq_4_mins_t1,bisq_9_mins_t1,bisq_sleep_prob_t1,ebia_tot_t1,educationLevelAhmedNum_t1,a10_t1
id_estudo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0.136112,0.0,0.0,7.676483,0.0,0.0,0.0,4395.178167,0.0,171.940413,...,1,4.0,0,480.0,540.0,239.0,1.0,0,16,3
8,0.451324,0.0,0.0,14.984719,0.0,320.494755,0.0,4478.753445,0.0,43.380158,...,1,1.0,0,570.0,240.0,60.0,0.0,0,20,1
14,0.281675,0.0,209.593817,0.0,0.0,0.0,0.0,2755.525229,0.0,0.0,...,1,2.0,0,720.0,90.0,30.0,0.0,0,16,1
24,0.945041,0.0,0.0,0.0,0.0,26.374059,0.0,2725.705501,0.0,50.36835,...,3,0.0,0,600.0,360.0,30.0,0.0,2,16,5
26,0.338783,0.0,0.0,0.0,0.0,0.0,0.0,725.879153,0.0,0.0,...,1,7.0,0,420.0,480.0,120.0,1.0,0,12,2


## Grouping bacterias

In [4]:
bacteria_list = [i for i in X.columns if i.startswith("k__")]
df_bacteria = X[bacteria_list]

print(df_bacteria.index)

Index([  7,   8,  14,  24,  26,  28,  29,  32,  58,  59,
       ...
       547, 548, 551, 552, 553, 554, 555, 556, 558, 560],
      dtype='int64', name='id_estudo', length=313)


In [5]:
# taking all columns with bacteria
# cutting taxonomy at index 'cutting_reference'
# generating reduced column list

cutting_reference = 4

new_bacteria_list = []

for i, specie in enumerate(bacteria_list):
    new_name_list = []
    new_name = clean.extract_first_n_groups(specie,cutting_reference)
    if new_name not in new_bacteria_list:
        new_bacteria_list.append(new_name)

print("initial number of columns            : ", len(bacteria_list))
print("cutting index reference used         : ", cutting_reference)
print("new bacteria list using aplying cut  : ", len(new_bacteria_list))

initial number of columns            :  20
cutting index reference used         :  4
new bacteria list using aplying cut  :  7


In [6]:
# Applying list to df
runned1 = False
# run just once!
if not runned1:
    df_bacteria_grouped = clean.group_columns_startswith(df_bacteria, new_bacteria_list)
    df_bacteria_grouped.sum(axis=0)
    runned2 = True

In [7]:
runned2 = False
if not runned2:
    non_bacteria_df = X.loc[:,~X.columns.str.startswith('k__')]
    runned1 = True

In [8]:
# Concatenate both df
cleaned = pd.concat([y, non_bacteria_df, df_bacteria_grouped], axis=1)
ordered_columns = cleaned.columns


# Arranging order for columns
n_bacterias = df_bacteria.shape[1]
total_columns = len(ordered_columns)

no_bac = list(ordered_columns[:total_columns - n_bacterias])
all_bac = list(ordered_columns[-n_bacterias:])

new_columns = no_bac[:92] + all_bac + no_bac[92:]

cleaned = cleaned[new_columns]

cleaned.head()

Unnamed: 0_level_0,bayley_3_t1,dominance_simpson,12DICHLORETHDEG-PWY,AEROBACTINSYN-PWY,ALLANTOINDEG-PWY,CRNFORCAT-PWY,DENITRIFICATION-PWY,DHGLUCONATE-PYR-CAT-PWY,DTDPRHAMSYN-PWY,METH-ACETATE-PWY,...,RH_temporal_theta_t1,Occipital_low_alpha_t1,RH_lateral_frontal_high_alpha_t1,LH_lateral_frontal_beta_t1,LH_parietal_beta_t1,RH_temporal_beta_t1,LH_temporal_beta_t1,b04_t1,renda_familiar_total_t0,a08_t1
id_estudo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,100,0.136112,0.0,0.0,7.676483,0.0,0.0,0.0,4395.178167,0.0,...,11.066145,6.399995,1.98456,1.480601,1.164159,2.23302,1.577486,1.0,8.517593,1.0
8,75,0.451324,0.0,0.0,14.984719,0.0,320.494755,0.0,4478.753445,0.0,...,7.316511,7.473895,1.252036,1.073858,0.973935,0.806549,1.251462,1.0,10.404323,1.0
14,95,0.281675,0.0,209.593817,0.0,0.0,0.0,0.0,2755.525229,0.0,...,10.349887,10.194401,2.383895,1.754353,1.708441,1.437741,2.593411,1.0,9.588914,1.0
24,115,0.945041,0.0,0.0,0.0,0.0,26.374059,0.0,2725.705501,0.0,...,11.525351,7.34336,2.418379,3.36727,2.326717,3.000494,4.347992,2.0,7.601402,2.0
26,105,0.338783,0.0,0.0,0.0,0.0,0.0,0.0,725.879153,0.0,...,10.511222,6.003053,1.277228,1.866852,1.555874,2.296068,1.590318,2.0,7.147559,2.0


In [9]:
# Exporting data

cleaned = cleaned

run_save = False
if not run_save:
  utils.save_df(cleaned, "cleaned.csv")
  run_save = True

csv file saved on:  c:\Users\ptons\Code\repositories\brainwise\data\cleaned.csv
