# Cleaning data

**Import** data from "data/raw.csv"

**Export** to "data/cleaned.csv"

In [1]:
# Setup

import numpy as np
import pandas as pd

from pkg import utils
from pkg import clean

In [2]:
# Importing data
# The raw CSV is located relative to whatever utils.get_parents() returns
# (presumably the project root — TODO confirm); its first column becomes
# the row index of the frame.

project_root = utils.get_parents()
path = project_root + r"/data/raw.csv"

df = pd.read_csv(path, index_col=0)
df.head()

Unnamed: 0_level_0,dominance_simpson,12DICHLORETHDEG-PWY,AEROBACTINSYN-PWY,ALLANTOINDEG-PWY,CRNFORCAT-PWY,DENITRIFICATION-PWY,DHGLUCONATE-PYR-CAT-PWY,DTDPRHAMSYN-PWY,METH-ACETATE-PWY,P108-PWY,...,bisq_9_mins_t1,bisq_sleep_prob_t1,ebia_tot_t1,educationLevelAhmedNum_t1,a10_t1,bmi_pregest_t1,ibq_reg_t1,ibq_soot_t1,ibq_dura_t1,bayley_3_t1
id_estudo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0.136112,0.0,0.0,7.676483,0.0,0.0,0.0,4395.178167,0.0,171.940413,...,239.0,1.0,0,16,3,32.36855,4.654762,5.857143,2.333333,100
8,0.451324,0.0,0.0,14.984719,0.0,320.494755,0.0,4478.753445,0.0,43.380158,...,60.0,0.0,0,20,1,20.79673,4.62619,4.428571,3.6,75
14,0.281675,0.0,209.593817,0.0,0.0,0.0,0.0,2755.525229,0.0,0.0,...,30.0,0.0,0,16,1,21.36752,5.872024,5.571429,4.75,95
24,0.945041,0.0,0.0,0.0,0.0,26.374059,0.0,2725.705501,0.0,50.36835,...,30.0,0.0,2,16,5,38.51406,5.684524,4.571429,5.0,115
26,0.338783,0.0,0.0,0.0,0.0,0.0,0.0,725.879153,0.0,0.0,...,120.0,1.0,0,12,2,22.94812,5.369048,6.142857,3.0,105


## Grouping bacteria

In [3]:
# Taxonomy columns are recognised by their "k__" prefix.
bacteria_list = [col for col in df.columns if col.startswith("k__")]

# Take an explicit copy: df_bacteria is modified by later cells, and working
# on a plain selection of df can trigger pandas' SettingWithCopyWarning when
# columns are assigned on it.
df_bacteria = df[bacteria_list].copy()

print(bacteria_list)

['k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_radingae', 'k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_sp_HMSC035G02', 'k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_sp_HPA0247', 'k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_sp_oral_taxon_181', 'k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces|s__Actinomyces_urogenitalis', 'k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Varibaculum|s__Varibaculum_cambriense', 'k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium|s__Bifidobacterium_adolescentis', 'k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifido

In [5]:
# Shorten every bacteria column name with clean.extract_first_n_groups
# (presumably keeps the first `cutting_reference` taxonomy levels — confirm
# in pkg.clean) and collect the distinct shortened names, preserving the
# order in which they first appear.

cutting_reference = 4

new_bacteria_list = []

for specie in bacteria_list:
    new_name = clean.extract_first_n_groups(specie, cutting_reference)
    # Membership test keeps the list free of duplicates while preserving order.
    if new_name not in new_bacteria_list:
        new_bacteria_list.append(new_name)


print("initial number of columns            : ", len(bacteria_list))
print("cutting index reference used         : ", cutting_reference)
print("new bacteria list using aplying cut  : ", len(new_bacteria_list))

initial number of columns            :  20
cutting index reference used         :  4
new bacteria list using aplying cut  :  7


In [7]:
# Applying list to df
# Merge the bacteria columns according to the reduced names in
# new_bacteria_list (the exact rule lives in clean.merge_columns_startswith).
#
# NOTE: this cell replaces df_bacteria with the merged frame. A boolean
# "run once" flag initialised inside the cell cannot protect against a second
# execution (it is reset on every run), so none is used here. To start over,
# re-run the cell that builds df_bacteria before running this one again.
df_bacteria = clean.merge_columns_startswith(df_bacteria, new_bacteria_list)

# Per-column totals of the merged frame, displayed as the cell output so the
# result of the merge can be checked by eye.
df_bacteria.sum(axis=0)


In [8]:
# Build the final frame: the non-bacteria columns of df plus the merged
# bacteria columns, with the bacteria block inserted at a fixed position.

# Number of non-bacteria columns that stay in front of the bacteria block.
# NOTE(review): the value 92 is carried over unchanged; presumably it is the
# position the k__ columns had in the raw file — confirm.
BACTERIA_INSERT_POSITION = 92

# Filter into a new name instead of overwriting df, so the cells that read the
# k__ columns from df can still be re-run after this one.
df_no_bacteria = df.loc[:, ~df.columns.str.startswith('k__')]

# NOTE(review): the saved output of this cell shows integer columns (e.g.
# ebia_tot_t1, bayley_3_t1) as floats and the index name "id_estudo" missing,
# which is what pd.concat(axis=1) produces when the two indexes differ.
# The check below only reports the situation; it does not change the data.
if not df_no_bacteria.index.equals(df_bacteria.index):
    print("WARNING: df and df_bacteria have different indexes; "
          "pd.concat(axis=1) will introduce NaN-filled rows")

# Concatenate both df
cleaned = pd.concat([df_no_bacteria, df_bacteria], axis=1)

# Arranging order for columns
# Column lists are taken from the source frames rather than sliced off the
# concatenated columns with a negative index: `columns[-0:]` would return
# every column when df_bacteria is empty and duplicate the whole frame.
no_bac = list(df_no_bacteria.columns)
all_bac = list(df_bacteria.columns)

new_columns = (
    no_bac[:BACTERIA_INSERT_POSITION]
    + all_bac
    + no_bac[BACTERIA_INSERT_POSITION:]
)

cleaned = cleaned[new_columns]

cleaned.head()

Unnamed: 0,dominance_simpson,12DICHLORETHDEG-PWY,AEROBACTINSYN-PWY,ALLANTOINDEG-PWY,CRNFORCAT-PWY,DENITRIFICATION-PWY,DHGLUCONATE-PYR-CAT-PWY,DTDPRHAMSYN-PWY,METH-ACETATE-PWY,P108-PWY,...,bisq_9_mins_t1,bisq_sleep_prob_t1,ebia_tot_t1,educationLevelAhmedNum_t1,a10_t1,bmi_pregest_t1,ibq_reg_t1,ibq_soot_t1,ibq_dura_t1,bayley_3_t1
7,0.136112,0.0,0.0,7.676483,0.0,0.0,0.0,4395.178167,0.0,171.940413,...,239.0,1.0,0.0,16.0,3.0,32.36855,4.654762,5.857143,2.333333,100.0
8,0.451324,0.0,0.0,14.984719,0.0,320.494755,0.0,4478.753445,0.0,43.380158,...,60.0,0.0,0.0,20.0,1.0,20.79673,4.62619,4.428571,3.6,75.0
14,0.281675,0.0,209.593817,0.0,0.0,0.0,0.0,2755.525229,0.0,0.0,...,30.0,0.0,0.0,16.0,1.0,21.36752,5.872024,5.571429,4.75,95.0
24,0.945041,0.0,0.0,0.0,0.0,26.374059,0.0,2725.705501,0.0,50.36835,...,30.0,0.0,2.0,16.0,5.0,38.51406,5.684524,4.571429,5.0,115.0
26,0.338783,0.0,0.0,0.0,0.0,0.0,0.0,725.879153,0.0,0.0,...,120.0,1.0,0.0,12.0,2.0,22.94812,5.369048,6.142857,3.0,105.0


In [9]:
# Exporting data
# utils.save_df receives the frame and the file name "cleaned.csv"; the
# destination directory is decided by the helper (the recorded output shows
# it landing in the project's data folder).
# No "run once" flag is used: a flag initialised inside the cell is reset on
# every execution and therefore never prevents a second save.
utils.save_df(cleaned, "cleaned.csv")

csv file saved on:  c:\Users\ptons\Code\repositories\brainwise\data\cleaned.csv
