## Auditory N400: Data Cleaning
Elizabeth Pierotti // last updated 5.26.2020

**Goal of this notebook:** import two DataFrames (dfs): one containing all data and one containing the difference-wave data. For each df, drop the irrelevant columns and rename the remaining columns. Next, remove outliers based on MNA (mean amplitude) values in both dfs. Finally, save the dfs to the processed data directory so they can be used for plotting and stats.


### Set up:

In [1]:
# import packages:
import pandas as pd
import numpy as np
from glob import glob
import os

In [2]:
# Switch the working directory to the folder that holds AnalysisHelper.py
# before importing it below.
helper_module_path = os.path.abspath('/Users/elizabethpierotti/Documents/GitHub/PSC-290-Python-Final-Project/AnalysisHelper.py')
helper_module_dir = os.path.dirname(helper_module_path)
os.chdir(helper_module_dir)

# Project helper functions; later cells call them through the `ah` alias:
import AnalysisHelper as ah

In [3]:
# Input and output directories for this notebook.
# Both are absolute paths that end with '/'.

output_directory = "/Users/elizabethpierotti/Documents/GitHub/PSC-290-Python-Final-Project/pierotti/data/processed/"
input_directory = "/Users/elizabethpierotti/Documents/GitHub/PSC-290-Python-Final-Project/pierotti/data/interim/"

In [4]:
# Read the two interim CSV files from input_directory into DataFrames:
#   alldf  - loaded from alldatafname
#   diffdf - loaded from diffdatafname

alldatafname = 'Merged_Data.csv'
# NOTE(review): this file name contains " copy" - confirm it is the intended
# input and not a leftover duplicate.
diffdatafname = 'MNA_diff_data copy.csv'

# os.path.join inserts a path separator only when one is needed, so the paths
# stay correct whether or not input_directory ends with '/'.
alldf = pd.read_csv(os.path.join(input_directory, alldatafname))
diffdf = pd.read_csv(os.path.join(input_directory, diffdatafname))

# preview all data df:
alldf

Unnamed: 0,Subject,Group,Condition,Electrode,MNA,Gender,Bilingual,Age_mos,TiS_mos,Age_act_mos,ROWPVT,EOWPVT,TROG,Region
0,942,CI,Related,Fz,-15.608,F,no,56,43,13.0,77.0,88.0,,Frontal
1,959,CI,Related,Fz,-8.930,F,no,57,17,40.0,79.0,91.0,,Frontal
2,925,CI,Related,Fz,-12.589,M,yes,58,52,6.0,83.0,93.0,,Frontal
3,956,CI,Related,Fz,-7.367,F,yes,58,5,53.0,66.0,61.0,,Frontal
4,955,CI,Related,Fz,-2.817,M,yes,61,49,12.0,39.0,32.0,,Frontal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,809,HG,Unrelated,Fz,-13.547,F,no,99,99,,97.0,87.0,75.0,Frontal
82,N4C09,HG,Unrelated,Fz,-11.510,M,yes,99,99,,,,,Frontal
83,850,HG,Unrelated,Fz,-22.286,F,no,124,124,,61.0,37.0,40.0,Frontal
84,801,HG,Unrelated,Fz,-24.316,M,yes,127,127,,,,50.0,Frontal


In [5]:
# Preview the difference df (the DataFrame loaded from diffdatafname):
diffdf

Unnamed: 0,Subject,Group,Condition,Electrode,MNA,Gender,Bilingual,Age_mos,TiS_mos,Age_act_mos,ROWPVT,EOWPVT,TROG,Region,MNA_diff
0,942,CI,Unrelated,Fz,-19.676,F,no,56,43,13.0,77.0,88.0,,Frontal,-4.068
1,959,CI,Unrelated,Fz,-15.051,F,no,57,17,40.0,79.0,91.0,,Frontal,-6.121
2,925,CI,Unrelated,Fz,-15.04,M,yes,58,52,6.0,83.0,93.0,,Frontal,-2.451
3,956,CI,Unrelated,Fz,-20.795,F,yes,58,5,53.0,66.0,61.0,,Frontal,-13.428
4,955,CI,Unrelated,Fz,-2.808,M,yes,61,49,12.0,39.0,32.0,,Frontal,0.009
5,904,CI,Unrelated,Fz,-20.64,M,yes,67,54,13.0,13.0,5.0,,Frontal,-0.795
6,911,CI,Unrelated,Fz,-17.359,F,no,67,54,13.0,27.0,50.0,,Frontal,-1.122
7,919,CI,Unrelated,Fz,-15.457,F,no,74,67,7.0,75.0,75.0,,Frontal,-1.454
8,954,CI,Unrelated,Fz,-56.069,M,yes,74,61,13.0,37.0,45.0,,Frontal,-16.633
9,945,CI,Unrelated,Fz,-16.922,M,no,77,56,21.0,27.0,8.0,,Frontal,-8.067


In [6]:
# Remove irrelevant columns from each df.
# Each df has its own text file (named for its relevant columns); the file
# path and the df are passed to ah.filter_columns, and the result replaces
# the df.

alldf_rel_cols = '/Users/elizabethpierotti/Documents/GitHub/PSC-290-Python-Final-Project/pierotti/data/relevant_cols.txt'
alldf = ah.filter_columns(alldf_rel_cols, alldf)

diffdf_rel_cols = '/Users/elizabethpierotti/Documents/GitHub/PSC-290-Python-Final-Project/pierotti/data/diff relevant cols.txt'
diffdf = ah.filter_columns(diffdf_rel_cols, diffdf)

# Show (rows, columns) of both dfs after filtering:
print(alldf.shape, diffdf.shape)

(86, 8) (43, 9)


In [7]:
# Rename the columns of each df.
# Each df has its own column-names text file; the file path and the df are
# passed to ah.rename_columns, and the result replaces the df.

alldf_names = '/Users/elizabethpierotti/Documents/GitHub/PSC-290-Python-Final-Project/pierotti/data/col_names.txt'
alldf = ah.rename_columns(alldf_names, alldf)

diffdf_names = '/Users/elizabethpierotti/Documents/GitHub/PSC-290-Python-Final-Project/pierotti/data/diff_col_names.txt'
diffdf = ah.rename_columns(diffdf_names, diffdf)

# Show the resulting column labels of both dfs:
print(alldf.columns, diffdf.columns)

Index(['ID', 'Group', 'Condition', 'Mean Amplitude', 'Age (months)',
       'Time in Sound (months)', 'Age of Activation (months)', 'Region'],
      dtype='object') Index(['ID', 'Group', 'Mean Amplitude', 'Bilingual', 'Age (months)',
       'Time in Sound (months)', 'Age of Activation (months)', 'Region',
       'Mean Amplitude Difference'],
      dtype='object')


In [8]:
# Remove outliers from each df with ah.remove_outliers.
# Both dfs are filtered on the same column, 'Mean Amplitude' (MNA).
mna_column = 'Mean Amplitude'

alldf = ah.remove_outliers(alldf, mna_column)
diffdf = ah.remove_outliers(diffdf, mna_column)

# Show (rows, columns) of both dfs after outlier removal:
print(alldf.shape, diffdf.shape)

(81, 8) (40, 9)


In [9]:
# Save both cleaned dfs as CSV files in output_directory.
# index=False leaves the DataFrame index out of the written files.
# os.path.join inserts a path separator only when one is needed, so the paths
# stay correct whether or not output_directory ends with '/'.

alldf.to_csv(os.path.join(output_directory, 'all_data_processed.csv'), index=False)
diffdf.to_csv(os.path.join(output_directory, 'diff_data_processed.csv'), index=False)