In [1]:
import pandas as pd
import os

### 1 - Load and concatenate all the CSV files
The empty columns will be dropped

In [2]:
# Load all CSV files from `../data/interim` into a list of DataFrames
# Add a new column to each DataFrame with the name of the file
csv_path = os.path.join('..', 'data', 'interim')
dfs = []

for f in os.listdir(csv_path):
    if f.endswith('.csv'):
        df = pd.read_csv(os.path.join(csv_path, f))
        df['file'] = f
        dfs.append(df)

In [3]:
# Check the dimension of each DataFrame
dims = [df.shape for df in dfs]

for i in range(len(dims)):
    print(dims[i])


print()

# Check the columns of each DataFrame and check the differences
columns = [df.columns for df in dfs]
diffs = [list(set(columns[0]) - set(col)) for col in columns[1:]]

for i in range(len(diffs)):
    print(diffs[i])

(33551, 404)
(33908, 413)
(35105, 408)
(34473, 413)
(32233, 447)
(34718, 429)

['PROC42D', 'RACERFL', 'SERVICES', 'PROC5R', 'PROC8R', 'PROC1', 'PROC6R', 'RETINAL', 'PROC8', 'FAMPLAN', 'PROC53D', 'ERADMHOS', 'SPLINT', 'PROC82D', 'PROC4', 'PROC73D', 'TOTSERV', 'PROC3', 'PROC72D', 'PROC92D', 'PROC9R', 'PROC6', 'PROC9', 'PROC1R', 'PROC5', 'PROC93D', 'PROC7R', 'EXAM', 'FOOT', 'PROC4R', 'PROC43D', 'PROC22D', 'PROC12D', 'HIVTEST', 'PROC2', 'PROC23D', 'PROC83D', 'RACERETH', 'PROC62D', 'PROC13D', 'PROC3R', 'PROC7', 'PROC32D', 'PROC33D', 'PROC63D', 'PROC2R', 'PROC52D', 'OTHSERV', 'CAST']
['PROC42D', 'RACERFL', 'SERVICES', 'PROC5R', 'PROC8R', 'PROC1', 'PROC6R', 'RETINAL', 'ECHOCARD', 'ETHIM', 'PROC8', 'EPROLSTO', 'FAMPLAN', 'PROC53D', 'ERADMHOS', 'SPLINT', 'PROC82D', 'PROC4', 'PROC73D', 'OTHULTRA', 'TOTSERV', 'PROC3', 'PROC92D', 'PROC72D', 'PROC9R', 'PROC6', 'PROC9', 'PROC1R', 'PROC5', 'PROC93D', 'PROC7R', 'EXAM', 'FOOT', 'PROC4R', 'CATSCAN', 'PROC43D', 'PROC22D', 'PROC12D', 'PREGTEST', 'HIVTEST'

In [4]:
# Concatenate all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)
print('The shape of the concatenated DataFrame is:')
print(df.shape)
print()

# Drop the columns with all NaN values
print('The shape of the DataFrame after dropping columns with all NaN values is:')
df = df.dropna(axis=1, how='all')
print(df.shape)

The shape of the concatenated DataFrame is:
(203988, 544)

The shape of the DataFrame after dropping columns with all NaN values is:
(203988, 532)


### 2 - Check the distribution of NaN values
To get an idea of how to clean the columns

In [5]:
# Group by the `file` column and count the number of NaN values in each group
# Drop the columns with zero NaN values
# Transpose the resulting DataFrame and add a sum column
# Sort the DataFrame by the sum column
nan_counts = df.groupby('file').apply(lambda x: x.isna().sum(), include_groups=False)
nan_counts = nan_counts.loc[:, (nan_counts != 0).any(axis=0)]
nan_counts = nan_counts.T
nan_counts['sum'] = nan_counts.sum(axis=1)
nan_counts = nan_counts.sort_values(by='sum', ascending=False)
print(nan_counts)

file      opd2006.csv  opd2007.csv  opd2008.csv  opd2009.csv  opd2010.csv  \
COMSTAT8        33241        31864        31054        30433        31182   
CONTSUB8        33241        31864        31054        30433        31182   
PRESCR8         33241        31864        31054        30433        31182   
PRESCR7         32465        30844        30050        29361        30000   
CONTSUB7        32465        30844        30050        29361        30000   
...               ...          ...          ...          ...          ...   
URBANRUR            0            0            0            0            0   
PBAMORER            0            0            0            0            0   
CCS                 0            0            0            0            0   
HINCOMER            0            0            0            0            0   
PASTVIS          1664            0            0            0            0   

file      opd2011.csv     sum  
COMSTAT8        28841  186615  
CONTSUB8   

In [6]:
# Check the percentage of NaN values in each column in each group
nan_perc = df.groupby('file').apply(lambda x: x.isna().mean(), include_groups=False)
nan_perc = nan_perc.loc[:, (nan_perc != 0).any(axis=0)]
nan_perc = nan_perc.T
nan_perc['sum'] = nan_perc.sum(axis=1) / len(dfs)
nan_perc = nan_perc.sort_values(by='sum', ascending=False)
print(nan_perc)

file      opd2006.csv  opd2007.csv  opd2008.csv  opd2009.csv  opd2010.csv  \
PRESCR8      0.946902     0.924318     0.915831     0.907067     0.898151   
COMSTAT8     0.946902     0.924318     0.915831     0.907067     0.898151   
CONTSUB8     0.946902     0.924318     0.915831     0.907067     0.898151   
CONTSUB7     0.924797     0.894729     0.886222     0.875115     0.864105   
COMSTAT7     0.924797     0.894729     0.886222     0.875115     0.864105   
...               ...          ...          ...          ...          ...   
PBAMORER     0.000000     0.000000     0.000000     0.000000     0.000000   
URBANRUR     0.000000     0.000000     0.000000     0.000000     0.000000   
CCS          0.000000     0.000000     0.000000     0.000000     0.000000   
OPDWT        0.993306     0.000000     0.000000     0.000000     0.000000   
PASTVIS      0.047401     0.000000     0.000000     0.000000     0.000000   

file      opd2011.csv       sum  
PRESCR8      0.894766  0.914506  
COMSTAT

In [7]:
# Check the columns that have all NaN values in some file 
# but less than 50% NaN values in the rest of the files
nan_perc.drop('sum', axis=1)[(nan_perc == 1).any(axis=1)]

file,opd2006.csv,opd2007.csv,opd2008.csv,opd2009.csv,opd2010.csv,opd2011.csv
EWHOOTHO,1.0,1.0,1.0,1.0,1.0,0.0
EXCHSUM4O,1.0,1.0,1.0,1.0,1.0,0.0
EXCHSUM3O,1.0,1.0,1.0,1.0,1.0,0.0
EXCHSUM2O,1.0,1.0,1.0,1.0,1.0,0.0
EXCHSUM1O,1.0,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...
PCTPOVR,0.0,0.0,0.0,0.0,0.0,1.0
HINCOMER,0.0,0.0,0.0,0.0,0.0,1.0
PBAMORER,0.0,0.0,0.0,0.0,0.0,1.0
URBANRUR,0.0,0.0,0.0,0.0,0.0,1.0
