In [1]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import os
import types
from collections import defaultdict

In [3]:
# show files
path = 'data/parsed_reddit/'
files = os.listdir(path)
files

['2020-06-01_2020-06-08.csv',
 '2020-06-08_2020-06-15.csv',
 '2020-06-15_2020-06-22.csv',
 '2020-06-22_2020-06-29.csv',
 '2020-06-29_2020-07-06.csv',
 '2020-07-06_2020-07-13.csv',
 '2020-07-13_2020-07-20.csv',
 '2020-07-20_2020-07-27.csv',
 '2020-07-27_2020-08-03.csv']

In [4]:
# for each column save column names, number of null values, and data type
df_dict = {}
for f in files:
    
    df = pd.read_csv(os.path.join(path, f))
    
    print('file: {}, Numer of rows: {}'.format(f, len(df)))
    
    file_name, cols, null_count, data_type = [], [], [], []
    for col in df.columns:
        
        # save column name
        cols.append(col)
            
        # null values
        null_count.append(df[col].isnull().sum())
        
        # find data type
        if any([type(i)==float or type(i)==int for i in df[col]]):
            data_type.append('Numerical')
        elif any([type(i)==bool for i in df[col]]):
            data_type.append('Boolean')
        elif any([type(i)==str for i in df[col]]):
            data_type.append('String')
            
        df_dict[f] = pd.DataFrame({'Col Name': cols, 'Null Count': null_count, 'Data Type': data_type}) 

file: 2020-06-01_2020-06-08.csv, Numer of rows: 1872
file: 2020-06-08_2020-06-15.csv, Numer of rows: 1941
file: 2020-06-15_2020-06-22.csv, Numer of rows: 2019
file: 2020-06-22_2020-06-29.csv, Numer of rows: 1919
file: 2020-06-29_2020-07-06.csv, Numer of rows: 1920
file: 2020-07-06_2020-07-13.csv, Numer of rows: 2000
file: 2020-07-13_2020-07-20.csv, Numer of rows: 1989
file: 2020-07-20_2020-07-27.csv, Numer of rows: 1872
file: 2020-07-27_2020-08-03.csv, Numer of rows: 1823


In [5]:
# filter out columns that have more than 400 missing rows, which is about 20% assuming ~1,800-2,000
df_dict_filtered = {}
for k,v in df_dict.items():
    df_dict_filtered[k] = v[v['Null Count']<400]

In [9]:
# find number of col names on each file, after filtering
for k,v in df_dict_filtered.items():
    print('file: {}, col count: {}'.format(k, len(v['Col Name'])))

file: 2020-06-01_2020-06-08.csv, col count: 56
file: 2020-06-08_2020-06-15.csv, col count: 56
file: 2020-06-15_2020-06-22.csv, col count: 56
file: 2020-06-22_2020-06-29.csv, col count: 56
file: 2020-06-29_2020-07-06.csv, col count: 56
file: 2020-07-06_2020-07-13.csv, col count: 56
file: 2020-07-13_2020-07-20.csv, col count: 56
file: 2020-07-20_2020-07-27.csv, col count: 56
file: 2020-07-27_2020-08-03.csv, col count: 56


In [10]:
# find number of null count on each file, after filtering
for k,v in df_dict_filtered.items():
    print('file: {}, sum null count: {}'.format(k, sum(v['Null Count'])))

file: 2020-06-01_2020-06-08.csv, sum null count: 468
file: 2020-06-08_2020-06-15.csv, sum null count: 2282
file: 2020-06-15_2020-06-22.csv, sum null count: 696
file: 2020-06-22_2020-06-29.csv, sum null count: 777
file: 2020-06-29_2020-07-06.csv, sum null count: 745
file: 2020-07-06_2020-07-13.csv, sum null count: 463
file: 2020-07-13_2020-07-20.csv, sum null count: 1056
file: 2020-07-20_2020-07-27.csv, sum null count: 1518
file: 2020-07-27_2020-08-03.csv, sum null count: 103


In [11]:
# display every file summary for each column name, after filtering
for k,v in df_dict_filtered.items():
    print(k, v.shape)
    print(v)
    print('\n')

2020-06-01_2020-06-08.csv (56, 3)
                   Col Name  Null Count  Data Type
0             all_awardings           0     String
1       allow_live_comments           0    Boolean
2                    author           0     String
4     author_flair_richtext          53  Numerical
6         author_flair_type          53  Numerical
7           author_fullname          53  Numerical
8      author_patreon_flair          53  Numerical
9            author_premium          53  Numerical
10                 awarders           0     String
11             can_mod_post           0    Boolean
12             contest_mode           0    Boolean
13              created_utc           0  Numerical
14                   domain           2  Numerical
15                full_link           0     String
16                 gildings           0     String
17                       id           0     String
18         is_crosspostable           0    Boolean
19                  is_meta           0    Boole