In [1]:
import json, os, collections, csv
# Relative path to the json_data directory under data/raw.
# NOTE(review): the validation cells below read from '../../data/raw/...' (one level
# higher than this path) — confirm which relative depth is the intended one.
directory_for_json_data = '../data/raw/json_data'

# Data Cleaning

## Handling Missing Values

Stock Prices

Our geocoding Jupyter notebook in /pipelines has already flagged the files that have invalid profiles or empty locations. In this notebook we resolve the following invalid files in /company_profile and /json_data: ['AIMAU.json', 'BRK.B.json', 'AGM.A.json', 'ALCY.json', 'CRD.A.json', 'CRD.B.json', 'AACT.json', 'GODN.json', 'WHF.json', 'CATC.json', 'MSAC.json', 'BMO.json', 'OAKU.json']


## Company Profile

In [2]:
# Traverse data in /company_profile
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept so that
# any other cell reading it keeps working.
dir = '../../data/raw/company_profile'
# Sanity check: fail fast if the directory does not hold the expected 1178 files.
assert len(os.listdir(dir)) == 1178

# List of valid files && files with missing data
file_to_missing_data = collections.defaultdict(list)  # key: file name, value: list of missing data
file_to_duplicate_data = collections.defaultdict(list)  # key: file name, value: list of duplicate data
file_to_empty_fields = collections.defaultdict(list)  # key: file name, value: list of empty fields
valid_files = []

# Fields every company profile must provide
required_profile_fields = frozenset({
    "address1", "city", "zip", "country", "industry",
    "industryDisp", "sector", "longBusinessSummary", "maxAge", "latitude", "longitude",
})


def load_profile_with_duplicates(path):
    """Parse the JSON file at `path` and report repeated top-level keys.

    `json.load` keeps only the last value of a repeated key, so duplicates cannot
    be found by looking at the decoded dict. The decoder's `object_pairs_hook`
    receives the raw key/value pairs before they are collapsed into a dict.

    Args:
        path: location of the JSON file to read.

    Returns:
        A `(profile, duplicate_keys)` tuple: the decoded document and the list of
        keys that appear more than once in the top-level object (empty if none).
    """
    duplicate_keys = []

    def collect_pairs(pairs):
        # The hook fires for nested objects first and for the top-level object last,
        # so resetting on every call leaves only the top-level duplicates behind.
        duplicate_keys.clear()
        seen_keys = set()
        for key, _value in pairs:
            if key in seen_keys:
                duplicate_keys.append(key)
            else:
                seen_keys.add(key)
        return dict(pairs)

    # The context manager closes the handle even if parsing fails.
    with open(path, 'r') as profile_file:
        profile = json.load(profile_file, object_pairs_hook=collect_pairs)
    return profile, duplicate_keys


# Check for missing data, duplicate data, empty fields
for filename in os.listdir(dir):
    profile, duplicate_keys = load_profile_with_duplicates(os.path.join(dir, filename))

    # Check for missing data; incomplete profiles are reported and not inspected further
    missing_fields = required_profile_fields.difference(profile)
    if missing_fields:
        # Sorted so the report is stable from run to run
        file_to_missing_data[filename] = sorted(missing_fields)
        continue

    # Check for duplicate data (only record files that actually have duplicates,
    # so len(file_to_duplicate_data) stays a count of affected files)
    if duplicate_keys:
        file_to_duplicate_data[filename] = duplicate_keys

    # Check if the file has empty fields
    empty_fields = [key for key in profile if profile[key] == ""]
    for key in empty_fields:
        print("Empty field in file:", filename, "for key:", key)
    if empty_fields:
        file_to_empty_fields[filename] = empty_fields

    # Valid files: nothing missing (guaranteed here), nothing duplicated, nothing empty
    if not duplicate_keys and not empty_fields:
        valid_files.append(filename)

print("Number of valid files:", len(valid_files))
print("Number of files with missing data:", len(file_to_missing_data))
print("Number of files with duplicate data:", len(file_to_duplicate_data))
print("Number of files with empty fields:", len(file_to_empty_fields))

print(file_to_missing_data)


Number of valid files: 1141
Number of files with missing data: 37
Number of files with duplicate data: 0
Number of files with empty fields: 0
defaultdict(<class 'list'>, {'AIMAU.json': ['sector', 'latitude', 'city', 'longBusinessSummary', 'country', 'longitude', 'address1', 'zip', 'industryDisp', 'industry', 'maxAge'], 'GBRG.json': ['zip'], 'MEGL.json': ['zip'], 'AVAL.json': ['zip'], 'BTWN.json': ['zip'], 'MLAC.json': ['zip'], 'LU.json': ['zip'], 'BMA.json': ['zip'], 'BRK.B.json': ['sector', 'latitude', 'city', 'longBusinessSummary', 'country', 'longitude', 'address1', 'zip', 'industryDisp', 'industry', 'maxAge'], 'AGM.A.json': ['sector', 'latitude', 'city', 'longBusinessSummary', 'country', 'longitude', 'address1', 'zip', 'industryDisp', 'industry', 'maxAge'], 'BSAQ.json': ['zip'], 'ALCY.json': ['sector', 'latitude', 'city', 'longBusinessSummary', 'country', 'longitude', 'address1', 'zip', 'industryDisp', 'industry', 'maxAge'], 'BSAC.json': ['zip'], 'BLX.json': ['zip'], 'CRD.A.json': 

## Financial Statements

## Stock_Prices

In [3]:
# Traverse data in /stock_prices
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept so that
# any other cell reading it keeps working.
dir = '../../data/raw/stock_prices'
# Sanity check: fail fast if the directory does not hold the expected 1178 files.
assert len(os.listdir(dir)) == 1178

# List of valid files && files with missing data
file_to_missing_data = collections.defaultdict(list)  # key: file name, value: list of missing data
file_to_duplicate_data = collections.defaultdict(list)  # key: file name, value: list of duplicate data
file_to_empty_fields = collections.defaultdict(list)  # key: file name, value: list of empty fields
valid_files = []

# Columns every stock price file must provide
required_price_columns = frozenset(
    {"symbol", "date", "open", "high", "low", "close", "volume", "adjclose"})

# Check for missing data, duplicate data, empty fields
for filename in os.listdir(dir):
    # newline='' lets the csv module handle line endings inside quoted fields itself
    with open(os.path.join(dir, filename), 'r', newline='') as file:
        csv_reader = csv.DictReader(file)
        # Raw header as written in the file (None for a completely empty file).
        # Unlike the row dicts, this list still contains repeated column names.
        header = csv_reader.fieldnames or []

        # A file without any data row is reported as missing every required column
        row = next(csv_reader, None)
        if row is None:
            file_to_missing_data[filename] = sorted(required_price_columns)
            continue

        # Check for missing data; incomplete files are reported and not inspected further
        missing_columns = required_price_columns.difference(header)
        if missing_columns:
            # Sorted so the report is stable from run to run
            file_to_missing_data[filename] = sorted(missing_columns)
            continue

        # Check for duplicate data: DictReader collapses repeated column names into a
        # single dict key, so duplicates have to be looked up in the header itself
        seen_columns = set()
        duplicate_columns = []
        for column in header:
            if column in seen_columns:
                duplicate_columns.append(column)
            else:
                seen_columns.add(column)
        if duplicate_columns:
            file_to_duplicate_data[filename] = duplicate_columns

        # Check if the file has empty fields: scan every data row (not only the first)
        # and report each affected column once per file
        empty_columns = []
        while row is not None:
            for key, value in row.items():
                if value == "" and key not in empty_columns:
                    empty_columns.append(key)
                    print("Empty field in file:", filename, "for key:", key)
            row = next(csv_reader, None)
        if empty_columns:
            file_to_empty_fields[filename] = empty_columns

        # Valid files: nothing missing (guaranteed here), nothing duplicated, nothing empty
        if not duplicate_columns and not empty_columns:
            valid_files.append(filename)

print("Number of valid files:", len(valid_files))
print("Number of files with missing data:", len(file_to_missing_data))
print("Number of files with duplicate data:", len(file_to_duplicate_data))
print("Number of files with empty fields:", len(file_to_empty_fields))

print(file_to_missing_data)

Number of valid files: 1096
Number of files with missing data: 82
Number of files with duplicate data: 0
Number of files with empty fields: 0
defaultdict(<class 'list'>, {'ALCY.csv': ['volume', 'date', 'adjclose', 'low', 'open', 'close', 'high', 'symbol'], 'BRK.B.csv': ['volume', 'date', 'adjclose', 'low', 'open', 'close', 'high', 'symbol'], 'FORL.csv': ['volume', 'date', 'adjclose', 'low', 'open', 'close', 'high', 'symbol'], 'RWOD.csv': ['volume', 'date', 'adjclose', 'low', 'open', 'close', 'high', 'symbol'], 'GLST.csv': ['volume', 'date', 'adjclose', 'low', 'open', 'close', 'high', 'symbol'], 'MSAC.csv': ['volume', 'date', 'adjclose', 'low', 'open', 'close', 'high', 'symbol'], 'VMCA.csv': ['volume', 'date', 'adjclose', 'low', 'open', 'close', 'high', 'symbol'], 'PTHR.csv': ['volume', 'date', 'adjclose', 'low', 'open', 'close', 'high', 'symbol'], 'MEGL.csv': ['volume', 'date', 'adjclose', 'low', 'open', 'close', 'high', 'symbol'], 'TENK.csv': ['volume', 'date', 'adjclose', 'low', 'ope