In [14]:
!pip install pathlib2
import pandas as pd
import numpy as np
import nltk
import os
from pathlib2 import Path
import re
import shutil
import ProjectDirectory as directory


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [15]:
def clean_filing(input_filename, filing_type, output_filename):
    """
    Cleans a 10-K or 10-Q filing. All arguments take strings as input
    input_filename: name of the file to be cleaned
    filing_type: either 10-K or 10-Q
    outuput_filename: name of output file
    """
    
    # open file and get rid of all lines 
    with open (input_filename, 'r') as f:
        data = f.read().replace('\n', ' ')
    
    # get text in between the appropriate 10-K tags
    search_10k = re.search("(?s)(?m)<TYPE>{}.*?(</TEXT>)".format(filing_type), data)
    try:
        data_processed = search_10k.group(0)
    
        # delete formatting text used to identify 10-K section as its not relevant
        data_processed = re.sub(pattern="((?i)<TYPE>).*?(?=<)", repl='', string=data_processed)

        # Five more formatting tags are deleted
        data_processed = re.sub(pattern="((?i)<SEQUENCE>).*?(?=<)", repl='', string=data_processed)
        data_processed = re.sub(pattern="((?i)<FILENAME>).*?(?=<)", repl='', string=data_processed)
        data_processed = re.sub(pattern="((?i)<DESCRIPTION>).*?(?=<)", repl='', string=data_processed)
        data_processed = re.sub(pattern="(?s)(?i)<head>.*?</head>", repl='', string=data_processed)
        data_processed = re.sub(pattern="(?s)(?i)<(table).*?(</table>)", repl='', string=data_processed)

        # Tags each section of the financial statement with prefix '°Item' for future analysis
        data_processed = re.sub(pattern="(?s)(?i)(?m)> +Item|>Item|^Item", repl=">Â°Item", string=data_processed, count=0)

        # Removes all HTML tags
        data_processed = re.sub(pattern="(?s)<.*?>", repl=" ", string=data_processed, count=0)

        # Replaces all Unicode strings
        data_processed = re.sub(pattern="&(.{2,6});", repl=" ", string=data_processed, count=0)

        # Replaces multiple spaces with a single space
        data_processed = re.sub(pattern="(?s) +", repl=" ", string=data_processed, count=0)

        with open(output_filename, 'w') as output:
            output.write(data_processed)
            
    except BaseException as e:
        print('{} could not be cleaned. Exception: {}'.format(input_filename, e))
        pass

In [20]:
def clean_all_filings():
    """Clean all filings in sec-filings directory"""
    
    project_dir = directory.get_project_dir()
    company_list = os.listdir(os.path.join(project_dir, 'sec-filings-downloaded'))  

    for company in company_list:
        try:
            company_dir = os.path.join(project_dir, 'sec-filings-downloaded', company)
            os.chdir(company_dir) # abs path to each company directory
            
            print('***Cleaning: {}***'.format(company))
            for file in os.listdir():  # iterate through all files in the respective company directory
                
                # cleaning files
                if file.startswith('cleaned'): 
                    continue
                
                if file.endswith('10-K'): filing_type = '10-K'
                else: filing_type = '10-Q'
                
                if file.endswith('10-K') or file.endswith('10-Q'):
                    clean_filing(input_filename=file, filing_type=filing_type, output_filename='cleaned_' + str(file))
                    print('{} filing cleaned'.format(file))
        except:
            continue

In [23]:
def rename_10_Q_filings():
    """Rename 10Q filigns to include the quarter of the filing in the filing name"""
    
    project_dir = directory.get_project_dir()
    company_list = os.listdir(os.path.join(project_dir, 'sec-filings-downloaded'))  
    
    for company in company_list:
        try:
            company_dir = os.path.join(project_dir, 'sec-filings-downloaded', company)
            os.chdir(company_dir)
            
            print('***{}***'.format(company))
            for file in os.listdir():
                if file.startswith('cleaned_filings') or file.startswith('cleaned_Q'): 
                    continue
                    
                if file.startswith('cleaned') and file.endswith('10-Q'):
                    get_date = file[8:18]
                    get_year = file[8:12]
                    get_month = int(file[13:15])

                    if get_month >= 1 and get_month <= 5:
                        filing_quarter = 'Q1'
                    elif get_month >= 6 and get_month <= 8:
                        filing_quarter = 'Q2'
                    else:
                        filing_quarter = 'Q3'

                    os.rename(file, ('cleaned_'+str(filing_quarter)+'_'+str(get_date)+'_'+'10-Q'))
                    print('{} renamed'.format(file))
                
                else:
                    print('{} not renamed'.format(file))
        except:
            continue

In [26]:
def move_10k_10q_to_folder():
    """Move filings to the appropriate folders in each company directory"""
    
    project_dir = directory.get_project_dir()
    
    company_list = os.listdir(os.path.join(project_dir, 'sec-filings-downloaded'))  

    for company in company_list:  
        try:  
        # make directory of cleaned files
            cleaned_files_dir = os.path.join(project_dir, 'sec-filings-downloaded', company, 'cleaned_filings')
            if not os.path.exists(cleaned_files_dir): os.makedirs(cleaned_files_dir)
            
            company_dir = os.path.join(project_dir, 'sec-filings-downloaded', company)
            os.chdir(company_dir) # abs path to each company directory    
            
            print('***{}***'.format(company))
            for file in os.listdir():
                if file.startswith('cleaned_filings'): continue  # cleaned_filings directory
                if file.startswith('clean') and (file.endswith('10-Q') or file.endswith('10-K')):
                    try:
                        shutil.move(os.path.join(company_dir, file), os.path.join(cleaned_files_dir, file))
                        print('{} moved to cleaned files folder'.format(file))
                    except Exception as e:
                        os.remove(os.path.join(cleaned_files_dir, file))
                        shutil.move(os.path.join(company_dir, file), os.path.join(cleaned_files_dir, file))
                        print('{} moved to cleaned files folder'.format(file))
        except:
            continue

In [21]:
clean_all_filings()

***Cleaning: TESLA MOTORS INC***
2014-05-09_10-Q could not be cleaned. Exception: 'NoneType' object has no attribute 'group'
2014-05-09_10-Q filing cleaned
2014-11-07_10-Q could not be cleaned. Exception: 'NoneType' object has no attribute 'group'
2014-11-07_10-Q filing cleaned
2015-05-11_10-Q could not be cleaned. Exception: 'NoneType' object has no attribute 'group'
2015-05-11_10-Q filing cleaned
2015-11-05_10-Q could not be cleaned. Exception: 'NoneType' object has no attribute 'group'
2015-11-05_10-Q filing cleaned
2014-02-26_10-K could not be cleaned. Exception: 'NoneType' object has no attribute 'group'
2014-02-26_10-K filing cleaned
2016-08-05_10-Q could not be cleaned. Exception: 'NoneType' object has no attribute 'group'
2016-08-05_10-Q filing cleaned
2015-08-07_10-Q could not be cleaned. Exception: 'NoneType' object has no attribute 'group'
2015-08-07_10-Q filing cleaned
2016-05-10_10-Q could not be cleaned. Exception: 'NoneType' object has no attribute 'group'
2016-05-10_10-

In [24]:
rename_10_Q_filings()

***TESLA MOTORS INC***
2014-05-09_10-Q not renamed
2014-11-07_10-Q not renamed
2015-05-11_10-Q not renamed
2015-11-05_10-Q not renamed
2014-02-26_10-K not renamed
2016-08-05_10-Q not renamed
2015-08-07_10-Q not renamed
2016-05-10_10-Q not renamed
2015-02-26_10-K not renamed
2014-08-08_10-Q not renamed
2016-11-02_10-Q not renamed
2016-02-24_10-K not renamed
***Tesla, Inc.***
2018-11-02_10-Q not renamed
2017-05-10_10-Q not renamed
2017-08-04_10-Q not renamed
2017-11-03_10-Q not renamed
2018-05-07_10-Q not renamed
2018-02-23_10-K not renamed
2018-08-06_10-Q not renamed
2019-02-19_10-K not renamed
2017-03-01_10-K not renamed
***AMAZON COM INC***
2015-10-23_10-Q not renamed
2018-07-27_10-Q not renamed
2014-01-31_10-K not renamed
2019-02-01_10-K not renamed
2016-07-29_10-Q not renamed
2014-04-25_10-Q not renamed
2018-04-27_10-Q not renamed
2018-02-02_10-K not renamed
2016-04-29_10-Q not renamed
2014-07-25_10-Q not renamed
2016-01-29_10-K not renamed
2017-02-10_10-K not renamed
2015-01-30_10-

In [27]:
move_10k_10q_to_folder()

***TESLA MOTORS INC***
***Tesla, Inc.***
***AMAZON COM INC***
***AMERICAN EXPRESS CO***
***NETFLIX INC***
***APPLE INC***
***Facebook Inc***
***General Motors Financial Company, Inc.***
***MICROSOFT CORP***
