# Objective
---------

This notebook demonstrates how to extract data from US SEC EDGAR filing reports. After scraping, text analysis is performed to derive sentiment opinion, sentiment scores, readability, passive words, personal pronouns, etc. 

# Metrics
------
- **Positive Score**: This score is calculated by assigning the value of +1 for each word if found in the Positive Dictionary and then adding up all the values.
- **Negative Score**: This score is calculated by assigning the value of -1 for each word if found in the Negative Dictionary and then adding up all the values. We multiply the score with -1 so that the score is a positive number.
- **Polarity Score**: $\frac{\text{Positive Score} - \text{Negative Score}}{(\text{Positive Score} + \text{Negative Score}) + 0.000001}$
- **Subjective Score**: $\frac{\text{(Positive Score + Negative Score)}}{\text{((Total Words after cleaning) + 0.000001)}}$
- **Average Sentence Length**: $\frac{\text{No. of Words}}{\text{No. of Sentences}}$
- **% Complex Words**: $\frac{\text{The number of complex words}}{\text{The number of words}} \times 100$
- **Fog Index**: $0.4 * \text{(Average Sentence Length + % Complex words)}$
- **Syllable Count**: Personal Pronouns are considered for counting. Special care is taken so that the country name US is not included in the list.

## Libraries

In [1]:
import  sys, os, glob
import  re
import  numpy as np
import  pandas as pd
import  bs4 as bs
import  requests 
import  spacy # NLP
from    spacy_syllables import SpacySyllables # Syllables
from    pathlib     import Path
from    string import punctuation

### Custom NLP Model

In [2]:
# Load spaCy's small English pipeline; every later cell tokenises text through this object.
nlp = spacy.load('en_core_web_sm') # Custom Model

# Register the spacy_syllables component so tokens expose `token._.syllables_count`.
nlp.add_pipe('syllables', after='tagger') # Model for Syllable identification

<spacy_syllables.SpacySyllables at 0x20bb351fc88>

### Constants

In [1]:
# Texts with at least this many characters take the long-document path in get_sentiments.
TEXT_LIMIT = 1_000_000 # Default SpaCy limit

# Root URL of the SEC archive (presumably prefixed to SECFNAME values when scraping -- verify against caller).
URLArchive = "https://www.sec.gov/Archives/"

# Small epsilon: added to the polarity denominator and returned as a placeholder count for empty texts.
CONSTANT = 0.000_001

## Constant Factory

- Create List of the following Constants:
    - Positive Words 
    - Negative Words
    - Stop Words (_Auditors_, _Currencies_, _Dates and Numbers_, _Generic_, _Generic Long_, _Geographic_, _Names_)
    - Constraining Words
    - Uncertainty Words

## StopWords

- Scraped Stopwords from the [Official Source](https://sraf.nd.edu/textual-analysis/resources/ )
- Saved the word lists in the folder _SentimentWordList_
- Collected __positive__ and __negative__ words for creating a StopWords Corpus.
- Used Similar approach to create a corpus of __constraint__ and __uncertainty__ words.

In [3]:
# Locate the sentiment workbook anywhere below the working directory;
# the first match in sorted order is used.
path = sorted(Path('.').glob(f"**/*SentimentWordList*")) # Stop Words file Path
sentiment_file = Path.cwd() / Path(path[0])

# Both sheets are read without a header row and only their first column is kept.
negs, pos = (
    pd.read_excel(io=sentiment_file, sheet_name=sheet, header=None).iloc[:, 0].values
    for sheet in ('Negative', 'Positive')
)

# Lower-cased copies, compared against `token.lower_` during analysis.
neg_words = [entry.lower() for entry in negs]
pos_words = [entry.lower() for entry in pos]

In [4]:
def createStopWords():
    """Collect stop words from every ``stopwords*.txt`` file below the working directory.

    Each line contributes the text that precedes its first ``|``; whatever
    follows the pipe is discarded. Surrounding whitespace is stripped so that
    an entry written as ``"WORD | note"`` yields ``"WORD"`` rather than
    ``"WORD "``, and blank entries are skipped.

    Returns:
    -------
    list of str: stop words in file order (files visited in sorted path
        order), with their original letter case.

    Raises:
    ------
    FileNotFoundError: when no matching file exists under the working directory.
    """
    pathStopWords = sorted(Path('.').glob(f"**/stopwords*.txt")) # Stop Words file Path
    if not pathStopWords:
        raise FileNotFoundError(f'StopWords related files are not in the {Path.cwd()} and its subdirectories')

    stop_words = []
    for filePath in pathStopWords:
        fullPath = Path.cwd()/Path(filePath)
        # Undecodable bytes are dropped rather than aborting the whole load.
        with open(fullPath, mode='r', encoding='utf8', errors='ignore') as file:
            for line in file:
                word = line.split('|')[0].strip()
                if word:
                    stop_words.append(word)
    return(stop_words)

In [5]:
# Gather every stop word from the text files and normalise to lower case.
stopWordList = [entry.lower() for entry in createStopWords()]
# Empty spaCy's built-in stop-word set, then install the custom one (blank entries excluded).
nlp.Defaults.stop_words -= nlp.Defaults.stop_words # Remove Default Stop Words
nlp.Defaults.stop_words = set(filter(lambda entry: entry != "", stopWordList))

In [6]:
# Constraining words: first column of the first sheet (row 0 is treated as the header),
# lower-cased for comparison against `token.lower_`.
path_constraint = sorted(Path('.').glob(f"**/*constraining*"))
constraining_file = Path.cwd() / Path(path_constraint[0])
constraints = [
    term.lower()
    for term in pd.read_excel(io=constraining_file, sheet_name=0, header=0).iloc[:, 0].values
]

# Uncertainty words: same layout and treatment as the constraining list.
path_uncertainty = sorted(Path('.').glob(f"**/*uncertainty*"))
uncertainty_file = Path.cwd() / Path(path_uncertainty[0])
uncertainty = [
    term.lower()
    for term in pd.read_excel(io=uncertainty_file, sheet_name=0, header=0).iloc[:, 0].values
]

## Helper Functions

- Perform data cleaning, formatting and NLP tasks.

In [7]:
def replaceHTMLTags(text):
    """Return ``text`` with HTML tags and punctuation removed.

    Anything of the form ``<...>`` is deleted first; afterwards every
    character that is neither a word character nor whitespace is deleted.
    Nothing is inserted in place of the removed text.
    """
    without_tags = re.sub("<[^>]*>", "", text)
    without_punctuation = re.sub(r'[^\w\s]', "", without_tags)
    return without_punctuation

In [8]:
# For Data Scraping

def save_text(url):
    """Download ``url`` and save the response body as a text file under ``raw/``.

    The file is named after the last path segment of ``url`` and placed in the
    ``raw`` directory below the current working directory (created if missing).

    Params:
    ------
    url: str, address of the document to download.

    Returns:
    -------
    ``url`` when the document could not be fetched or saved (HTTP status above
    200, request error, undecodable body, or file-system error), so callers can
    collect failures; ``None`` when the file was written.
    """
    file_name = Path.cwd()/Path("raw/"+url.split('/')[-1]) # Save .txt file in raw folder
    try:
        data = requests.get(url, timeout = 10) # Standard Timeout according to SEC.gov
    except requests.RequestException as err:
        # Best-effort download: report the failure instead of hiding it.
        print(f"save_text: request failed for {url}: {err}")
        return(url)

    if data.status_code > 200:
        return(url)

    try:
        text = data.content.decode('utf-8')
        file_name.parent.mkdir(parents=True, exist_ok=True)
        # NOTE(review): the file is written with the platform default encoding,
        # the same default read_from_txt reads with; switch both together if an
        # explicit encoding is wanted.
        with open(file_name, 'w') as f:
            f.write(text)
    except (UnicodeError, OSError) as err:
        print(f"save_text: could not save {url}: {err}")
        return(url)

In [9]:
def read_from_txt(secfname):
    """
    Return the contents of the local text file that corresponds to SECFNAME.

    Only the last ``/``-separated segment of ``secfname`` is used; the working
    directory is searched recursively for a file with that name and the first
    match (in sorted path order) is read.

    Params:
    ------
    secfname: str, SECFNAME column value.
    """
    base_name = secfname.rsplit('/', 1)[-1]
    matches = sorted(Path('.').glob(f"**/{base_name}"))
    first_match = matches[0]
    with open(first_match) as handle:
        return handle.read()

In [15]:
def get_sentiments(TEXT):
    """Function to get various counts in a text.

    Params:
    ------
    TEXT: str, Input text

    Returns:
    -------
    List[count_pos_sents, count_neg_sents, total_complex_words,
        total_words, total_sents, total_const, total_uncertain]:

        count_pos_sents: int, No. of tokens found in the positive word list
        count_neg_sents: int, No. of tokens found in the negative word list
        total_complex_words: int, No. of tokens with more than two syllables
        total_words: int, No. of tokens that are not custom stop words
        total_sents: int, No. of sentences reported by the spaCy pipeline
        total_const: int, No. of tokens found in the constraining word list
        total_uncertain: int, No. of tokens found in the uncertainty word list

    For an empty TEXT all seven entries are CONSTANT, so that ratios computed
    from them do not divide by zero.
    """
    print("startint sentiment analysis...")
    if not TEXT:
        return([CONSTANT]*7) # To avoid ZeroDivisionError

    if len(TEXT) < TEXT_LIMIT:
        doc = nlp(TEXT)
    else:
        # Raise spaCy's length guard for this document and skip NER.
        nlp.max_length = len(TEXT) + 1
        doc = nlp(TEXT, disable = ['ner'])
        print("document loaded...")

    # Build hash sets once per call: membership tests inside the token loop
    # become O(1) instead of scanning the word lists for every token.
    positive_lookup = set(pos_words)
    negative_lookup = set(neg_words)
    constraint_lookup = set(constraints)
    uncertainty_lookup = set(uncertainty)
    stop_lookup = nlp.Defaults.stop_words

    count_pos_sents = 0
    count_neg_sents = 0
    total_complex_words = 0
    total_words = 0
    total_const = 0
    total_uncertain = 0
    for token in doc:
        word = token.lower_
        # Positive Word Count
        if word in positive_lookup:
            count_pos_sents += 1
        # Negative Word Count
        if word in negative_lookup:
            count_neg_sents += 1
        # Complex Word Count (more than two syllables)
        syllables = token._.syllables_count
        if syllables is not None and syllables > 2:
            total_complex_words += 1
        # Total Words
        # NOTE(review): every non-stop-word token is counted, which presumably
        # includes whitespace-only tokens -- confirm this is intended.
        if word not in stop_lookup:
            total_words += 1
        # Count Constraints
        if word in constraint_lookup:
            total_const += 1
        # Count uncertainty
        if word in uncertainty_lookup:
            total_uncertain += 1

    # Total Sentences
    total_sents = sum(1 for _ in doc.sents)
    return([count_pos_sents, count_neg_sents, total_complex_words,
            total_words, total_sents, total_const, total_uncertain])


## Data Loading

In [17]:
# Load the filing index; its SECFNAME column is used later to locate each filing's text file.
data =  pd.read_excel('cik_list.xlsx', sheet_name='cik_list_ajay', header=0)

## Text Mining Pipeline
---------    
- Reads the TEXT file into the SpaCy's vectorized format.
- Removes possible HTML related tags and punctuations.
- Performs Sentiment analysis: calculates, positive, negative and polarity score
- Calculates Complex Word count
- Calculates Total Word count
- Calculates Uncertainty count
- Calculates Constraining count.
- Returns the results as pandas columns

In [18]:
def text_mining_pipeline(secfname):
    """Read the filing for ``secfname``, strip tags and punctuation, and return its counts."""
    raw_text = read_from_txt(secfname)
    cleaned_text = replaceHTMLTags(raw_text)
    return get_sentiments(cleaned_text)

In [19]:
# Calculated Positivity Score, Negativity Score, Complex Word Count, Word Count, Sentence Count,
# Constraining Score and Uncertainty Score.
# get_sentiments returns its counts in the order
#   positive, negative, complex words, words, sentences, constraining, uncertainty,
# so the target columns must be listed in that same order (constraining BEFORE uncertainty);
# listing them the other way round stores each count under the other's name.
# 'sentence_length' holds the number of sentences.
data[['positive_score','negative_score','complex_word_count','word_count','sentence_length',\
    'constraining_score', 'uncertainty_score']] = data.apply(lambda x: text_mining_pipeline(x['SECFNAME']), axis=1).apply(pd.Series)

startint sentiment analysis...
document loaded...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment analysis...
startint sentiment a

In [20]:
# Polarity = (positive - negative) / (positive + negative + CONSTANT), computed column-wise.
score_difference = data['positive_score'] - data['negative_score']
score_total = data['positive_score'] + data['negative_score'] + CONSTANT
data['polarity_score'] = score_difference / score_total

In [21]:
# Calculated Average Sentence Length: words per sentence
# ('sentence_length' holds the sentence count).
# Column-wise division, like the fog_index cell: a zero denominator yields
# inf/NaN for that row instead of a ZeroDivisionError that aborts the column.
data['average_sentence_length'] = data['word_count'] / data['sentence_length']

In [32]:
# Calculated % Complex Words: complex words as a percentage of counted words.
# Column-wise division, like the fog_index cell: a zero word count yields
# inf/NaN for that row instead of a ZeroDivisionError that aborts the column.
data['percentage_of_complex_words'] = (data['complex_word_count'] / data['word_count']) * 100

In [23]:
# Fog Index = 0.4 * (average sentence length + percentage of complex words)
data['fog_index'] = (data['average_sentence_length'] + data['percentage_of_complex_words']) * 0.4

In [24]:
# Calculated Positive Word Proportion: positive words per counted word.
# Column-wise division, like the fog_index cell: a zero word count yields
# inf/NaN for that row instead of a ZeroDivisionError that aborts the column.
data['positive_word_proportion'] = data['positive_score'] / data['word_count']

In [26]:
# Calculated Negative Word Proportion: negative words per counted word.
# Column-wise division, like the fog_index cell: a zero word count yields
# inf/NaN for that row instead of a ZeroDivisionError that aborts the column.
data['negative_word_proportion'] = data['negative_score'] / data['word_count']

In [27]:
# Calculated Uncertainty Word Proportion: value of 'uncertainty_score' per counted word.
# Column-wise division, like the fog_index cell: a zero word count yields
# inf/NaN for that row instead of a ZeroDivisionError that aborts the column.
data['uncertainty_word_proportion'] = data['uncertainty_score'] / data['word_count']

In [28]:
# Calculated Constraining Word Proportion: value of 'constraining_score' per counted word.
# Column-wise division, like the fog_index cell: a zero word count yields
# inf/NaN for that row instead of a ZeroDivisionError that aborts the column.
data['constraining_word_proportion'] = data['constraining_score'] / data['word_count']

In [29]:
# Sum of 'constraining_score' over all filings; the single total is repeated on every row.
data['constraining_words_whole_report'] = data['constraining_score'].sum()

### Data Saving

In [31]:
# Columns written to result.csv, in output order (the row index is not written).
output_columns = [
    'CIK', 'CONAME', 'FYRMO', 'FDATE', 'FORM', 'SECFNAME',
    'positive_score', 'negative_score', 'polarity_score',
    'average_sentence_length', 'percentage_of_complex_words', 'fog_index',
    'complex_word_count', 'word_count',
    'uncertainty_score', 'constraining_score',
    'positive_word_proportion', 'negative_word_proportion',
    'uncertainty_word_proportion', 'constraining_word_proportion',
    'constraining_words_whole_report',
]
data.to_csv(
    path_or_buf=Path.cwd()/Path('result.csv'),
    columns=output_columns,
    index=False,
)