# Objective of this assignment is to extract some sections (which are mentioned below) from SEC / EDGAR financial reports and perform text analysis to compute variables. 

In [1]:
import pandas as pd

In [2]:
file = pd.read_excel('cik_list.xlsx')
file.head()

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt
3,3662,SUNBEAM CORP/FL/,199811,1998-11-12,10-K/A,edgar/data/3662/0000950170-98-002145.txt
4,3662,SUNBEAM CORP/FL/,199811,1998-11-16,NT 10-Q,edgar/data/3662/0000950172-98-001203.txt


## Combining the url domain with 'secfname' column to access the financial reports in each link



In [3]:

y = 'https://www.sec.gov/Archives/'
links = [y+x for x in file['SECFNAME']] 

In [5]:
len(links)

152

In [6]:
import requests
import re

In [7]:
from bs4 import BeautifulSoup

## Creating a report list that contains all the text data from the financial reports 

## Each element of the list contains is the text from each financial report that is read form the links

In [8]:


report = []
for url in links:
    r = requests.get(url)
    data=r.text
    words=BeautifulSoup(data,'html.parser')
    report.append(words.get_text())
    
  

In [9]:
from nltk import sent_tokenize, word_tokenize

In [10]:
print(len(report))

152


## Reading the stopwords from the given url and storing it in a variable

In [12]:


with open ('StopWords_GenericLong.txt', 'r') as p:
    stop_words = p.read()
    stop_words = stop_words.split('\n')

In [14]:
len(stop_words)

571

## Reading the Master dictionary from url, which contains the year in which a positive or a negative words were added



In [15]:

MD = pd.read_excel('LoughranMcDonald_MasterDictionary_2018.xlsx')
MD.head()

Unnamed: 0,Word,Sequence Number,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Constraining,Superfluous,Interesting,Modal,Irr_Verb,Harvard_IV,Syllables,Source
0,AARDVARK,1,277,1.480368e-08,1.239377e-08,3.56473e-06,84,0,0,0,0,0,0,0,0,0,0,2,12of12inf
1,AARDVARKS,2,3,1.603287e-10,9.72511e-12,9.863549e-09,1,0,0,0,0,0,0,0,0,0,0,2,12of12inf
2,ABACI,3,8,4.275431e-10,1.386497e-10,6.225591e-08,7,0,0,0,0,0,0,0,0,0,0,3,12of12inf
3,ABACK,4,12,6.413147e-10,3.159061e-10,9.383557e-08,12,0,0,0,0,0,0,0,0,0,0,2,12of12inf
4,ABACUS,5,7250,3.87461e-07,3.681624e-07,3.366553e-05,914,0,0,0,0,0,0,0,0,0,0,3,12of12inf


## Creating a postive dictionary list contains the postive words from the Master dictionary

## Postive words with value 0 in master dictionary, depicts that the word is neutral

In [16]:

pos_dict = [x for x in MD[MD['Positive']!=0]['Word']]
len(pos_dict)

354

## Creating a negative dictionary list contains the negative words from the Master dictionary

## Negative words with value 0 in master dictionary, depicts that the word is neutral

In [17]:


neg_dict =  [x for x in MD[MD['Negative'] != 0]['Word']]

len(neg_dict)

2355

## Reading the uncertainity words from the uncertainity_dictionary excel sheet 

In [18]:


uncertainity = pd.read_excel('uncertainity_dictionary.xlsx')

In [19]:
uncertainity

Unnamed: 0,Word
0,ABEYANCE
1,ABEYANCES
2,ALMOST
3,ALTERATION
4,ALTERATIONS
...,...
292,VARY
293,VARYING
294,VOLATILE
295,VOLATILITIES


In [20]:
uncertainity_words = list(uncertainity['Word'])

## Reading the constraining words from the constraining_dictionary excel sheet 

In [21]:


constraining = pd.read_excel('constraining_dictionary.xlsx')
constraining_words = list(constraining['Word'])


In [22]:
from nltk.tokenize import word_tokenize

## Creating a user defined function to tokenize the text from the url into words

## r'[^A-Za-z]',' ' -> replacing all the substrings other than a-z or A-z with ' ' blank space



In [23]:

def tokenize(text):
    text = re.sub(r'[^A-Za-z]',' ',text.upper())
    tokenized_words = word_tokenize(text)
    return tokenized_words

## User defined function to remove all the stop words like a, the, he etc from the text



In [24]:

def remove_stopwords(words, stop_words):
    return [x for x in words if x not in stop_words]

## User defined function to count the number of positive and negative words in the text  which matches with those in the positive and negative dictionary list created form master dictionary 



In [25]:

def countfunc(store, words):
    score = 0
    for x in words:
        if(x in store):
            score = score+1
    return score


# user defined function to calculate the polarity score value of the text

In [26]:


def polarity(positive_score, negative_score):
    polarity_score = (positive_score - negative_score)/((positive_score + negative_score)+ 0.000001)
    return polarity_score


## User defined function for sentiment score categorization

In [None]:

def sentiment(score):
    if(score < -0.5):
        return 'Most Negative'
    elif(score >= -0.5 and score < 0):
        return 'Negative'
    elif(score == 0):
        return 'Neutral'
    elif(score > 0 and score < 0.5):
        return 'Positive'
    else:
        return 'Very Positive'

## User defined function to calculate the subjectivity score of the text

In [55]:


def subjectivity(positive_score, negative_score, num_words):
    return (positive_score+negative_score)/(num_words+ 0.000001)

## User defined function that return complex words with more than 2 syllables

In [28]:


def syllable_morethan2(word):
    if(len(word) > 2 and (word[-2:] == 'es' or word[-2:] == 'ed')):
        return False
    
    count =0
    vowels = ['a','e','i','o','u']
    for i in word:
        if(i.lower() in vowels):
            count = count +1
        
    if(count > 2):
        return True
    else:
        return False



## User defined function to calculate the fog index


In [56]:
    
def fog_index_cal(average_sentence_length, percentage_complexwords):
    return 0.4*(average_sentence_length + percentage_complexwords)

## Creating a column with all the 15 output variables and assigning default value as 0

In [29]:
col = ['positive_score',
      'negative_score',
      'polarity_score',
      'average_sentence_length',
      'percentage_of_complex_words',
      'fog_index',
      'complex_word_count',
      'word_count',
      'uncertainity_score',
      'constraining_score',
      'positive_word_proportion',
      'negative_word_proportion',
      'uncertainity_word_proportion',
      'constraining_word_proportion',
      'constraining_words_whole_report']

for c in col[:]:
        file[c] = 0.0



## Checking how the structure of output file we will get before appending the values of the NLP model

In [30]:
file

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME,positive_score,negative_score,polarity_score,average_sentence_length,...,fog_index,complex_word_count,word_count,uncertainity_score,constraining_score,positive_word_proportion,negative_word_proportion,uncertainity_word_proportion,constraining_word_proportion,constraining_words_whole_report
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3662,SUNBEAM CORP/FL/,199811,1998-11-12,10-K/A,edgar/data/3662/0000950170-98-002145.txt,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3662,SUNBEAM CORP/FL/,199811,1998-11-16,NT 10-Q,edgar/data/3662/0000950172-98-001203.txt,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,12239,SPHERIX INC,200704,2007-04-02,10-K,edgar/data/12239/0001104659-07-024804.txt,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
148,12239,SPHERIX INC,200705,2007-05-16,NT 10-Q,edgar/data/12239/0001104659-07-040463.txt,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
149,12239,SPHERIX INC,200705,2007-05-18,10-Q,edgar/data/12239/0001104659-07-041441.txt,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150,12239,SPHERIX INC,200705,2007-05-23,10-K/A,edgar/data/12239/0001104659-07-042333.txt,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
len(file.columns)

21

In [32]:
import re
from nltk import sent_tokenize

## Running a loop to access each elemet of the 152 links and calling all the user defined functions by passing the data

## Also appending the values returned from the user defined functions to the file

In [34]:
for i in range(len(report)):
    content = report[i]
    tokenized_words = tokenize(content)
    words = remove_stopwords(tokenized_words, stop_words)
    num_words = len(words)
    sentences = sent_tokenize(content)
    num_sentences = len(sentences)
    average_sentence_length = num_words/num_sentences
    
    positive_score = countfunc(pos_dict, words)
    negative_score = countfunc(neg_dict, words)
    
    polarity_score = polarity(positive_score, negative_score)
               
    subjectivity_score = subjectivity(positive_score, negative_score, num_words)
    
    num_complexword =0
    uncertainity_score = 0
    constraining_score = 0
    
    for word in words:
        if(syllable_morethan2(word)):
            num_complexword = num_complexword+1
                        
        if(word in uncertainity_words):
            uncertainity_score = uncertainity_score+1
                        
        if(word in constraining_words):
            constraining_score = constraining_score+1
            
    percentage_complexwords = num_complexword/num_words
                
    fog_index = fog_index_cal(average_sentence_length, percentage_complexwords)
                
                
    positive_word_proportion = positive_score/num_words
    negative_word_proportion = negative_score/num_words
    uncertainity_word_proportion = uncertainity_score/num_words
    constraining_word_proportion = constraining_score/num_words
    

    
    file.at[i,'positive_score'] = positive_score
    file.at[i,'negative_score'] = negative_score
    file.at[i,'polarity_score'] = polarity_score
    file.at[i,'average_sentence_length'] = average_sentence_length
    file.at[i,'percentage_of_complex_words'] = percentage_complexwords
    file.at[i,'fog_index'] = fog_index
    file.at[i,'complex_word_count'] = num_complexword
    file.at[i,'word_count'] = num_words
    file.at[i,'uncertainity_score'] = uncertainity_score
    file.at[i,'constraining_score'] = constraining_score
    file.at[i,'positive_word_proportion'] = positive_word_proportion
    file.at[i,'negative_word_proportion'] = negative_word_proportion
    file.at[i,'uncertainity_word_proportion'] = uncertainity_word_proportion
    file.at[i,'constraining_word_proportion'] = constraining_word_proportion
                
      
                
                
    constraining_words_whole_report = 0
    tokenized_report_words = tokenize(report[i])
    report_words = remove_stopwords(tokenized_report_words, stop_words)
    for word in report_words:
        if word in constraining_words:
            constraining_words_whole_report = 1+ constraining_words_whole_report
    
    file.at[i,'constraining_words_whole_report'] = constraining_words_whole_report

## Checking for the output for first five rows of the file with the score from the NLP model

In [38]:
file.head()

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME,positive_score,negative_score,polarity_score,average_sentence_length,...,fog_index,complex_word_count,word_count,uncertainity_score,constraining_score,positive_word_proportion,negative_word_proportion,uncertainity_word_proportion,constraining_word_proportion,constraining_words_whole_report
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt,6.0,10.0,-0.25,23.4,...,9.478803,139.0,468.0,4.0,5.0,0.012821,0.021368,0.008547,0.010684,5.0
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt,585.0,1479.0,-0.43314,43.43149,...,17.493439,32750.0,108405.0,859.0,1046.0,0.005396,0.013643,0.007924,0.009649,1046.0
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt,2.0,8.0,-0.6,44.3,...,17.829707,243.0,886.0,9.0,5.0,0.002257,0.009029,0.010158,0.005643,5.0
3,3662,SUNBEAM CORP/FL/,199811,1998-11-12,10-K/A,edgar/data/3662/0000950170-98-002145.txt,6.0,10.0,-0.25,23.4,...,9.478803,139.0,468.0,4.0,5.0,0.012821,0.021368,0.008547,0.010684,5.0
4,3662,SUNBEAM CORP/FL/,199811,1998-11-16,NT 10-Q,edgar/data/3662/0000950172-98-001203.txt,3.0,8.0,-0.454545,44.08,...,17.734359,282.0,1102.0,10.0,4.0,0.002722,0.00726,0.009074,0.00363,4.0


## Sending the file to an excel sheet with name Output1 

In [53]:
file.to_excel('Output1.xlsx')
