In [None]:
import pandas as pd
import re
from collections import defaultdict
import urllib.request
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
#nltk.download('punkt')
import os
from nltk.tokenize import sent_tokenize

# Task 1 : Creating a function to remove stopwords

### <b>There are 3 ways we remove our stop words :</b>

1. Considering all the words that are not in the master dictionary as stop words
2. Using the STOPWORDS files linked from a website to build a list of stop words
   (as we will find out, it contains around 12K stop words)
3. Using NLTK stopwords (probably not the preferred way, because NLTK doesn't have that many words)

## - <b>USING MASTER DICTIONARY</b>

In [None]:
# Load the Loughran-McDonald master dictionary; `md` is a working copy so the
# frame as read from disk stays available in `master_dict_ini`.
master_dict_ini = pd.read_excel('LoughranMcDonald_MasterDictionary_2018.xlsx')
md = master_dict_ini.copy()

# Lower-case every entry of the 'Word' column (cells are stringified first).
md['Word'] = [str(entry).lower() for entry in md['Word']]

# Alphabetically sorted list of the dictionary words.
words_to_keep = sorted(md['Word'])

In [None]:
# Number of entries taken from the master dictionary.
len(words_to_keep)

## - <b>OR: STOPWORDS FROM THE GIVEN WEBSITE</b>

In [None]:
# 1. Directory holding the individual stop-word list files.
stop_word_file_path = 'STOP-WORDS/'

list_of_files = os.listdir(stop_word_file_path)

# 2. Merge every list into one file. The target is opened once in 'w' mode so
#    re-running this cell rebuilds stop_words.txt instead of appending the
#    whole corpus to it again.
with open('stop_words.txt', 'w') as stop_words:

    for i in list_of_files:

        with open(os.path.join(stop_word_file_path, i), 'r') as words:

            content = words.read()

        stop_words.write(content + '\n')


# 3. Read the merged file back, one raw line (newline included) per entry.
with open('stop_words.txt', 'r') as stop_words:

    list_of_stop_words = stop_words.readlines()


In [None]:
#list_of_stop_words

In [None]:
# Strip the newline characters, lower-case every entry, drop duplicates and
# keep the result as an alphabetically sorted list.
list_of_stop_words = sorted(
    {entry.replace('\n', "").lower() for entry in list_of_stop_words}
)

In [None]:
#list_of_stop_words

## - <b>OR: STOPWORDS FROM NLTK MODULE</b>

In [None]:
from nltk.corpus import stopwords

In [None]:
# Fetch the NLTK 'stopwords' resource that stopwords.words() reads below.
nltk.download('stopwords')

In [None]:
# English NLTK stop words as a set; remove_stop_words(MODE=1) tests tokens against it.
stop_words = set(stopwords.words('english'))

In [None]:
# Size of the NLTK stop-word set.
len(stop_words)

## - <b>THE FUNCTION TO FILTER OUT STOPWORDS</b>

<b>MODE parameter</b> states which way you want to find stop words

- <b>MODE = 0</b> : stopwords from website
- <b>MODE = 1</b> : stopwords from nltk library
- <b>MODE = 2</b> : using master dictionary


In [None]:
def remove_stop_words(content, MODE=0):
    """Filter a sequence of tokens according to the chosen stop-word source.

    Parameters
    ----------
    content : iterable of str
        Tokens to filter. Order and duplicates of the kept tokens are preserved.
    MODE : int, default 0
        * 0 - drop tokens that appear in ``list_of_stop_words``
        * 1 - drop tokens that appear in ``stop_words``
        * 2 - keep only tokens that appear in ``words_to_keep``

    Returns
    -------
    list
        The tokens that survive the filter.

    Raises
    ------
    ValueError
        If ``MODE`` is not 0, 1 or 2 (previously this silently returned None).
    """
    # The word collections are module-level lists/sets; converting to a set
    # once per call makes every membership test O(1) instead of a list scan.
    if MODE == 0:
        blocked = set(list_of_stop_words)
        return [token for token in content if token not in blocked]

    if MODE == 1:
        blocked = set(stop_words)
        return [token for token in content if token not in blocked]

    if MODE == 2:
        allowed = set(words_to_keep)
        return [token for token in content if token in allowed]

    raise ValueError('MODE must be 0, 1 or 2, got %r' % (MODE,))

# Task 2 : Extracting the (textual data + related variables) we need, from each financial report

## 2A - PREPARING DATAFRAME

In [None]:
#!conda install -c anaconda xlrd --yes

In [None]:
# Read the list of filings; later cells use its SECFNAME and FDATE columns.
financial_report_ini_data = pd.read_excel('cik_list.xlsx') #original data

In [None]:
# Work on a copy so the frame read from disk is left unmodified.
fin_rep_data = financial_report_ini_data.copy() #copying the data

In [None]:
# Build the full URL from the untouched original frame rather than from
# fin_rep_data itself, so re-running this cell does not prepend the prefix twice.
fin_rep_data['SECFNAME'] = 'https://www.sec.gov/Archives/' + financial_report_ini_data['SECFNAME'] #modifying the column F

In [None]:
# Preview the first rows after the SECFNAME change.
fin_rep_data.head()

## 2B - GETTING UNCERTAINTY AND CONSTRAINING WORDS

In [None]:
# Read the uncertainty and constraining dictionaries.
uw_initial = pd.read_excel('uncertainty_dictionary.xlsx')
cons_initial = pd.read_excel('constraining_dictionary.xlsx')

# Reduce each dictionary to a plain list of its lower-cased 'Word' entries
# (cells are stringified first); the frames read from disk are left untouched.
uncertainity_words = [str(entry).lower() for entry in uw_initial['Word']]
constraining_words = [str(entry).lower() for entry in cons_initial['Word']]

In [None]:
# First five uncertainty words, as a quick sanity check.
uncertainity_words[:5]

In [None]:
# First five constraining words, as a quick sanity check.
constraining_words[:5]

### - modifying FDATE column's datastructure, for better use of information

In [None]:
# Type of an FDATE cell before the conversion below.
type(fin_rep_data.loc[0, 'FDATE'])

In [None]:
import datetime

In [None]:
# Rebuild every FDATE cell as a datetime.date from its year/month/day attributes.
fin_rep_data['FDATE'] = fin_rep_data['FDATE'].apply(lambda x : datetime.date(x.year,x.month,x.day))

In [None]:
# Type of an FDATE cell after the conversion above.
type(fin_rep_data.loc[0, 'FDATE'])

In [None]:
# fin_rep_data.loc[0, 'FDATE'].year

## 2D - EXTRACTING SECTION WISE INFORMATION

### - <b>EXTRACTING CONTENT FROM URLS</b>

In [None]:
# Mapping filled below with one entry per row of fin_rep_data.
# NOTE: as a defaultdict(str), looking up a missing key yields '' and inserts it.
url = defaultdict(str)

In [None]:
# Map every row label of fin_rep_data to that row's SECFNAME value.
url.update(fin_rep_data['SECFNAME'].items())

In [None]:
#url

### - <b>SECTIONS TO LOOK FOR</b> : 
management's discussion and analysis, quantitative and qualitative disclosures about market risk, risk factors

In [None]:
def extract_sections(text):
    """Locate three report sections inside ``text``.

    The patterns are written in lower case, so ``text`` has to be lower-cased
    by the caller for them to match.

    For each section a primary pattern (section ends at a later ``item N.``
    heading) is tried first; if it finds nothing, a label is printed and a
    fallback pattern (section ends at the
    ``-----end privacy-enhanced message-----`` marker) is tried.

    Returns
    -------
    tuple
        ``(mda_text, qqdmr_text, rf_text)``. Each element is the
        ``re.findall`` result - a list of ``(heading, body, terminator)``
        tuples - or the string ``'None'`` when neither pattern matched.

    Notes
    -----
    The body group ``(.*)`` is greedy and no ``re.DOTALL`` flag is passed, so
    a body extends to the last terminator reachable without crossing a newline.
    """
    def first_hit(primary, fallback, label):
        # Try the primary pattern, then the fallback, then give up with 'None'.
        hits = re.findall(primary, text)
        if len(hits) != 0:
            return hits
        print(label)
        hits = re.findall(fallback, text)
        return hits if len(hits) != 0 else 'None'

    # quantitative and qualitative disclosures about market risk
    qqdmr_text = first_hit(
        r'(\sitem\s\d\w?\.\squantitative\sand\squalitative\sdisclosures\sabout\smarket\srisk\s)(.*)(item\s\d\w?\.)',
        r'(\sitem\s\d\w?\.\squantitative\sand\squalitative\sdisclosures\sabout\smarket\srisk\s)(.*)(-----end privacy-enhanced message-----)',
        'qqdmr case 2',
    )

    # management's discussion and analysis
    mda_text = first_hit(
        r'(\sitem\s\d\w?\.\smanagement\ss\sdiscussion\sand\sanalysis\s)(.*)(item\s\d\w?\.)',
        r'(\sitem\s\d\w?\.\smanagement\ss\sdiscussion\sand\sanalysis\s)(.*)(-----end privacy-enhanced message-----)',
        'mda case 2',
    )

    # risk factors
    rf_text = first_hit(
        r'(\sitem\s\d\w?\.\srisk\sfactors\s)(.*)(item\s\d\w?\.)',
        r'(\sitem\s\d\w?\.\srisk\sfactors\s)(.*)(-----end privacy-enhanced message-----)',
        'rf case 2',
    )

    return (mda_text, qqdmr_text, rf_text)

## - <b>FUNCTIONS FOR MODIFICATIONS & GETTING VARIABLES' VALUES IN-HAND:</b>

### 1. <b>`modify_file_content`</b> 
is the function that removes unnecessary characters from the whole file and returns the content as a single string

In [None]:
def modify_file_content(l):
    """Strip markup and escape residue from the raw text of a file.

    Parameters
    ----------
    l : str
        Full content of the file.

    Returns
    -------
    str
        The content with tags, newlines, tabs, ``&nbsp;``, numeric character
        references and semicolons replaced by spaces, literal ``\\xNN`` escape
        sequences removed, and every run of whitespace collapsed to one space.
    """
    # `<[^<>]*>` matches one tag at a time. The former greedy `<.*>` ran from
    # the first '<' to the last '>' on a line and so deleted the text that sat
    # between two tags (e.g. all of "<b>Hello</b>").
    l = re.sub(r'<[^<>]*>|\n|\t|(&nbsp;)|(&#\d+)|;', ' ', l)
    # Literal backslash-x escape sequences left in the text.
    l = re.sub(r'(\\x\d\d)|(\\x\w\d)|(\\x\d\ds)', '', l)
    l = re.sub(r'(\\x92s)', '', l)
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    l = re.sub(r'\s+', ' ', l)

    return l

### 2. <b>`clean`</b> 
is used on extracted sections rather than on the whole file. It first removes more unnecessary characters and then removes stopwords. It returns the list of remaining words that are useful.

In [None]:
def clean(specific_section_text):
    """Turn the text of one section into a list of useful lower-case words.

    Every character that is not an ASCII letter becomes a space, whitespace
    runs are collapsed, the text is lower-cased and tokenised with
    ``word_tokenize``, and the tokens are filtered with
    ``remove_stop_words(..., MODE=0)``.

    Parameters
    ----------
    specific_section_text : str
        Text to clean.

    Returns
    -------
    list
        The tokens left after stop-word removal.
    """
    # Keep letters only, then squeeze the whitespace that this leaves behind.
    letters_only = re.sub("[^a-zA-Z]", " ", specific_section_text).lower()
    squeezed = re.sub(r'\s+', r' ', letters_only)

    tokens = word_tokenize(squeezed)

    return remove_stop_words(tokens, MODE=0)

### 3. <b>`section_content_info`</b> 
is the function that extracts out particular sections from the whole file, based on the regex pattern provided, and applies cleaning with the `clean` function. It returns the number of sentences and the useful words from the excerpt. It returns None if that particular section is not found in a particular file.

In [None]:
def section_content_info(content_string):
    """Count the sentences of one extracted section and return its useful words.

    Parameters
    ----------
    content_string : str or list
        The section as produced by ``extract_sections``: either text, the
        string ``'None'`` (section not found), or a ``re.findall`` result, i.e.
        a list of ``(heading, body, terminator)`` tuples, in which case the
        bodies are joined with a space.

    Returns
    -------
    list
        ``[[final_words], number_of_sentences]`` when the section has content,
        otherwise ``[None]``.
    """
    # The function used to test an undefined `temp_list` and therefore always
    # raised NameError; the "section found?" check is now made on the input.
    if isinstance(content_string, list):
        bodies = [
            match[1] if isinstance(match, tuple) and len(match) == 3 else str(match)
            for match in content_string
        ]
        content_string = ' '.join(bodies)

    if not content_string or content_string == 'None':
        return [None]

    number_of_sentences = len(sent_tokenize(content_string))

    actual_content = clean(content_string)

    # `clean` already returns tokens; they are re-joined and tokenised once
    # more so the result is the same as before this function was repaired.
    final_words = word_tokenize(' '.join(actual_content))

    return [[final_words], number_of_sentences]

### 4. <b>`analysis_whole_report`</b> 
is the function that finds the number of constraining words for a given file, which needs to be added as last variable in our final output data structure.

In [None]:
def analysis_whole_report(content):
    """Count the constraining words in a whole report.

    ``content`` is passed through ``clean``; every resulting token that appears
    in ``constraining_words`` adds one to the count (repeats are counted).

    Returns
    -------
    int
        Number of constraining-word occurrences.
    """
    return sum(1 for token in clean(content) if token in constraining_words)

### <b>THE MAIN LOOP</b>

In [None]:
from bs4 import BeautifulSoup

In [None]:
def get_text_bs(html):
    """Return the text of the ``<body>`` of ``html``, or None if there is no body.

    The markup is parsed with BeautifulSoup's ``'lxml'`` parser, every
    ``script`` and ``style`` element under the body is removed, and the
    remaining text pieces are joined with newlines.
    """
    body = BeautifulSoup(html, 'lxml').body
    if body is None:
        return None

    # Scripts first, then styles: neither contributes readable text.
    for selector in ('script', 'style'):
        for node in body.select(selector):
            node.decompose()

    return body.get_text(separator='\n')

In [None]:
# response = urllib.request.urlopen('https://www.sec.gov/Archives/edgar/data/3662/0000950170-98-002278.txt')
# content = response.read().decode('utf8')
# content = get_text_bs(content)
# content = content.lower()
# content = re.sub(r'<.*>|\n|\t|(&nbsp;)|(&#\d+)|;', ' ', content)
# content = re.sub('\s+', ' ', content)
# a,b,c = extract_sections(content)

In [None]:
# Download every report, extract its three sections and count the constraining
# words of the whole report.
#
# file[count]                        -> sections + url, `count` running from 1
# constraining_words_whole_report[i] -> count for the report stored under key
#                                       `i` of `url` (the fin_rep_data row label)
file = {}
constraining_words_whole_report = {}

for count, i in enumerate(url, start=1):

    try:
        # `with` closes the connection; errors='replace' keeps one stray
        # non-UTF-8 byte from aborting the whole run.
        with urllib.request.urlopen(url[i]) as response:
            content = response.read().decode('utf8', errors='replace')
    except OSError as err:
        # urllib's URLError/HTTPError and socket timeouts all derive from
        # OSError. Record the report as "no section found" so later cells
        # still find an entry for it.
        print('download failed for', url[i], '->', err)
        file[count] = {'a': 'None', 'b': 'None', 'c': 'None', 'url': url[i]}
        constraining_words_whole_report[i] = None
        continue

    # get_text_bs returns None when the document has no <body>.
    content = get_text_bs(content) or ''
    content = content.lower()
    content = re.sub(r'<.*>|\n|\t|(&nbsp;)|(&#\d+)|;', ' ', content)
    content = re.sub(r'\s+', ' ', content)
    a, b, c = extract_sections(content)
    file[count] = {'a': a, 'b': b, 'c': c, 'url': url[i]}
    constraining_words_whole_report[i] = analysis_whole_report(content)

## - <b>OTHER SCORES</b>

In [None]:
# modifying the index of master dictionary
# Guarded so that re-running the cell is harmless: once 'Word' has become the
# index it is no longer a column and a second set_index would raise KeyError.
if 'Word' in md.columns:
    md = md.set_index(['Word'])

In [None]:
# Preview the master dictionary after the index change.
md.head()

In [None]:
#md.columns

### THE FUNCTION CALCULATING SCORES

In [None]:
def scores(words, number_of_sentences):
    """Compute the sentiment and readability measures of one section.

    Parameters
    ----------
    words : list of str
        Cleaned tokens of the section.
    number_of_sentences : int
        Number of sentences in the section.

    Returns
    -------
    tuple
        Thirteen values, in this order: positive_score, negative_score,
        average_sentence_length, percentage_of_complex_words, fog_index,
        complex_word_count, word_count, uncertainty_score, constraining_score,
        positive_word_proportion, negative_word_proportion,
        uncertainty_word_proportion, constraining_word_proportion.

    Notes
    -----
    Reads the module-level ``md`` (looked up through its index; columns
    'Syllables', 'Positive', 'Negative'), ``uncertainity_words`` and
    ``constraining_words``. A word that is missing from ``md.index`` still
    counts towards ``word_count`` but not towards the complex / positive /
    negative counters. Every ratio whose denominator is zero is returned as 0.0.
    """
    def _ratio(numerator, denominator):
        # Empty sections would otherwise raise ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    # Plain dict lookups: a word missing from the dictionary no longer raises
    # KeyError, and a duplicated index label (first row wins) no longer makes
    # the comparison ambiguous.
    lexicon = md.loc[~md.index.duplicated(keep='first'),
                     ['Syllables', 'Positive', 'Negative']]
    syllables = lexicon['Syllables'].to_dict()
    positive_flag = lexicon['Positive'].to_dict()
    negative_flag = lexicon['Negative'].to_dict()

    uncertain = set(uncertainity_words)
    constraining = set(constraining_words)

    word_count = len(words)
    positive_score = 0
    negative_score = 0
    complex_word_count = 0
    uncertainty_score = 0
    constraining_score = 0

    for word in words:

        if word in syllables:
            # A word with more than two syllables counts as complex.
            if syllables[word] > 2:
                complex_word_count += 1
            if positive_flag[word] != 0:
                positive_score += 1
            if negative_flag[word] != 0:
                negative_score += 1

        if word in uncertain:
            uncertainty_score += 1

        if word in constraining:
            constraining_score += 1

    average_sentence_length = _ratio(word_count, number_of_sentences)
    percentage_of_complex_words = _ratio(complex_word_count, word_count)
    fog_index = (0.4) * (average_sentence_length + percentage_of_complex_words)

    positive_word_proportion = _ratio(positive_score, word_count)
    negative_word_proportion = _ratio(negative_score, word_count)
    uncertainty_word_proportion = _ratio(uncertainty_score, word_count)
    # Was `/ words_count`, an undefined name.
    constraining_word_proportion = _ratio(constraining_score, word_count)

    return (positive_score,
            negative_score,
            average_sentence_length,
            percentage_of_complex_words,
            fog_index,
            complex_word_count,
            word_count,
            uncertainty_score,
            constraining_score,
            positive_word_proportion,
            negative_word_proportion,
            uncertainty_word_proportion,
            constraining_word_proportion
            )

In [None]:
# Output template, plus a fresh read of the filing list that supplies its
# identifying columns.
final_df = pd.read_excel('Output Data Structure.xlsx')
ref_df = pd.read_excel('cik_list.xlsx')

In [None]:
# Copy the six identifying columns of the filing list into the output frame.
final_df[['CIK', 'CONAME', 'FYRMO', 'FDATE', 'FORM', 'SECFNAME']] = ref_df[['CIK', 'CONAME', 'FYRMO', 'FDATE', 'FORM', 'SECFNAME']]


In [None]:
# Display the output frame as it stands after the identifying columns are filled.
final_df

In [None]:
# Order of the values returned by scores(), kept as a reference for the cells
# below. These are names, not variables: the identifiers only exist as locals
# inside scores(), so evaluating them here raised NameError on a fresh kernel.
SCORE_FIELD_ORDER = ('positive_score',
                     'negative_score',
                     'average_sentence_length',
                     'percentage_of_complex_words',
                     'fog_index',
                     'complex_word_count',
                     'word_count',
                     'uncertainty_score',
                     'constraining_score',
                     'positive_word_proportion',
                     'negative_word_proportion',
                     'uncertainty_word_proportion',
                     'constraining_word_proportion'
                     )
SCORE_FIELD_ORDER

In [None]:
# One empty, module-level Series per output variable (every column after the
# six identifying ones), created without exec().
# setdefault leaves alone any variable of that name that an earlier cell has
# already filled in; dtype=object avoids the dtype-less pd.Series() warning.
for var in final_df.columns[6:]:
    globals().setdefault(var, pd.Series(dtype=object))

In [None]:
def polarity_calculation(pos_score, neg_score):
    """Return (positive - negative) / (positive + negative).

    A constant 0.000001 is added to the denominator so the result is defined
    (and equal to 0.0) when both scores are zero.
    """
    difference = pos_score - neg_score
    total = 0.000001 + (pos_score + neg_score)
    return difference / total

In [None]:
# Score the three sections of every report and store each value in the
# module-level Series named '<section>_<measure>'.
#
# The previous version could not run: a space after the line-continuation
# backslashes, thirteen targets assigned from a two-item tuple (scores() was
# never called) and misspelt '*_negative_scorea' names.

# Measure names, in the order scores() returns its values.
score_fields = ('positive_score', 'negative_score', 'average_sentence_length',
                'percentage_of_complex_words', 'fog_index', 'complex_word_count',
                'word_count', 'uncertainty_score', 'constraining_score',
                'positive_word_proportion', 'negative_word_proportion',
                'uncertainty_word_proportion', 'constraining_word_proportion')

# Series-name prefix -> key of that section inside file[i].
section_keys = (('mda', 'a'), ('qqdmr', 'b'), ('rf', 'c'))

# file[] is numbered from 1 in the iteration order of `url`. Values are stored
# under the matching `url` key (the report's row label) so that they line up
# with the rows of the report table - TODO confirm final_df shares that index.
for row_label, i in zip(url, range(1, 1 + len(url))):

    for prefix, key in section_keys:

        info = section_content_info(file[i][key])

        # [None] means the section was not found; leave its values missing.
        if info[0] is None:
            continue

        # section_content_info returns [[words], number_of_sentences].
        words, number_of_sentences = info[0][0], info[1]

        values = scores(words, number_of_sentences)

        # .loc adds the label when it is not in the Series yet.
        for field, value in zip(score_fields, values):
            globals()[prefix + '_' + field].loc[row_label] = value

        globals()[prefix + '_polarity_score'].loc[row_label] = \
            polarity_calculation(values[0], values[1])
        
        

# FINAL DATA STRUCTURE

In [None]:
# Copy every computed variable into its column of the output frame.
# The old exec("final_df['var'] = var") wrote the column *name* into a single
# column literally called 'var'.
# pd.Series(...) accepts a Series or a dict; the assignment aligns on the index,
# which assumes each variable is keyed by final_df's row labels - TODO confirm.
for var in final_df.columns[6:]:
    final_df[var] = pd.Series(globals()[var])