In [28]:
import os
import pandas as pd, re
from nltk.tokenize import sent_tokenize,word_tokenize

In [8]:
df = pd.read_excel('cik_list.xlsx')

In [9]:
df.head(3)

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt


In [36]:
df.columns

Index(['CIK', 'CONAME', 'FYRMO', 'FDATE', 'FORM', 'SECFNAME'], dtype='object')

In [10]:
def encode_name(secfname):
    splitted = secfname.split(sep="/")
    name = "+".join(splitted)
    return name

## Remove all non-alphanumeric characters except newline and space

with open('./input_dir/edgar+data+11860+0000011860-00-000019.txt','r', encoding="utf-8") as file:
    lines = file.readlines()
    print(len('\n'.join(lines)))
    for i in range(len(lines)):
        lines[i] = re.sub(r'[^A-Za-z0-9\n\t ]+', ' ', lines[i])
    print(len('\n'.join(lines)))
    for i in range(len(lines)):
        lines[i] = re.sub(r'[^A-Za-z\n\t ]+', ' ', lines[i])
    print(len('\n'.join(lines)))

## Preprocess

def preprocess(secfname):
    file_path = './input_dir/'+encode_name(secfname)
    with open(file_path,'r', encoding="utf-8") as file:
        data = file.read()
        ## Remove all non-aphabetic characters except newline character
        for i in range(len(lines)):
            lines[i] = re.sub(r'[^A-Za-z\n ]+', ' ', lines[i])
            lines[i] = re.sub(' +', ' ', lines[i])
        ## Remove lines with length less than 3
        lines = [x for x in lines if len(x)>=3]
        
    write_path = './input_dir/new/'+encode_name(secfname)
    with open(write_path,'w', encoding="utf-8") as w_file:
        w_file.writelines(lines)

In [178]:
df['SECFNAME'].apply(preprocess)

0      None
1      None
2      None
3      None
4      None
       ... 
147    None
148    None
149    None
150    None
151    None
Name: SECFNAME, Length: 152, dtype: object

In [11]:
from bs4 import BeautifulSoup as BS

In [12]:
with open('./StopWords_Generic.txt' ,'r') as stop_words_file:
    stopwords = stop_words_file.read()
    stopwords = stopwords.lower().split(sep='\n')

def remove_stop_words(text):    
    word_tokens = word_tokenize(text)
    filtered_data = [w for w in word_tokens if not w in stopwords]
    return filtered_data

In [13]:
with open('./pos_words.txt','r') as pw_f:
    pw=pw_f.read()
    pw_list = pw.split('\n')

def get_positive_score(filtered_text_list):
    pw_count = 0
    for word in filtered_text_list:
        if word in pw_list:
            pw_count = pw_count + 1
    return pw_count

In [14]:
with open('./neg_words.txt','r') as nw_f:
    nw = nw_f.read()
    nw_list = nw.split('\n')

def get_negative_score(filtered_text_list):
    nw_count = 0
    for word in filtered_text_list:
        if word in nw_list:
            nw_count = nw_count + 1
    return nw_count

In [18]:
def get_polarity_score(pos_score, neg_score):
    pol_score = (pos_score - neg_score) / ((pos_score + neg_score) + 0.000001)
    return pol_score

In [19]:
def get_avg_sentence_length(sent_tokens,filtered_text_list):
    sent_len,word_len = len(sent_tokens),len(filtered_text_list)
    if sent_len !=0:
        avg_sent_len = word_len/(1.0*sent_len)
    else:
        avg_sent_len = -1
    return avg_sent_len

In [20]:
## Obtained code from https://eayd.in/?p=232

def sylco(word) :

    word = word.lower()

    # exception_add are words that need extra syllables
    # exception_del are words that need less syllables

    exception_add = ['serious','crucial']
    exception_del = ['fortunately','unfortunately']

    co_one = ['cool','coach','coat','coal','count','coin','coarse','coup','coif','cook','coign','coiffe','coof','court']
    co_two = ['coapt','coed','coinci']

    pre_one = ['preach']

    syls = 0 #added syllable number
    disc = 0 #discarded syllable number

    #1) if letters < 3 : return 1
    if len(word) <= 3 :
        syls = 1
        return syls

    #2) if doesn't end with "ted" or "tes" or "ses" or "ied" or "ies", discard "es" and "ed" at the end.
    # if it has only 1 vowel or 1 set of consecutive vowels, discard. (like "speed", "fled" etc.)

    if word[-2:] == "es" or word[-2:] == "ed" :
        doubleAndtripple_1 = len(re.findall(r'[eaoui][eaoui]',word))
        if doubleAndtripple_1 > 1 or len(re.findall(r'[eaoui][^eaoui]',word)) > 1 :
            if word[-3:] == "ted" or word[-3:] == "tes" or word[-3:] == "ses" or word[-3:] == "ied" or word[-3:] == "ies" :
                pass
            else :
                disc+=1

    #3) discard trailing "e", except where ending is "le"  

    le_except = ['whole','mobile','pole','male','female','hale','pale','tale','sale','aisle','whale','while']

    if word[-1:] == "e" :
        if word[-2:] == "le" and word not in le_except :
            pass

        else :
            disc+=1

    #4) check if consecutive vowels exists, triplets or pairs, count them as one.

    doubleAndtripple = len(re.findall(r'[eaoui][eaoui]',word))
    tripple = len(re.findall(r'[eaoui][eaoui][eaoui]',word))
    disc+=doubleAndtripple + tripple

    #5) count remaining vowels in word.
    numVowels = len(re.findall(r'[eaoui]',word))

    #6) add one if starts with "mc"
    if word[:2] == "mc" :
        syls+=1

    #7) add one if ends with "y" but is not surrouned by vowel
    if word[-1:] == "y" and word[-2] not in "aeoui" :
        syls +=1

    #8) add one if "y" is surrounded by non-vowels and is not in the last word.

    for i,j in enumerate(word) :
        if j == "y" :
            if (i != 0) and (i != len(word)-1) :
                if word[i-1] not in "aeoui" and word[i+1] not in "aeoui" :
                    syls+=1

    #9) if starts with "tri-" or "bi-" and is followed by a vowel, add one.

    if word[:3] == "tri" and word[3] in "aeoui" :
        syls+=1

    if word[:2] == "bi" and word[2] in "aeoui" :
        syls+=1

    #10) if ends with "-ian", should be counted as two syllables, except for "-tian" and "-cian"

    if word[-3:] == "ian" : 
    #and (word[-4:] != "cian" or word[-4:] != "tian") :
        if word[-4:] == "cian" or word[-4:] == "tian" :
            pass
        else :
            syls+=1

    #11) if starts with "co-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly.

    if word[:2] == "co" and word[2] in 'eaoui' :

        if word[:4] in co_two or word[:5] in co_two or word[:6] in co_two :
            syls+=1
        elif word[:4] in co_one or word[:5] in co_one or word[:6] in co_one :
            pass
        else :
            syls+=1

    #12) if starts with "pre-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly.

    if word[:3] == "pre" and word[3] in 'eaoui' :
        if word[:6] in pre_one :
            pass
        else :
            syls+=1

    #13) check for "-n't" and cross match with dictionary to add syllable.

    negative = ["doesn't", "isn't", "shouldn't", "couldn't","wouldn't"]

    if word[-3:] == "n't" :
        if word in negative :
            syls+=1
        else :
            pass   

    #14) Handling the exceptional words.

    if word in exception_del :
        disc+=1

    if word in exception_add :
        syls+=1     

    # calculate the output
    return numVowels - disc + syls

In [21]:
def get_perc_complex_words(words):
    count = 0
    for word in words:
        if sylco(word)>=2:
            count = count + 1
    if len(words) == 0:
        perc_complex_words = 0
    else:
        perc_complex_words = (100.0*count)/(1.0*len(words))
    return count,perc_complex_words

In [22]:
def get_fog_index(avg_sent_length,perc_complex_words):
    return 0.4 * (avg_sent_length + perc_complex_words)

In [23]:
uncertain_words = list(pd.read_excel('uncertainty_dictionary.xlsx')['Word'])
uncertain_words = [item.lower() for item in uncertain_words]

def get_uncertainity_score(words):
    count = 0
    for word in words:
        if word in uncertain_words:
            count += 1
    return count

In [31]:
constrain_words = list(pd.read_excel('constraining_dictionary.xlsx')['Word'])
constrain_words = [item.lower() for item in constrain_words]
global_constrain_count,global_word_count = 0,0

def get_constrain_score(words):
    count = 0
    for word in words:
        if word in constrain_words:
            count += 1
    #global_constrain_count += count
    #global_word_count += len(words)
    return count

In [25]:
def preprocess(secfname):
    file_path = './input_dir/'+encode_name(secfname)
    with open(file_path,'r', encoding="utf-8") as file:
        data = file.read()
    soup = BS(data,'lxml')
    ## Convert soup text to lowercase
    data = soup.text.lower()
    ## Remove all non-alpha characters except newline and full-stop(end of sentence character)
    data = re.sub(r'[^A-Za-z\n\. ]+', ' ', data)
    ## Replace mulitple spaces with single space
    data = re.sub(' +', ' ', data)
    ## Replace blank lines
    data = re.sub(r'\n+', '\n', data)
    ## tokenize sentences
    sent_tokens = sent_tokenize(data)
    ## Remove full-stop(end of sentence)
    data = re.sub(r'[^A-Za-z\n ]+', ' ', data)
    ## Remove stopwords
    filtered_text_list = remove_stop_words(data)
    ## Get positive_score
    pos_score = get_positive_score(filtered_text_list)
    ## Get negative score
    neg_score = get_negative_score(filtered_text_list)
    ## Get polarity score
    pol_score = get_polarity_score(pos_score, neg_score)
    ## Get average sentences length
    avg_sent_length = get_avg_sentence_length(sent_tokens,filtered_text_list)
    ## Get percentage of complex words
    complex_word_count,perc_complex_words = get_perc_complex_words(filtered_text_list)
    ## Get fog index
    fog_index = get_fog_index(avg_sent_length,perc_complex_words)
    ## Calculate word count
    word_count = len(filtered_text_list)
    ## Calculate uncertainity score
    uncertainity_score = get_uncertainity_score(filtered_text_list)
    ## Calculate constrain score
    constrain_score = get_constrain_score(filtered_text_list)
    ## Calculate Positive word proportion
    pos_word_proportion = pos_score/word_count
    ## Calculate Negative word proportion
    neg_word_proportion = neg_score/word_count
    ## Calculate uncertainity word proportion
    uncertain_word_proportion = uncertainity_score/word_count
    ## Calculate constraint proportion
    constraint_proportion = constrain_score/word_count
    
    result = (pos_score,neg_score,pol_score,avg_sent_length,
              perc_complex_words,fog_index,complex_word_count,
              word_count,uncertainity_score,constrain_score,
              pos_word_proportion,neg_word_proportion,neg_word_proportion,
              constraint_proportion)
    return result

write_path = './input_dir/new/'+encode_name(secfname)
with open(write_path,'w', encoding="utf-8") as w_file:
    w_file.write(data)

In [26]:
def get_whole_report_constraint_index(df):
    global_constraint_index = global_constrain_count/(1.0*global_word_count)
    df['constraint_index_for_whole_rep'] = global_constraint_index

In [29]:
df.head(10)

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt
3,3662,SUNBEAM CORP/FL/,199811,1998-11-12,10-K/A,edgar/data/3662/0000950170-98-002145.txt
4,3662,SUNBEAM CORP/FL/,199811,1998-11-16,NT 10-Q,edgar/data/3662/0000950172-98-001203.txt
5,3662,SUNBEAM CORP/FL/,199811,1998-11-25,10-Q/A,edgar/data/3662/0000950170-98-002278.txt
6,3662,SUNBEAM CORP/FL/,199812,1998-12-22,10-Q,edgar/data/3662/0000950170-98-002401.txt
7,3662,SUNBEAM CORP/FL/,199812,1998-12-22,10-Q,edgar/data/3662/0000950170-98-002402.txt
8,3662,SUNBEAM CORP/FL/,199903,1999-03-31,NT 10-K,edgar/data/3662/0000950172-99-000362.txt
9,3662,SUNBEAM CORP/FL/,199905,1999-05-11,10-K,edgar/data/3662/0000950170-99-000775.txt


In [32]:
new_list = []
for index, row in df.head(10).iterrows():
    secfname = row['SECFNAME']
    result = preprocess(secfname)
    new_list.append(result)

columns = ['pos_score','neg_score','pol_score','avg_sent_length',
              'perc_complex_words','fog_index','complex_word_count',
              'word_count','uncertainity_score','constrain_score',
              'pos_word_proportion','neg_word_proportion','neg_word_proportion',
              'constraint_proportion']
res_df = pd.DataFrame(new_list,columns=columns)

In [34]:
final_df = pd.concat((df.head(10),res_df),axis=1)
final_df

Unnamed: 0,CIK,CONAME,FYRMO,FDATE,FORM,SECFNAME,pos_score,neg_score,pol_score,avg_sent_length,perc_complex_words,fog_index,complex_word_count,word_count,uncertainity_score,constrain_score,pos_word_proportion,neg_word_proportion,neg_word_proportion.1,constraint_proportion
0,3662,SUNBEAM CORP/FL/,199803,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt,614,3452,-0.697983,15.976875,71.027033,34.801563,66737,93960,940,1487,0.006535,0.036739,0.036739,0.015826
1,3662,SUNBEAM CORP/FL/,199805,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt,341,1723,-0.669574,15.848524,69.230769,34.031717,42012,60684,859,1046,0.005619,0.028393,0.028393,0.017237
2,3662,SUNBEAM CORP/FL/,199808,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt,3,7,-0.4,33.722222,57.990115,36.684935,352,607,9,5,0.004942,0.011532,0.011532,0.008237
3,3662,SUNBEAM CORP/FL/,199811,1998-11-12,10-K/A,edgar/data/3662/0000950170-98-002145.txt,233,1579,-0.742826,12.809294,69.978402,33.115078,33372,47689,553,716,0.004886,0.03311,0.03311,0.015014
4,3662,SUNBEAM CORP/FL/,199811,1998-11-16,NT 10-Q,edgar/data/3662/0000950172-98-001203.txt,4,7,-0.272727,34.952381,58.174387,37.250707,427,734,10,4,0.00545,0.009537,0.009537,0.00545
5,3662,SUNBEAM CORP/FL/,199811,1998-11-25,10-Q/A,edgar/data/3662/0000950170-98-002278.txt,40,426,-0.828326,14.535937,70.568634,34.041829,6565,9303,179,91,0.0043,0.045792,0.045792,0.009782
6,3662,SUNBEAM CORP/FL/,199812,1998-12-22,10-Q,edgar/data/3662/0000950170-98-002401.txt,97,622,-0.730181,14.467221,69.05157,33.407516,10819,15668,279,271,0.006191,0.039699,0.039699,0.017296
7,3662,SUNBEAM CORP/FL/,199812,1998-12-22,10-Q,edgar/data/3662/0000950170-98-002402.txt,51,378,-0.762238,13.457182,70.922714,33.751958,6910,9743,201,105,0.005235,0.038797,0.038797,0.010777
8,3662,SUNBEAM CORP/FL/,199903,1999-03-31,NT 10-K,edgar/data/3662/0000950172-99-000362.txt,3,5,-0.25,33.8,58.284024,36.833609,394,676,8,3,0.004438,0.007396,0.007396,0.004438
9,3662,SUNBEAM CORP/FL/,199905,1999-05-11,10-K,edgar/data/3662/0000950170-99-000775.txt,278,1779,-0.729703,12.771968,72.21425,33.994487,34218,47384,668,501,0.005867,0.037544,0.037544,0.010573
