In [1]:
import pandas as pd
import re
from collections import defaultdict
import urllib.request
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
#nltk.download('punkt')
import os
from nltk.tokenize import sent_tokenize

## Task 1 : Creating a function to remove stopwords

<b>There are 3 ways we remove our stop words :</b>

1. Considering all the words that are not in master dictionary as stop words
2. Link to a website containing STOPWORDS files, can be used to a list of STOPWORDS
   which we will find out that, it has 12K around STOPWORDS
3. Using NLTK stopwords(it might not be a much of a preferred way because NLTK dosesn't have that many words

- <b>USING MASTER DICTIONARY</b>

In [7]:
words_to_keep = list(md['Word'])

In [2]:
master_dict_ini = pd.read_excel('LoughranMcDonald_MasterDictionary_2018.xlsx')

md = master_dict_ini.copy()

#md

md['Word'] = md['Word'].apply(lambda x: str(x).lower())

words_to_keep.sort()

In [9]:
len(words_to_keep)

86486

- <b>STOPWORDS FROM THE GIVEN WEBSITE</b>

In [13]:
#1
stop_word_file_path = '/home/pramila/Desktop/CLASSWORK/ML-ASGN-BLACKCOFFER/blackcoffer assignment/STOP-WORDS/'

list_of_files = os.listdir(stop_word_file_path)

#2
for i in list_of_files:
    
    with open(os.path.join(stop_word_file_path, i), 'r') as words :
        
        content = words.read()
        
        with open('stop_words.txt', 'a+') as stop_words :
            
            stop_words.write(content + '\n')
            
            
#3
with open('stop_words.txt', 'r') as stop_words :
            
            list_of_stop_words = stop_words.readlines()


In [15]:
#list_of_stop_words

In [16]:
#some modification req in stop_words_list

for i in range(len(list_of_stop_words)) :
    
    list_of_stop_words[i] = list_of_stop_words[i].replace('\n', "").lower()


list_of_stop_words = list(set(list_of_stop_words))

list_of_stop_words.sort()

In [25]:
#list_of_stop_words

- <b>STOPWORDS FROM NLTK MODULE</b>

In [18]:
from nltk.corpus import stopwords

In [20]:
#nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pramila/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [21]:
stop_words = set(stopwords.words('english'))

In [23]:
len(stop_words)

179

- <b>THE FUNCTION TO FILTER OUT STOPWORDS</b>

<b>MODE parameter</b> states which way you want to find stop words

- <b>MODE = 0</b> : stopwords from website
- <b>MODE = 1</b> : stopwords from nltk library
- <b>MODE = 2</b> : using master dictionary


In [24]:
def remove_stop_words(content, MODE=0):
    
    filtered_words = []
    
    if MODE == 0 :
        
        for i in content:

            if i not in list_of_stop_words:

                filtered_words.append(i)

        return filtered_words
    
    if MODE == 1 :
        
        for i in content:

            if i not in stop_words:

                filtered_words.append(i)
                
        return filtered_words
    
    if MODE == 2 :
        
        for i in content:
            
            if i in words_to_keep:
                
                filtered_words.append(i)
                
        return filtered_words

## Task 2 : Extracting the (textual data + related variables) we need, from each financial report

### 2A - preparing dataframe

In [26]:
#!conda install -c anaconda xlrd --yes

In [27]:
financial_report_ini_data = pd.read_excel('cik_list.xlsx') #original data

In [28]:
fin_rep_data = financial_report_ini_data.copy() #copying the data

In [29]:
fin_rep_data['SECFNAME'] = 'https://www.sec.gov/Archives/' + fin_rep_data['SECFNAME'] #modifying the column F

In [30]:
#fin_rep_data

### 2B - getting contraining and uncertainity words

In [43]:
# getting uncertain and constraining words 

uw_initial = pd.read_excel('uncertainty_dictionary.xlsx')
cons_initial = pd.read_excel('constraining_dictionary.xlsx')

uncertainity_words = uw_initial.copy()
constraining_words = cons_initial.copy()

#constraining_words

uncertainity_words['Word'] = uncertainity_words['Word'].apply(lambda x: str(x).lower())
constraining_words['Word'] = constraining_words['Word'].apply(lambda x: str(x).lower())

uncertainity_words = list(uncertainity_words['Word'])
constraining_words = list(constraining_words['Word'])

In [45]:
#uncertainity_words

In [44]:
#constraining_words

### 2C - modifying FDATE column's datastructure, for better use of information

In [31]:
type(fin_rep_data.loc[0, 'FDATE'])

pandas._libs.tslibs.timestamps.Timestamp

In [32]:
import datetime

In [33]:
fin_rep_data['FDATE'] = fin_rep_data['FDATE'].apply(lambda x : datetime.date(x.year,x.month,x.day))

In [34]:
type(fin_rep_data.loc[0, 'FDATE'])

datetime.date

In [35]:
#fin_rep_data.loc[0, 'FDATE'].year

### 2D - extracting section wise informations

- <b>EXTRACTING CONTENT FROM URLS</b>

In [36]:
url = defaultdict(str)

In [37]:
for i in fin_rep_data.index:
    
    url[i] = fin_rep_data.loc[i, 'SECFNAME']

In [38]:
#url

- <b>SECTIONS TO LOOK FOR</b> : Management's Discussion and Analysis, Quantitative and Qualitative Disclosures about Market Risk, Risk Factors

In [39]:
sections_patterns = [
    r'''(ITEM\s.\.\sMANAGEMENTS\sDISCUSSION\sAND\sANALYSIS\sOF\sFINANCIAL\sCONDITION\sAND\sRESULTS)(.*?)(ITEM\s\d\.\s)''', 
    r'''(ITEM\s.\.\sQUANTITATIVE\sAND\sQUALITATIVE\sDISCLOSURES\sABOUT\sMARKET\sRISK)(.*?)(ITEM\s.\.\s)''',
]

- <b>FUNCTIONS FOR MODIFICATIONS & GETTING VARIABLES' VALUES IN-HAND:</b>

1. <b>`modify_file_content`</b> is the function that remove unneccessary characters in whole file, and returns content in the form of string

In [56]:
def modify_file_content(l):
    
    #l will the list of lines in file and function returns a full content in form of single string
    
    for i in l:
        i.replace('\n', ' ')
        i.replace('\\', '')
        i.replace('\t', ' ')
        i.replace("\'", '')
    
    #new_str = ' '.join(l)
    
    #return new_str
    
    return l

2. <b>`clean`</b> is used on extracted out sections rather than whole file. It first removes more of unneccessary characters and the removes stopwords. It returns list of remaining words that are useful.

In [41]:
def clean(actual_content):
    
    # 1. UNNECCESSARY CHARACTERS REMOVAL
    
    actual_content = re.sub(r"(<.*>)",r" ",actual_content).lower()
    
    actual_content = re.sub("[^a-zA-Z]"," ",actual_content).lower()

    actual_content = re.sub(r'\s+', r' ', actual_content).lower() 
    
    # 2. REMOVING STOP WORDS
    
    actual_content = list(set(actual_content.split()))
        
    actual_content = remove_stop_words(actual_content, MODE=0)
    
    return actual_content

3. <b>`section_content_info`</b> is the function that extracts out particular sections from whole file, based on regex pattern provided and applies cleaning with `clean` function. It return number of sentences and useful words from the excerpt. It returns None if that particular section is not found in particular file.

In [47]:
def section_content_info(content_string, pattern):
    
    temp_list = re.findall(pattern, content_string, re.M)
    
    if len(temp_list) != 0 :

        actual_content = temp_list[0][1]

        number_of_sentences = len(sent_tokenize(actual_content))
        
        actual_content = clean(actual_content)
        
        # NLTK TOKENIZER

        #    IT CAN BE NOTED THAT OUR TEXT STRING IS TOKENIZED UPTILL NOW
        #    WE DON'T NEED NLTK TOKENIZER, BUT STILL, JUST IN CASE WE
        #    HAVE MISSED SOMETHING, NLTK TOKENIZER WILL HANDLE THAT !

        actual_content = ' '.join(actual_content)

        final_words = word_tokenize(actual_content)

        return [[final_words], number_of_sentences]
    
    
    else:
        
        return [None]

4. <b>`analysis_whole_report`</b> is the function that finds the number of constraining words for a given file, which needs to be added as last variable in our final output data structure.

In [46]:
def analysis_whole_report(content):
    
    new_content = clean(content)
    
    counter = 0
    
    for i in new_content :
        
        if i in constraining_words :
            
            counter += 1
            
            
    return counter

<b>THE LOOP</b>

In [48]:
c = 0 

mda = defaultdict(list)
qqdmr = defaultdict(list)
#rf = default_dict()

constraining_words_whole_report = defaultdict()

for i in url :
    
    # 1. getting the content of the url
    
    response = urllib.request.urlopen(url[i])
    content = response.read().decode('utf8')
    
    # 2. writing the url content into a file
    
#     with open('content.txt', 'w+') as f:
#         f.write(raw)

    # 3. list of lines
    
#     with open("content.txt","r") as f:
#         content = f.readlines()
        
    # 4. making the content into better form
    
    content = modify_file_content(content)
    
    # 5. Extract text for each section
    
    mda[c] = section_content_info(content, sections_patterns[0])
    
    qqdmr[c] = section_content_info(content, sections_patterns[1])
    
    #rq[c] = section_content_info(content, sections_patterns[2])
    
    constraining_words_whole_report[c] = analysis_whole_report(content)

In [60]:
url[i]

'https://www.sec.gov/Archives/edgar/data/12239/0001104659-07-062470.txt'

In [57]:
raw = modify_file_content(raw)

In [58]:
raw

'-----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc-Type: 2001,MIC-CLEAR\nOriginator-Name: webmaster@www.sec.gov\nOriginator-Key-Asymmetric:\n MFgwCgYEVQgBAQICAf8DSgAwRwJAW2sNKK9AVtBzYZmr6aGjlWyK3XmZv3dTINen\n TWSM7vrzLADbmYQaionwg5sDW3P6oaM5D3tdezXMm7z1T+B+twIDAQAB\nMIC-Info: RSA-MD5,RSA,\n GggxmDn8Rz+qXkWInHMXVJo4lq+USxFD0ibP1wbMUIIpq9zdf5JaLUyTgbZ9PcaR\n I3GlYUYthathB2T24v8kng==\n\n<SEC-DOCUMENT>0001104659-07-062470.txt : 20070814\n<SEC-HEADER>0001104659-07-062470.hdr.sgml : 20070814\n<ACCEPTANCE-DATETIME>20070814161159\nACCESSION NUMBER:\t\t0001104659-07-062470\nCONFORMED SUBMISSION TYPE:\t10-Q\nPUBLIC DOCUMENT COUNT:\t\t3\nCONFORMED PERIOD OF REPORT:\t20070630\nFILED AS OF DATE:\t\t20070814\nDATE AS OF CHANGE:\t\t20070814\n\nFILER:\n\n\tCOMPANY DATA:\t\n\t\tCOMPANY CONFORMED NAME:\t\t\tSPHERIX INC\n\t\tCENTRAL INDEX KEY:\t\t\t0000012239\n\t\tSTANDARD INDUSTRIAL CLASSIFICATION:\tSERVICES-ENGINEERING, ACCOUNTING, RESEARCH, MANAGEMENT [8700]\n\t\tIRS NUMBER:\t\t\t\t520849320\n\t\tSTATE O

- <b>OTHER SCORES</b>

In [63]:
# modifying the index of master dictionary

md = md.set_index(['Word'])

In [65]:
md.head()

Unnamed: 0_level_0,Sequence Number,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Constraining,Superfluous,Interesting,Modal,Irr_Verb,Harvard_IV,Syllables,Source
Word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
aardvark,1,277,1.480368e-08,1.239377e-08,3.56473e-06,84,0,0,0,0,0,0,0,0,0,0,2,12of12inf
aardvarks,2,3,1.603287e-10,9.72511e-12,9.863549e-09,1,0,0,0,0,0,0,0,0,0,0,2,12of12inf
abaci,3,8,4.275431e-10,1.386497e-10,6.225591e-08,7,0,0,0,0,0,0,0,0,0,0,3,12of12inf
aback,4,12,6.413147e-10,3.159061e-10,9.383557e-08,12,0,0,0,0,0,0,0,0,0,0,2,12of12inf
abacus,5,7250,3.87461e-07,3.681624e-07,3.366553e-05,914,0,0,0,0,0,0,0,0,0,0,3,12of12inf


In [66]:
#md.columns

Index(['Sequence Number', 'Word Count', 'Word Proportion',
       'Average Proportion', 'Std Dev', 'Doc Count', 'Negative', 'Positive',
       'Uncertainty', 'Litigious', 'Constraining', 'Superfluous',
       'Interesting', 'Modal', 'Irr_Verb', 'Harvard_IV', 'Syllables',
       'Source'],
      dtype='object')

In [88]:
def scores(words, number_of_sentences):
    
    negative_score = 0
    
    positive_score = 0
    
    complex_word_count = 0
    
    word_count = len(words)
    
    word_length = 0
    
    uncertainty_score = 0
    
    constraining_score = 0
    
    for word in words:
        
        word_length += len(word)
        
        if md.loc[word, 'Syllables'] >2 :
            
            complex_word_count +=1
        
        if md.loc[word, 'Positive'] != 0:
            
            positive_score += 1
        
        if md.loc[word, 'Negative'] != 0:
            
            negative_score += 1
    
        if word in uncertainity_words:
            
            uncertainty_score += 1
            
        if word in constraining_words:
            
            constraining_score += 1
            
    polarity_score = (positive_score - negative_score)/((positive_score + negative_score) + 0.000001)
    
    subjectivity_score = (positive_score + negative_score)/(len(words) + 0.000001)

    average_sentence_length = len(words)/number_of_sentences
    
    percentage_of_complex_words = complex_word_count / len(words)
    
    fog_index = (0.4)*(average_sentence_length + percentage_of_complex_words)
    
    average_word_length = word_length / len(words)
    
    positive_word_proportion = positive_score / word_count
    
    negative_word_proportion = negative_score / word_count
    
    uncertainty_word_proportion = uncertainty_score / word_count
    
    constraining_word_proportion = constraining_score / words_count
    
    return (positive_score, 
            negative_score, 
            average_sentence_length, 
            percentage_of_complex_words, 
            fog_index, 
            complex_word_count,
            word_count,
            uncertainty_score,
            constraining_score,
            positive_word_proportion,
            negative_word_proportion,
            uncertainty_word_proportion,
            constraining_word_proportion
            )

In [89]:
def final_variables():
    
    for i in mda.keys():
        (mda_positive_score[i],
        mda_negative_score[i],
        mda_polarity_score[i],
        mda_average_sentence_length[i],
        mda_percentage_of_complex_words[i],
        mda_fog_index[i],
        mda_complex_word_count[i],
        mda_word_count[i],
        mda_uncertainty_score[i],
        mda_constraining_score[i],
        mda_positive_word_proportion[i],
        mda_negative_word_proportion[i],
        mda_uncertainty_word_proportion[i],
        mda_constraining_word_proportion[i]) = scores(mda[i])
                
                

In [None]:
response = urllib.request.urlopen(url)

raw = response.read().decode('utf8')

In [None]:
# local_filename, headers = urllib.request.urlretrieve('https://www.sec.gov/Archives/edgar/data/3662/0000950170-98-000413.txt')
# html = open(local_filename).decode('utf8')

In [None]:
# sections = ["Management's Discussion and Analysis", "Quantitative and Qualitative Disclosures about Market Risk", "Risk Factors"]

# z = html.read()



In [None]:
# f= open("temp.txt","w+")

In [None]:
# f.write(z)

In [None]:
# f.close()

In [None]:
# f = open('temp.txt', 'r')

In [None]:
# f.read()

In [None]:

    
# print(f[:-1])
        
# f = open('temp_file.txt', 'w')

# f.write(z)

# k = open('temp_file.txt', 'r')

# k.readlines(1)

# k.close()

# f.close()
    
#     print(type(html))
            
#     with html as fileinput:
        
#         print(fileinput)
        
#         exit
        
#         for line in fileinput:
            
#             line = line.lower()
        
#         if '\n' in  fileinput:
            
#             print('yes')

# f.close()

# f = open('temp_file.txt' 'r')

# f.read()

# f.close()
 



In [None]:
url = "https://www.sec.gov/Archives/edgar/data/3662/0000950170-98-000413.txt"
response = urllib.request.urlopen(url)
raw = response.read().decode('utf8')

In [None]:
#regex = r"(?<=ITEM\s\d\.\sMANAGEMENT\'S\sDISCUSSION\sAND\sANALYSIS\sOF\sFINANCIAL\sCONDITION\sAND\sRESULTS\sOF\sOPERATION)(.*?)(?=ITEM\s\d\.\s)"

#test_str = ("ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATION \nkdfoks[dlsaf;\nsd\nlgsld\ns['a\n'[df\na'fd\nsaD\FSAD\nfF\n ITEM 8. dhfkjskaskdkskd")




In [None]:
#re.findall(regex, test_str)

In [None]:
l

In [None]:
for i in range(len(l)):
    l[i] = l[i].replace('\n', ' ')
    l[i] = l[i].replace('\\', '')
    l[i] = l[i].replace('\t', ' ')
    l[i] = l[i].replace("\'", '')

In [None]:
l

In [None]:
new_str = ' '.join(l)

In [None]:
new_str

In [None]:
#pattern = r'''(?<=ITEM\s\d\.\sMANAGEMENTS\sDISCUSSION\sAND\sANALYSIS\sOF\sFINANCIAL\sCONDITION\sAND\sRESULTS\sOF\sOPERATION)(.*?)(?=ITEM\s\d\.\s)'''

In [None]:
pattern = r'''(ITEM\s\d\.\sMANAGEMENTS\sDISCUSSION\sAND\sANALYSIS\sOF\sFINANCIAL\sCONDITION\sAND\sRESULTS)(.*?)(ITEM\s\d\.\s)'''

In [None]:
new_list = re.findall(pattern, new_str, re.M)

In [None]:
len(new_list)

In [None]:
final_text = new_list[0][1]

In [None]:
final_text = re.sub(r'\s+', r' ', final_text)

In [None]:
final_text

In [None]:
final_text = re.sub(r"(<.*>)",r" ",final_text).lower()

In [None]:
final_text

In [None]:
final_text = re.sub("[^a-zA-Z]"," ",final_text).lower()

final_text

final_text = re.sub(r'\s+', r' ', final_text)

In [None]:
final_text

In [None]:
list_of_stop_words = []

In [None]:
#1
stop_word_file_path = '/home/pramila/Desktop/CLASSWORK/ML-ASGN-BLACKCOFFER/blackcoffer assignment/STOP-WORDS/'

list_of_files = os.listdir(stop_word_file_path)

#2
for i in list_of_files:
    
    with open(os.path.join(stop_word_file_path, i), 'r') as words :
        
        content = words.read()
        
        with open('stop_words.txt', 'a+') as stop_words :
            
            stop_words.write(content + '\n')
            
            
#3
with open('stop_words.txt', 'r') as stop_words :
            
            list_of_stop_words = stop_words.readlines()

#list_of_stop_words

#some modification req in stop_words_list

for i in range(len(list_of_stop_words)) :
    
    list_of_stop_words[i] = list_of_stop_words[i].replace('\n', "").lower()


list_of_stop_words = list(set(list_of_stop_words))

list_of_stop_words.sort()

In [None]:
list_of_stop_words

In [None]:
if polarity_score < -0.5 :
        
        sentiment_score_categorization = 'Most Negative'
        
    elif (polarity_score < 0) and (polarity_score > -0.5):
        
        sentiment_score_categorization = 'Negative'
        
    elif polarity_score == 0:
        
        sentiment_score_categorization = 'Neutral'
        
    elif (polarity_score > 0) and (polarity_score < 0.5):
        
        sentiment_score_categorization = 'Positive'
        
    elif polarity_score > 0.5:
        
        sentiment_score_categorization = 'Very Positive'

In [None]:
# def remove_stop_words(content):
#     new_content = content
#     print(len(content))
    
#     print("\n")
#     for i in range(len(content)):
#         print(i)
        
#         if content[i] in li1:
        
#             del new_content[i]
            
#         elif content[i] in li2:
            
#             del new_content[i]
            
#         elif content[i] in li3:
            
#             del new_content[i]
            
#         elif content[i] in li4:
            
#             del new_content[i]
            
#     return new_content


    

In [None]:
stop_words_string = " ".join(list_of_stop_words)

In [None]:
stop_words_string

In [None]:
final_text = re.split(r' ', final_text)

In [None]:
final_text = set(final_text)

In [None]:
#import master dictionary

# master_dict_ini = pd.read_excel('LoughranMcDonald_MasterDictionary_2018.xlsx')

# md = master_dict_ini.copy()

In [None]:
md

In [7]:
md['Positive'].value_counts()

0       86132
2009      352
2012        1
2011        1
Name: Positive, dtype: int64

In [8]:
md['Negative'].value_counts()

0       84131
2009     2315
2014       26
2011       13
2012        1
Name: Negative, dtype: int64

In [9]:
md['Source'].value_counts()

12of12inf    81536
10K_2010      1898
2018          1265
10K_2008       871
10K_2014       462
10K_2012       339
10K_2016        90
10K_2009        14
10K_2011        11
Name: Source, dtype: int64

In [None]:

from nltk.tokenize import sent_tokenize

sentences = 'A Turning machine is a device that manipulates symbols on a strip of tape according to a table of rules. Despite its simplicity, a Turing machine can be adapted to simulate the logic of any computer algorithm, and is particularly useful in explaining the functions of a CPU.... inside a computer. The "Turing" machine was described by Alan Turing in 1936, who called it an "a(utomatic)-machine". The Turing machine is not intended as a practical computing technology, but rather as a hypothetical device representing a computing machine. Turing machines help computer scientists understand the limits of mechaniacl computation.'

number_of_sentences = sent_tokenize(sentences)

print(len(number_of_sentences))

In [10]:
md['Word'] = md['Word'].apply(lambda x: str(x).lower())

In [23]:
total_word_length = 0
total_words = md['Word'].shape[0]
all_lengths = []
for i in md['Word']:
    total_word_length += len(i)
    all_lengths.append(len(i))
    if (len(i) == 3 or len(i) == 24) :
        print(i)

abs
ace
act
add
ado
ads
adz
aft
age
ago
aha
aid
ail
aim
air
alb
ale
all
alp
amp
and
ani
ant
any
ape
apt
arc
are
ark
arm
art
ash
ask
asp
ass
ate
auk
awe
awl
awn
axe
aye
baa
bad
bag
bah
ban
bar
bat
bay
bed
bee
beg
bet
bey
bib
bid
big
bin
bio
bis
bit
biz
boa
bob
bod
bog
boo
bop
bow
box
boy
bra
bro
brr
bub
bud
bug
bum
bun
bur
bus
but
buy
bye
cab
cad
cam
can
cap
car
cat
caw
cay
chi
cob
cod
cog
con
coo
cop
cos
cot
cow
coy
cry
cub
cud
cue
cum
cup
cur
cut
dab
dad
dam
day
deb
def
den
dew
did
die
dig
dim
din
dip
dis
doc
doe
dog
doh
don
dos
dot
dry
dub
dud
due
dug
duh
dun
duo
dye
ear
eat
ebb
eds
eek
eel
egg
ego
eke
electroencephalographies
elf
elk
ell
elm
ems
emu
end
ens
eon
era
ere
erg
err
eta
eve
ewe
eye
fad
fag
fan
far
fas
fat
fax
fay
fed
fee
fen
few
fey
fez
fib
fie
fig
fin
fir
fit
fix
flu
fly
fob
foe
fog
fop
for
fox
fro
fry
fun
fur
gab
gad
gag
gal
gap
gar
gas
gay
gee
gel
gem
get
gig
gin
gnu
gob
god
goo
got
gum
gun
gut
guy
gym
gyp
had
hag
hah
ham
hap
has
hat
haw
hay
hem
hen
hep
her
hes
hew
hex

In [21]:
min(all_lengths)

2

In [15]:
total_word_length/total_words

8.786150359595773