In [None]:
from bs4 import BeautifulSoup

In [None]:
import pandas as pd

In [None]:
import re

In [None]:
# Raise pandas' display limits so the glossary tables shown below are not truncated.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
  pd.set_option(option_name, option_value)

In [None]:
from collections import Counter

# NCERT DATA

In [None]:
# Parse the NCERT glossary page. The context manager closes the file handle
# once the soup has been built instead of leaving it open for the kernel's lifetime.
with open("Glossary_ncert.html") as glossary_file:
  soup = BeautifulSoup(glossary_file, 'html.parser')

In [None]:
# Collect the stripped text of every span styled with class ft1 or ft4.
term_spans = soup.find_all('span', {"class": ["ft1", "ft4"]})
all_terms = [span.text.strip() for span in term_spans]

In [None]:
Counter(all_terms).most_common(10)

[('Depreciation', 2),
 ('Adam Smith', 1),
 ('Aggregate monetary resources', 1),
 ('Automatic stabilisers', 1),
 ('Autonomous change', 1),
 ('Autonomous expenditure multiplier', 1),
 ('Balance of payments', 1),
 ('Balanced budget', 1),
 ('Balanced budget multiplier', 1),
 ('Bank rate', 1)]

In [None]:
# Stripped text of every paragraph in the page.
all_texts = list(map(lambda paragraph: paragraph.text.strip(), soup.find_all('p')))

In [None]:
# Pair every term with every paragraph that contains it; the definition is the
# paragraph with all occurrences of the term removed.
# NOTE(review): this is a substring test, so a short term also pairs with the
# paragraphs of longer terms that contain it — confirm this is intended.
matched_pairs = [
    (term, text.replace(term, '').strip())
    for term in all_terms
    for text in all_texts
    if term in text
]
terms_li = [matched_term for matched_term, _ in matched_pairs]
texts_li = [definition for _, definition in matched_pairs]


In [None]:
# Assemble the NCERT frame and tag every row with its source and readability label.
ncert_data = pd.DataFrame({'terms':terms_li,'definitions':texts_li})
for column, value in (('source', 'ncert'), ('assigned_readibility', 1.0)):
  ncert_data[column] = value
ncert_data

In [None]:
ncert_data.to_csv('ncert_data.csv', index = False)

In [None]:
ncert_data.shape

(149, 4)

# Options, Futures, and Other Derivatives

In [None]:
# Parse the "Options, Futures, and Other Derivatives" glossary page; the
# context manager closes the file handle as soon as parsing is finished.
with open("Glossary Options, Futures, and Other Derivatives.html") as glossary_file:
  soup = BeautifulSoup(glossary_file, 'html.parser')

In [None]:
# Every cls_003 span is a candidate term; the hand-maintained list below holds
# the span texts that are excluded from the term list.
terms = [span.text.strip() for span in soup.find_all('span', {"class": ["cls_003"]})]
junk_terms_manual = ['Glossary of Terms', 'X%','X','¼', ':','%', 'K', '> K','ðK', 'þ K', 'Þ','N', 'ð','XÞ%','N', 'x', 't','t þ  t', 'i','<','n']
all_terms = list(filter(lambda candidate: candidate not in junk_terms_manual, terms))

In [None]:
Counter(all_terms).most_common(10)

[('ABS', 1),
 ('ABS CDO', 1),
 ('Accrual Swap', 1),
 ('Accrued Interest', 1),
 ('Adaptive Mesh Model', 1),
 ('Agency Costs', 1),
 ('American Option', 1),
 ('Amortizing Swap', 1),
 ('Analytic Result', 1),
 ('Arbitrage', 1)]

In [None]:
# Walk the cls_003 (term) and cls_004 (definition text) spans in document order
# and pair each kept term with the definition text that follows it.
terms_li = []
texts_li = []
text_str = ''  # definition text accumulated since the most recent kept term
flag = 0  # set to 1 once any cls_004 span has been seen; never reset afterwards
for i in soup.find_all('span', {"class" : ["cls_003", "cls_004"]}):
  # cls_003 spans whose text is not in all_terms (the filtered-out junk) match
  # neither branch and are skipped.
  if i['class']==['cls_003'] and  i.text.strip() in all_terms:
    # A new term starts: flush the text gathered so far. Nothing is flushed
    # while no cls_004 span has been seen yet.
    # NOTE(review): cls_004 text that precedes the first kept term would be
    # flushed here as a definition with no matching term — confirm the page
    # never has such leading text.
    if flag == 1:
      texts_li.append(text_str)
    terms_li.append(i.text.strip())
    text_str = ''
  elif i['class']==['cls_004']:
    # Definition fragments are concatenated, each followed by a single space.
    text_str = text_str + i.text.strip() + ' '
    flag = 1

# Flush the text accumulated after the last term.
texts_li.append(text_str)

In [None]:
ofod_data = pd.DataFrame({'terms':terms_li,'definitions':texts_li})
ofod_data

Unnamed: 0,terms,definitions
0,ABS,See Asset-Backed Security.
1,ABS CDO,Instrument where tranches are created from the...
2,Accrual Swap,An interest rate swap where interest on one si...
3,Accrued Interest,The interest earned on a bond since the last c...
4,Adaptive Mesh Model,A model developed by Figlewski and Gao that gr...
...,...,...
526,Zero-Coupon Bond,A bond that provides no coupons.
527,Zero-Coupon Interest Rate,The interest rate that would be earned on a bo...
528,Zero-Coupon Yield Curve,A plot of the zero-coupon interest rate agains...
529,Zero Curve,See Zero-Coupon Yield Curve.


In [None]:
# Tag every row with its source and readability label.
# NOTE(review): the label is spelled 'opod' while the frame and CSV are named
# 'ofod' — confirm which spelling downstream consumers expect before changing it.
for column, value in (('source', 'opod'), ('assigned_readibility', 0.0)):
  ofod_data[column] = value

In [None]:
ofod_data.to_csv('ofod_data.csv', index = False)

In [None]:
ofod_data.shape

(531, 4)

# Investopedia Data

In [None]:
# Load the crawled Investopedia file, keep only the term and definition columns,
# drop rows with a missing value in either, and rename 'only_defn' to 'definitions'.
inv_raw = pd.read_csv('investopedia_crwaled_only_defn.csv')
inv_data = (
    inv_raw[['terms', 'only_defn']]
    .dropna()
    .rename(columns={'only_defn':'definitions'})
)
# Coerce every definition to str and trim surrounding whitespace.
inv_data['definitions'] = [str(raw_definition).strip() for raw_definition in inv_data['definitions']]
inv_data

Unnamed: 0,terms,definitions
0,0x Protocol,The 0x protocol is an open protocol that enabl...
1,1%/10 Net 30,The 1%/10 net 30 calculation is a way of provi...
2,10-K,A 10-K is a comprehensive report filed annuall...
3,10-K Wrap,A 10-K wrap is a summary report of a company's...
4,10-Q SEC Form,SEC Form 10-Q is a comprehensive report of fin...
...,...,...
6255,Yield Variance,Yield variance is the difference between actua...
6256,Yo-Yo,Yo-yo is a slang term for a very volatile mark...
6257,York Antwerp Rules,The York Antwerp Rules are a set of maritime r...
6259,Yugen Kaisha (YK),Yugen kaisha (YKA) is a type of limited liabil...


In [None]:
# Tag every row with its source and readability label, then export.
for column, value in (('source', 'investopedia'), ('assigned_readibility', 1.0)):
  inv_data[column] = value
inv_data.to_csv('inv_data.csv', index = False)

# Financial Markets and Institutions

In [None]:
# Parse the "Financial Markets and Institutions" glossary page; the context
# manager closes the file handle as soon as parsing is finished.
with open("Glossary_Financial Markets and Institutions.html") as glossary_file:
  soup = BeautifulSoup(glossary_file, 'html.parser')

In [None]:
all_terms = [i.text.strip() for i in soup.find_all('span', {"class": ["cls_004"]})]

In [None]:
# Longest terms first. Ascending stable sort followed by a full reversal, so
# equal-length terms end up in the reverse of their original order.
all_terms = list(reversed(sorted(all_terms, key=len)))

In [None]:
Counter(all_terms).most_common(10)

In [None]:
all_terms

In [None]:
len(all_terms)

513

In [None]:
# Read the extracted plain text of the glossary; the context manager closes the
# file handle right after the read.
with open('Glossary_Financial Markets and Institutions_extracted_text.txt',mode='r') as text_file:
  text_content = text_file.read()
text_content

In [None]:
all_terms

In [None]:
'financial crisis' in text_content

False

In [None]:
# Drop the 'Act:' entry by filtering rather than list.remove(), so re-running
# this cell does not raise ValueError once the entry is already gone.
all_terms = [term for term in all_terms if term != 'Act:']
# Newlines are removed and parentheses are swapped for '@' in both the text and
# the terms, so the two stay comparable.
text_cleaned = text_content.replace('\n', '').replace('(','@').replace(')','@')
for term in all_terms:
  marked_term = term.replace('(','@').replace(')','@')
  # Put the '$$&$$' marker in front of every occurrence of the term, swallowing
  # any digits directly before it. re.escape makes the term match literally
  # (characters such as '.', '+', '?' or '[' are regex metacharacters), and the
  # callable replacement inserts the term verbatim without backslash processing.
  text_cleaned = re.sub(
      r'\d*' + re.escape(marked_term),
      lambda _match, literal=marked_term: '$$&$$' + literal,
      text_cleaned,
  )

In [None]:
text_cleaned

In [None]:
# Split the marked text into chunks and turn each "term: definition" chunk into
# one (term, definition) pair.
terms_li = []
texts_li = []
ext_str = ''  # marker-split fragments waiting to be prefixed to the next term
for chunk in text_cleaned.split('$$&$$'):
  if ':' in chunk:
    # Split on the FIRST colon only, so a definition that itself contains a
    # colon is kept whole instead of being cut at its second colon.
    term_part, _, definition_part = chunk.partition(':')
    if ext_str!='':
      print("after_append--",ext_str+term_part.strip())
    terms_li.append(ext_str + term_part.strip())
    texts_li.append(definition_part.strip())
    ext_str = ''
  else:
    # 'bubble' is within 'asset-price bubble' and both are terms, so an extra
    # $$&$$ marker was inserted inside the longer term. Keep the fragment and
    # prepend it to the next chunk's term to undo that split.
    ext_str = ext_str + chunk
    print(chunk)

In [None]:
fmi_data = pd.DataFrame({'terms':terms_li,'definitions':texts_li})
fmi_data

In [None]:
# Manual correction
# Manual corrections for two entries, addressed by row position.
# A single .iloc[row, column] assignment writes into fmi_data itself; the
# chained form .iloc[row][column] = ... assigns into a temporary Series and is
# silently lost under copy-on-write pandas.
fmi_data.iloc[463, fmi_data.columns.get_loc('definitions')] = 'See Separate Trading of Registered Interest and Principal Securities (STRIPS).'
fmi_data.iloc[464, fmi_data.columns.get_loc('terms')] = 'zero-lower-bound problem'

In [None]:
set(fmi_data['terms']) - set([i.replace('(','@').replace(')','@').replace(':','') for i in all_terms])

{'Financial Institutions Reform, Recovery, and Enforcement Act',
 'National Association of Securities Dealers Automated Quotation System @NASDAQ@',
 'Separate Trading of Registered Interest and Principal Securities @STRIPS@'}

In [None]:
set([i.replace('(','@').replace(')','@').replace(':','') for i in all_terms]) - set(fmi_data['terms'])

In [None]:
fmi_data[fmi_data['terms'].str.contains('financial crisis')]

Unnamed: 0,terms,definitions


In [None]:
fmi_data[fmi_data['definitions'].str.contains('financial crisis')]

Unnamed: 0,terms,definitions


In [None]:
# Tag every row with its source and readability label, then export.
for column, value in (('source', 'fmi'), ('assigned_readibility', 0.0)):
  fmi_data[column] = value
fmi_data.to_csv('fmi_data.csv', index = False)

# Samunord'19

In [None]:
# Parse the Samuelson/Nordhaus glossary page; the context manager closes the
# file handle as soon as parsing is finished.
with open("Glossary_samunord19.html") as glossary_file:
  soup = BeautifulSoup(glossary_file, 'html.parser')

In [None]:
soup

In [None]:
# Candidate terms are the cls_006 spans; keep only the non-empty ones whose
# first character is upper-case.
terms = [span.text.strip() for span in soup.find_all('span', {"class": ["cls_006"]})]
all_terms = [candidate.strip() for candidate in terms if candidate[:1].isupper()]
all_terms

In [None]:
# Longest terms first. Ascending stable sort followed by a full reversal, so
# equal-length terms end up in the reverse of their original order.
all_terms = list(reversed(sorted(all_terms, key=len)))

In [None]:
# Read the extracted plain text of the glossary; the context manager closes the
# file handle right after the read.
with open('Glossary_samunord19_extracted_text.txt',mode='r') as text_file:
  text_content = text_file.read()
junk_text = '''54 1 W\n nitions appear as separate entries in the glossary. For a more detailed discussion of particular terms, \nthe text will provide a useful starting point. More complete discussions are contained in Douglas Greenwald, ed., The McGraw-Hill Encyclopedia of Economics (McGraw-Hill, New York, 1994), and David W. Pearce, \nThe MIT Dictionary of Modern Economics,\n 4th ed. (Macmillan, London, 1992). For a comprehensive encyclopedia, see Steven N. Durlauf and Lawrence E. Blume, The New Palgrave Dictionary of Economics,\n 8 vols. (Macmillan, London, 2008). A reasonably accurate online dictionary by \nThe Economist is at www.economist.\ncom/research/economics/.\n '''

# Remove the introductory blurb, newlines and the page heading, and swap
# parentheses for '@' in both the text and the terms so they stay comparable.
text_cleaned = text_content.replace(junk_text,'').replace('\n', '').replace('(','@').replace(')','@').replace('GLOSSARY OF TERMS', '')
for term in all_terms:
  marked_term = term.replace('(','@').replace(')','@')
  # Put the '$$&$$' marker in front of every occurrence of the term. A plain
  # str.replace matches the term literally; using the raw term as a regex
  # pattern let characters such as '.' act as wildcards and could raise on
  # unbalanced metacharacters.
  text_cleaned = text_cleaned.replace(marked_term, '$$&$$' + marked_term)
text_cleaned  

'6  A     Ability-to-pay principle  @of taxa-tion@.   The principle that one’s tax burden should depend upon the ability to pay as measured by income or wealth. This principle does not specify  how much  more those who are better off should pay.     $$&$$Absolute advantage  @in international trade@.   The ability of Coun-try A to produce a commodity  ciently @i.e., with greater output per unit of input@ than Country B. Possession of such an absolute advantage does not nec-essarily mean that A can export this commodity to B successfully. Country B may still have the com-parative advantage.     $$&$$Actual, cyclical, and structural bud-get.    The  actual budget  cit or surplus is the amount recorded in a given year. This is composed of the  structural budget,  which calcu-lates what government revenues,  cits would be if the economy were operating at potential output, and the  cycli-cal budget,  which measures the effect of the business cycle on the budget.     $$&$$Adaptive expectation

In [None]:
# For each marker-delimited chunk, take the first remaining term (list order,
# i.e. longest first) whose '@'-form occurs in the chunk. The chunk minus that
# term becomes the definition, and the term is consumed so it cannot match a
# later chunk.
# NOTE(review): all_terms is mutated here, so this cell needs all_terms rebuilt
# before it can be re-run.
terms_li = []
texts_li = []
ext_str = ''
for chunk in text_cleaned.split('$$&$$'):
  matched_term = next(
      (candidate for candidate in all_terms
       if candidate.replace('(','@').replace(')','@') in chunk),
      None,
  )
  if matched_term is None:
    continue
  cln_term = matched_term.replace('(','@').replace(')','@')
  # The stored term drops the '@' placeholders and any '.' characters.
  terms_li.append(cln_term.replace('@','').replace('.',''))
  texts_li.append(chunk.replace(cln_term,''))
  all_terms.remove(matched_term)


In [None]:
sam_data = pd.DataFrame({'terms':terms_li,'definitions':texts_li})
sam_data

In [None]:
# Tag every row with its source and readability label, then export.
for column, value in (('source', 'sam'), ('assigned_readibility', 1.0)):
  sam_data[column] = value
sam_data.to_csv('sam_data.csv', index = False)

# Zvi

In [None]:
# Parse the Bodie "Investments" glossary page; the context manager closes the
# file handle as soon as parsing is finished.
with open('Glossary_Investments by Zvi Bodie.html') as glossary_file:
  soup = BeautifulSoup(glossary_file, 'html.parser')
# Stripped text of every cls_004 span.
all_terms = [i.text.strip() for i in soup.find_all('span', {"class": ["cls_004"]})]

In [None]:
# Longest terms first. Ascending stable sort followed by a full reversal, so
# equal-length terms end up in the reverse of their original order.
all_terms = list(reversed(sorted(all_terms, key=len)))

In [None]:
# Read the extracted plain text of the glossary; the context manager closes the
# file handle right after the read.
with open('Glossary_Investments by Zvi Bodie_extracted_text.txt',mode='r') as text_file:
  text_content = text_file.read()

In [None]:
all_terms

In [None]:
Counter(all_terms).most_common(10)

[('arbitrage pricing theory (APT)', 1),
 ('abnormal return', 1),
 ('ask price', 1),
 ('accounting earnings', 1),
 ('asset allocation', 1),
 ('acid test ratio', 1),
 ('at the money', 1),
 ('active management', 1),
 ('auction market', 1),
 ('average collection period', 1)]

In [None]:
len(all_terms)

543

In [None]:
# Join all cls_005 span texts into one string, then split the terms by whether
# they occur anywhere inside that string.
desp_text = ' '.join(span.text.strip() for span in soup.find_all('span', {"class": ["cls_005"]}))
terms_in_desp, terms_not_in_desp = [], []
for term in all_terms:
  (terms_in_desp if term in desp_text else terms_not_in_desp).append(term)
len(terms_in_desp), len(terms_not_in_desp)

(121, 422)

In [None]:
terms_in_desp

In [None]:
junk_text = ''''''

# Swap parentheses for '@' and strip the page furniture (heading, timestamp,
# file name, page-number prefix) from the extracted text.
text_cleaned = text_content.replace(junk_text,'').replace('(','@').replace(')','@').replace('Glossary', '').replace('04/19/17  05:45 PM', '').replace('bod77178_glo_G1-G14.indd', '').replace('G-','')
for term in all_terms:
  marked_term = term.replace('(','@').replace(')','@')
  # A term that sits alone on its own line gets the '$$&$$' marker in place of
  # the surrounding newlines. A plain str.replace matches the term literally;
  # using the raw term as a regex pattern let characters such as '.' act as
  # wildcards and could raise on unbalanced metacharacters.
  text_cleaned = text_cleaned.replace('\n' + marked_term + '\n', '$$&$$' + marked_term)
text_cleaned = text_cleaned.replace('\n', '')
text_cleaned

' 1 $$&$$arbitrage pricing theory @APT@ An asset pricing theory that is derived from a factor model, using diversification and arbitrage arguments. The theory describes the relationship between expected return and factor exposure that follows from the absence of risk-free arbitrage opportunities.$$&$$ask price The price at which a dealer will sell a security.$$&$$asset allocation Allocating a portfolio across broad asset classes such as stocks versus bonds.$$&$$at the money The option™s exercise price and the price of the underlying asset are equal.$$&$$auction market A market where all traders in an asset meet @either physically or electronically@ at one place to buy and sell.$$&$$average collection period The ratio of accounts receivable to daily sales. Also called days™ receivables.B$$&$$backfill bias Bias in the average returns of a sample of funds induced by including past returns on funds that entered the sample only if they happened to be successful.$$&$$balance sheet An account

In [None]:
# For each marker-delimited chunk, find the first remaining term (longest
# first) whose '@'-form occurs in the chunk and whose remainder looks like a
# definition (starts with an upper-case letter and is not just 'See').
terms_li = []
texts_li = []
for chunk in text_cleaned.split('$$&$$'):
  for term in all_terms:
    cln_term = term.replace('(','@').replace(')','@')
    if cln_term not in chunk:
      continue
    jj = chunk.replace(cln_term,'').strip()
    # An empty remainder (the chunk held only the term) is skipped explicitly;
    # indexing jj[0] on it would raise IndexError.
    if not jj or not jj[0].isupper() or jj == 'See':
      continue
    # The stored term drops the '@' placeholders and any '.' characters.
    terms_li.append(cln_term.replace('@','').replace('.',''))
    if jj[-1].isupper(): #Removing single letters present at the start of index
      texts_li.append(jj[:-1])
      # NOTE(review): the term is NOT removed from all_terms on this branch,
      # unlike the branch below — looks like an indentation slip; confirm
      # before changing, since it affects which later chunks can match.
    else:
      texts_li.append(jj)
      all_terms.remove(term)
    break


In [None]:
# Assemble the frame, tag every row with its source and readability label, then export.
zvi_data = pd.DataFrame({'terms':terms_li,'definitions':texts_li})
for column, value in (('source', 'zvi'), ('assigned_readibility', 0.0)):
  zvi_data[column] = value
zvi_data.to_csv('zvi_data.csv', index = False)

In [None]:
zvi_data

Unnamed: 0,terms,definitions,source,assigned_readibility
0,arbitrage pricing theory APT,An asset pricing theory that is derived from a...,zvi,0.0
1,ask price,The price at which a dealer will sell a security.,zvi,0.0
2,asset allocation,Allocating a portfolio across broad asset clas...,zvi,0.0
3,at the money,The option™s exercise price and the price of t...,zvi,0.0
4,auction market,A market where all traders in an asset meet @e...,zvi,0.0
...,...,...,...,...
520,yield curve,A graph of yield to maturity as a function of ...,zvi,0.0
521,yield to maturity YTM,A measure of the average rate of return that w...,zvi,0.0
522,zero-beta portfolio,The minimum-variance portfolio uncorrelated wi...,zvi,0.0
523,zero-coupon bond,A bond paying no coupons that sells at a disco...,zvi,0.0


In [None]:
zvi_data.shape

(525, 4)

# Principles of Corporate Finance
## Note: only for this glossary, the ssg text extraction from the PDF has been used.

In [None]:
# Parse the "Principles of Corporate Finance" glossary page; the context
# manager closes the file handle as soon as parsing is finished.
with open('Glossary_Principles of Corporate Finance.html') as glossary_file:
  soup = BeautifulSoup(glossary_file, 'html.parser')
# Lower-cased, alphabetically sorted text of every cls_005 span.
all_terms =sorted([i.text.strip().lower() for i in soup.find_all('span', {"class": ["cls_005"]})])
# Drop the 'risk)' entry from the term list.
all_terms.remove('risk)')

In [None]:
all_terms

In [None]:
len(all_terms)

177

In [None]:
# Read the extracted plain text of the glossary; the context manager closes the
# file handle right after the read.
with open('Glossary_Principles_of_Corporate_Finance_extracted_text_ssg.txt',mode='r') as text_file:
  text_content = text_file.read()
junk_text = '''Glossary'''
# Everything from this sentence onwards is cut off; str.index raises ValueError
# if the sentence is missing from the file.
end = text_content.index('Internal rate of return on a bond')

# Strip the page furniture (heading, timestamp, file name, page-number prefix,
# printer note) from the kept part of the text.
text_cleaned = text_content[:end].replace(junk_text,'').replace('Glossary', '').replace('10/04/18  09:23 PM', '').replace('bre13901_glo_G1-G6', '').replace('G-','').replace('Final PDF to printer', '')
# Remove lines made up only of capital letters, join the remaining lines,
# lower-case the result, and re-append the cut-off sentence in its original case.
text_cleaned = re.sub('\n[A-Z]+\n','', text_cleaned).replace('\n','').lower()+' Internal rate of return on a bond'
text_cleaned

'adjusted present value (apv)  net present value of an asset if financed solely by equity plus the present value of any financing side effects.agency costs  costs that arise when an agent (e.g., a manager) does not act solely in the interests of the principal (e.g., the shareholder).annual percentage rate (apr)  the interest rate per period (e.g., per month) multiplied by the number of periods in a year.annuity  investment that produces a level stream of cash flows for a limited number of periods.annuity due  annuity whose payments occur at the start of each period.annuity factor  present value of $1 paid for each of t periods.apr  annual percentage rate.apt  arbitrage pricing theory.apv  adjusted present value.arbitrage  purchase of one security and simultaneous sale of another to give a risk-free profit. often used loosely to describe the taking of offsetting positions in related securities, e.g., at the time of a takeover bid.arbitrage pricing theory (apt)  model in which expected r

In [None]:
# Slice the cleaned text into definitions: each definition is the text between
# one term and the next term in the alphabetically sorted list. The last term
# has no successor and is therefore not emitted by this loop.
terms_li = []
texts_li = []

# Work on a cursor copy so text_cleaned stays intact and the cell can be re-run.
remaining_text = text_cleaned
for current_term, next_term in zip(all_terms, all_terms[1:]):
  # str.index raises ValueError when a term is missing from the text.
  start_ind = remaining_text.index(current_term)
  definition_start = start_ind + len(current_term)
  # Look for the next term only after the current headword.
  end_ind = remaining_text.index(next_term, definition_start)
  # Slice off just the leading headword. A blanket .replace(term, '') also
  # deleted the term wherever it appeared inside its own definition.
  des_text = remaining_text[definition_start:end_ind]
  terms_li.append(current_term)
  texts_li.append(des_text)
  remaining_text = remaining_text[end_ind:]
  print(current_term)

In [None]:
len(terms_li), len(texts_li)

(176, 176)

In [None]:
terms_li[-1]

'working capital'

In [None]:
# The slicing loop stops one term short, so the final entry is added by hand
# before the frame is built, labelled and exported.
terms_li += ['yield to maturity']
texts_li += [' internal rate of return on a bond']
prin_data = pd.DataFrame({'terms':terms_li,'definitions':texts_li})
for column, value in (('source', 'prin'), ('assigned_readibility', 0.0)):
  prin_data[column] = value
prin_data.to_csv('prin_data.csv', index = False)

# All data

In [None]:
import os
os.listdir()

['.config',
 'Glossary_Principles of Corporate Finance.html',
 'prin_data.csv',
 '.ipynb_checkpoints',
 'ncert_data.csv',
 'zvi_data.csv',
 'Glossary_Principles_of_Corporate_Finance_extracted_text_ssg.txt',
 'sam_data.csv',
 'ofod_data.csv',
 'inv_data.csv',
 'fmi_data.csv',
 'sample_data']

In [None]:
# Combine the per-source CSVs written above into one frame.
# The files are listed explicitly (in the order the original run read them)
# rather than discovered via os.listdir(): a directory scan also picks up any
# other CSV in the working directory, e.g. the raw Investopedia crawl or the
# data_with_scores.csv this notebook writes, when the notebook is re-run.
source_csv_files = [
    'prin_data.csv',
    'ncert_data.csv',
    'zvi_data.csv',
    'sam_data.csv',
    'ofod_data.csv',
    'inv_data.csv',
    'fmi_data.csv',
]
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. Each
# file keeps its own row index, exactly as repeated append() calls did.
all_data = pd.concat([pd.read_csv(csv_path) for csv_path in source_csv_files])

In [None]:
all_data

Unnamed: 0,terms,definitions,source,assigned_readibility
0,adjusted present value (apv),net present value of an asset if financed so...,prin,0.0
1,agency costs,"costs that arise when an agent (e.g., a mana...",prin,0.0
2,annual percentage rate (apr),"the interest rate per period (e.g., per mont...",prin,0.0
3,annuity,investment that produces a level stream of c...,prin,0.0
4,annuity due,annuity whose payments occur at the start of...,prin,0.0
...,...,...,...,...
460,yield curve,A plot of the interest rates for par-ticular t...,fmi,0.0
461,yield to maturity,The interest rate that equates the present val...,fmi,0.0
462,zero-coupon bond,See discount bond.,fmi,0.0
463,zero-coupon securities,See Separate Trading of Registered Interest an...,fmi,0.0


In [None]:
pd.isnull(all_data['definitions']).value_counts()

False    8408
Name: definitions, dtype: int64

In [None]:
all_data[all_data['definitions']==''].value_counts()

Series([], dtype: int64)

# Extracting Readability

In [None]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.2-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 3.8 MB/s 
[?25hCollecting pyphen
  Downloading pyphen-0.11.0-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 13.9 MB/s 
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.11.0 textstat-0.7.2


In [None]:
import textstat

In [None]:
def extract_scores(test_data):
  """Compute 16 textstat readability metrics for one piece of text.

  Args:
    test_data: the text to score.

  Returns:
    A list of 16 values in the fixed order given by ``metric_names`` below.
    If any metric raises, the whole row falls back to zeros with the marker
    string 'NOT USE' in the ``text_standard`` slot (position 9), so callers
    can filter such rows out.
  """
  # Order matters: downstream code assigns column names positionally.
  metric_names = (
      'flesch_reading_ease',
      'flesch_kincaid_grade',
      'smog_index',
      'coleman_liau_index',
      'automated_readability_index',
      'dale_chall_readability_score',
      'difficult_words',
      'linsear_write_formula',
      'gunning_fog',
      'text_standard',
      'fernandez_huerta',
      'szigriszt_pazos',
      'gutierrez_polini',
      'crawford',
      'gulpease_index',
      'osman',
  )
  try:
    scores = [getattr(textstat, metric_name)(test_data) for metric_name in metric_names]
  except Exception:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt and
    # SystemExit still stop a long scoring run instead of being swallowed.
    scores = [0.0]*9 + ['NOT USE'] + [0.0]*6
  return scores

In [None]:
test_data = (
    "Playing games has always been thought to be important to "
    "the development of well-balanced and creative children; "
    "however, what part, if any, they should play in the lives "
    "of adults has never been researched that deeply. I believe "
    "that playing games is every bit as important for adults "
    "as for children. Not only is taking time out to play games "
    "with our children and other adults valuable to building "
    "interpersonal relationships but is also a wonderful way "
    "to release built up tension."
)
extract_scores(test_data)

[52.23,
 12.8,
 12.5,
 11.03,
 15.5,
 7.3,
 9,
 16.333333333333332,
 12.38,
 '12th and 13th grade',
 89.0,
 87.57,
 38.79,
 3.1,
 50.707317073170735,
 48.01]

In [None]:
scores = all_data['definitions'].apply(lambda x: extract_scores(str(x)))

In [None]:
len(scores)

8408

In [None]:
# One row per definition, one column per metric; the names follow the order in
# which extract_scores returns the values.
score_columns = [
    'flesch_reading_ease',
    'flesch_kincaid_grade',
    'smog_index',
    'coleman_liau_index',
    'automated_readability_index',
    'dale_chall_readability_score',
    'difficult_words',
    'linsear_write_formula',
    'gunning_fog',
    'text_standard',
    'fernandez_huerta',
    'szigriszt_pazos',
    'gutierrez_polini',
    'crawford',
    'gulpease_index',
    'osman',
]
scores_df = pd.DataFrame(scores.tolist())
scores_df.columns = score_columns
scores_df

Unnamed: 0,flesch_reading_ease,flesch_kincaid_grade,smog_index,coleman_liau_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,text_standard,fernandez_huerta,szigriszt_pazos,gutierrez_polini,crawford,gulpease_index,osman
0,51.18,11.1,0.0,9.40,10.5,10.16,6.0,12.0,12.00,11th and 12th grade,90.4,87.16,43.58,3.4,57.500000,63.21
1,57.61,10.7,0.0,7.78,12.3,10.47,4.0,15.0,14.25,10th and 11th grade,94.4,91.39,45.61,3.0,54.454545,66.35
2,54.22,9.9,0.0,6.67,8.4,10.98,4.0,12.5,16.21,9th and 10th grade,93.5,87.22,48.74,3.5,61.352941,75.43
3,47.79,10.3,0.0,9.56,8.7,9.64,4.0,10.5,14.00,9th and 10th grade,89.6,83.85,44.04,3.7,61.000000,64.66
4,44.41,9.6,0.0,8.50,6.2,10.45,4.0,6.0,12.00,9th and 10th grade,88.6,84.70,46.11,3.4,71.000000,74.57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8403,55.24,9.5,0.0,9.97,9.6,13.31,3.0,12.0,13.90,9th and 10th grade,94.5,89.60,43.53,3.3,59.000000,65.16
8404,51.18,11.1,0.0,12.59,13.1,9.37,5.0,12.0,12.00,11th and 12th grade,90.4,90.27,38.25,3.2,52.000000,48.70
8405,93.81,0.9,0.0,3.43,5.2,14.31,1.0,0.5,1.20,0th and 1st grade,125.8,120.77,45.65,-3.8,135.666667,84.90
8406,10.56,14.3,0.0,19.52,16.1,13.61,5.0,9.0,20.00,19th and 20th grade,64.6,59.78,27.68,5.3,50.000000,14.12


In [None]:
scores_df[scores_df['text_standard']!='NOT USE'].shape

(8401, 16)

In [None]:
all_data.shape

(8408, 4)

In [None]:
scores_df.shape

(8408, 16)

In [None]:
# Put the glossary rows and their scores side by side. Both frames get a fresh
# 0..n-1 index so they line up by position; each reset adds an 'index' column
# holding the old index.
all_data_scores_df = pd.concat(
    [frame.reset_index() for frame in (all_data, scores_df)],
    axis = 1,
)
all_data_scores_df

Unnamed: 0,index,terms,definitions,source,assigned_readibility,index.1,flesch_reading_ease,flesch_kincaid_grade,smog_index,coleman_liau_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,text_standard,fernandez_huerta,szigriszt_pazos,gutierrez_polini,crawford,gulpease_index,osman
0,0,adjusted present value (apv),net present value of an asset if financed so...,prin,0.0,0,51.18,11.1,0.0,9.40,10.5,10.16,6.0,12.0,12.00,11th and 12th grade,90.4,87.16,43.58,3.4,57.500000,63.21
1,1,agency costs,"costs that arise when an agent (e.g., a mana...",prin,0.0,1,57.61,10.7,0.0,7.78,12.3,10.47,4.0,15.0,14.25,10th and 11th grade,94.4,91.39,45.61,3.0,54.454545,66.35
2,2,annual percentage rate (apr),"the interest rate per period (e.g., per mont...",prin,0.0,2,54.22,9.9,0.0,6.67,8.4,10.98,4.0,12.5,16.21,9th and 10th grade,93.5,87.22,48.74,3.5,61.352941,75.43
3,3,annuity,investment that produces a level stream of c...,prin,0.0,3,47.79,10.3,0.0,9.56,8.7,9.64,4.0,10.5,14.00,9th and 10th grade,89.6,83.85,44.04,3.7,61.000000,64.66
4,4,annuity due,annuity whose payments occur at the start of...,prin,0.0,4,44.41,9.6,0.0,8.50,6.2,10.45,4.0,6.0,12.00,9th and 10th grade,88.6,84.70,46.11,3.4,71.000000,74.57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8403,460,yield curve,A plot of the interest rates for par-ticular t...,fmi,0.0,8403,55.24,9.5,0.0,9.97,9.6,13.31,3.0,12.0,13.90,9th and 10th grade,94.5,89.60,43.53,3.3,59.000000,65.16
8404,461,yield to maturity,The interest rate that equates the present val...,fmi,0.0,8404,51.18,11.1,0.0,12.59,13.1,9.37,5.0,12.0,12.00,11th and 12th grade,90.4,90.27,38.25,3.2,52.000000,48.70
8405,462,zero-coupon bond,See discount bond.,fmi,0.0,8405,93.81,0.9,0.0,3.43,5.2,14.31,1.0,0.5,1.20,0th and 1st grade,125.8,120.77,45.65,-3.8,135.666667,84.90
8406,463,zero-coupon securities,See Separate Trading of Registered Interest an...,fmi,0.0,8406,10.56,14.3,0.0,19.52,16.1,13.61,5.0,9.0,20.00,19th and 20th grade,64.6,59.78,27.68,5.3,50.000000,14.12


In [None]:
sorted(all_data_scores_df.columns)

['assigned_readibility',
 'automated_readability_index',
 'coleman_liau_index',
 'crawford',
 'dale_chall_readability_score',
 'definitions',
 'difficult_words',
 'fernandez_huerta',
 'flesch_kincaid_grade',
 'flesch_reading_ease',
 'gulpease_index',
 'gunning_fog',
 'gutierrez_polini',
 'index',
 'index',
 'linsear_write_formula',
 'osman',
 'smog_index',
 'source',
 'szigriszt_pazos',
 'terms',
 'text_standard']

In [None]:
# Keep only rows whose scoring succeeded, drop the 'index' columns (the label
# appears twice and both are removed), and export.
scored_ok = all_data_scores_df['text_standard'] != 'NOT USE'
all_data_scores_df[scored_ok].drop(columns='index').to_csv('data_with_scores.csv', index = False)