In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
%cd drive/MyDrive/FNS22/Financial-Narrative-Summarization/

/content/drive/MyDrive/FNS22/Financial-Narrative-Summarization


In [None]:
!pip install stanza

In [2]:
import numpy as np
import pandas as pd

import stanza
import spacy
import nltk
import re
import json
import os
from tqdm import tqdm
from bs4 import BeautifulSoup

from tqdm.notebook import tqdm
tqdm.pandas()

In [3]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from stop_words import get_stop_words
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.2.2.json: 140kB [00:00, 23.4MB/s]                    
2021-12-12 17:10:55 INFO: Downloading default packages for language: en (English)...
2021-12-12 17:10:57 INFO: File exists: C:\Users\rishi\stanza_resources\en\default.zip.
2021-12-12 17:11:02 INFO: Finished downloading models and saved to C:\Users\rishi\stanza_resources.


In [16]:
spacy_pipline = dict()
spacy_pipline['en'] = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [17]:
stemmer = dict()
stemmer['en'] = SnowballStemmer("english")

In [54]:
def preprocess_text(text, lang='en'):
    #remove html tags
    # text = BeautifulSoup(text, "html.parser").get_text()

    #remove links
    text = re.sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "", text)

    #remove text between []
    text = re.sub('\[[^]]*\]', '', text)

    #fix contractions
    def decontracted(phrase):

        phrase = re.sub("\u2019", '\'', phrase)
        phrase = re.sub("\u2018", '\'', phrase)
        phrase = re.sub("\u201C", '\"', phrase)
        phrase = re.sub("\u201D", '\"', phrase)

        # specific
        phrase = re.sub(r"won\'t", "will not", phrase)
        phrase = re.sub(r"can\'t", "can not", phrase)

        # general
        phrase = re.sub(r"n\'t", " not", phrase)
        phrase = re.sub(r"\'re", " are", phrase)
        phrase = re.sub(r"\'s", " is", phrase)
        phrase = re.sub(r"\'d", " would", phrase)
        phrase = re.sub(r"\'ll", " will", phrase)
        phrase = re.sub(r"\'t", " not", phrase)
        phrase = re.sub(r"\'ve", " have", phrase)
        phrase = re.sub(r"\'m", " am", phrase)

        return phrase

    text = decontracted(text)
    
    #Remove numbers
    text = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "", text)

    #lowercase
    text = text.lower()

    # tokenize the string
    # remove punctuation and special characters   
    text = re.sub(r'[_]', ' ', text)
    text = re.sub(r'[\r|\n|\r\n]+', ' ', text) 
    tokenizer = RegexpTokenizer(r'\w+')
    
    text = tokenizer.tokenize(text)
    
    #Remove stopwords
    stopwords = set(get_stop_words(lang))
    text = [i for i in text if i not in stopwords]
    
    # #Stemming
    # text = [stemmer[lang].stem(word) for word in text]

    #Lemmatize
    text = [word for word in text if word is not None]
    doc = spacy_pipline[lang](' '.join(text))

    if lang == 'ar' or lang == 'tr':
        if (len(doc.sentences) != 0):
            text = [token.lemma for token in doc.sentences[0].words]
    else:
        text = [token.lemma_ for token in doc]

    return text

# Extract Report and Summary

In [3]:
report_dir = './fns2020_dataset/training/annual_reports/'
summaries_dir = "./fns2020_dataset/training/gold_summaries/"

In [4]:
files = os.listdir(report_dir)
data = {'id':[], 'report':[]}

for f in tqdm(files):
    text_file = open(os.path.join(report_dir, f), 'r', encoding = 'utf-8')
    content = text_file.read()

    # pre-process the data
    # data = dict()
    # data['text'] = preprocess_text(content)

    # with open(os.path.join(dest, str(f[0:-3]) + "json"), 'w', encoding='utf-8') as outfile:
    #     json.dump(data, outfile)

    data['id'].append(f[0:-4]) 
    data['report'].append(content) 

  0%|          | 0/3000 [00:00<?, ?it/s]

In [5]:
df_combined = pd.DataFrame.from_dict(data)
df_combined.set_index('id', inplace=True)
df_combined.head()

Unnamed: 0_level_0,report
id,Unnamed: 1_level_1
10023,Registered office\nC/- Emcee \n44 Southampton...
10024,Mediterranean Oil & Gas Plc / Annual Report &...
10025,Mediterranean Oil & Gas Plc / Annual Report &...
10050,ANNUAL REPORT AND \nACCOUNTS 2013 Download th...
10051,MEGGITT PLC ANNUAL REPORT AND ACCOUNTS | 20...


In [6]:
summ = os.listdir(summaries_dir)

In [7]:
for f in tqdm(summ):
  id = f[0:-4].split('_')[0]
  num = f[0:-4].split('_')[1]

  text_file = open(os.path.join(summaries_dir, f), 'r', encoding = 'utf-8')
  content = text_file.read()

  df_combined.loc[id, 'summary_' + str(num)] = content

  0%|          | 0/9873 [00:00<?, ?it/s]

In [10]:
df_combined.head()

Unnamed: 0_level_0,report,summary_1,summary_2,summary_3,summary_4,summary_5,summary_6,summary_7
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10023,Registered office\nC/- Emcee \n44 Southampton...,08\nMediterranean Oil & Gas Plc Annual Report...,01\nMediterranean Oil & Gas Plc Annual Report...,06\nMediterranean Oil & Gas Plc Annual Report...,02\nMediterranean Oil & Gas Plc Annual Report...,,,
10024,Mediterranean Oil & Gas Plc / Annual Report &...,Mediterranean Oil & Gas Plc / Annual Report &...,Business Review Corporate Governance Financia...,Mediterranean Oil & Gas Plc / Annual Report &...,Mediterranean Oil & Gas Plc / Annual Report &...,,,
10025,Mediterranean Oil & Gas Plc / Annual Report &...,Mediterranean Oil & Gas Plc / Annual Report &...,Mediterranean Oil & Gas Plc / Annual Report &...,Mediterranean Oil & Gas Plc / Annual Report &...,Mediterranean Oil & Gas Plc / Annual Report &...,,,
10050,ANNUAL REPORT AND \nACCOUNTS 2013 Download th...,4\nMEGGITT PLC REPORT AND ACCOUNTS 2013\nChie...,Financial highlights 2\nMeggitt’s 2013 result...,3 Chairman’s statement\nOxford University’s S...,,,,
10051,MEGGITT PLC ANNUAL REPORT AND ACCOUNTS | 20...,“\n”\n6 MEGGITT PLC REPORT AND ACCOU...,Financial highlights\nMeggitt’s 2014 results ...,Chairman’s statement\nGrowing the Group\nIt’s...,,,,


In [8]:
# df_combined.drop(['summary_2', 'summary_3', 'summary_4', 'summary_5', 'summary_6', 'summary_7'], axis=1)

In [56]:
def common_unigram(text1, text2):
  if text1 is np.nan or text2 is np.nan:
      return 0

  return len(set(text1) & set(text2))

In [None]:
s1 = preprocess_text(df_combined.iloc[0]['summary_1'])
s2 = preprocess_text(df_combined.iloc[0]['summary_2'])

print('summary_1', s1)
print('\n\nsummary_2', s2)
print(common_unigram(s1, s2))

In [78]:
best_summary = dict()

for index, row in tqdm(df_combined.iterrows(), total=len(df_combined)):
    overlap_matrix = []
    
    best_overlap = -np.inf
    best_index = 1

    for i in range(1, 8):
        if (pd.isnull(row["summary_" + str(i)])):
            break

        summary_a = preprocess_text(row["summary_" + str(i)])

        if (len(summary_a) == 0):
            break

        temp = []
        for j in range(1, 8):
            if (pd.isnull(row["summary_" + str(j)])):
                break
            
            summary_b = preprocess_text(row["summary_" + str(j)])

            if (len(summary_b) == 0):
                break
            
            overlap = common_unigram(summary_a, summary_b)
            temp.append(overlap)

        score = sum(temp) / len(summary_a)

        if (best_overlap < score):
            best_overlap = score
            best_index = i
        
    best_summary[index] = (best_index, best_overlap)

  0%|          | 0/3000 [00:00<?, ?it/s]

In [79]:
best_summary

{'10023': (2, 1.7639484978540771),
 '10024': (2, 1.7085201793721974),
 '10025': (2, 1.7076923076923076),
 '10050': (2, 1.396039603960396),
 '10051': (2, 1.5204081632653061),
 '10063': (2, 1.6265060240963856),
 '10064': (2, 1.3426966292134832),
 '10065': (2, 1.3711340206185567),
 '10066': (2, 1.5726872246696035),
 '10067': (2, 1.3483870967741935),
 '10068': (2, 1.1923076923076923),
 '10069': (2, 1.1398963730569949),
 '1007': (2, 1.3944954128440368),
 '10070': (4, 1.063660477453581),
 '10071': (1, 1.4955752212389382),
 '10073': (2, 1.6901408450704225),
 '1008': (2, 1.28125),
 '10120': (2, 1.2727272727272727),
 '10121': (2, 1.1865671641791045),
 '10122': (2, 1.6388888888888888),
 '10123': (4, 2.4583333333333335),
 '1013': (2, 1.095890410958904),
 '1014': (2, 1.070754716981132),
 '1015': (2, 1.4658385093167703),
 '1016': (4, 2.3191489361702127),
 '10196': (3, 1.0682492581602374),
 '10197': (3, 0.9901315789473685),
 '10199': (3, 1.251063829787234),
 '102': (3, 1.0589390962671905),
 '10200':

In [81]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [4]:
df_combined = pd.read_csv('./train_v1.csv')

In [11]:
df_combined

Unnamed: 0,id,report,summary_1,summary_2,summary_3,summary_4,summary_5,summary_6,summary_7,best_summary_index,best_summary_score
0,10023,Registered office\nC/- Emcee \n44 Southampton...,08\nMediterranean Oil & Gas Plc Annual Report...,01\nMediterranean Oil & Gas Plc Annual Report...,06\nMediterranean Oil & Gas Plc Annual Report...,02\nMediterranean Oil & Gas Plc Annual Report...,,,,2,1.763948
1,10024,Mediterranean Oil & Gas Plc / Annual Report &...,Mediterranean Oil & Gas Plc / Annual Report &...,Business Review Corporate Governance Financia...,Mediterranean Oil & Gas Plc / Annual Report &...,Mediterranean Oil & Gas Plc / Annual Report &...,,,,2,1.708520
2,10025,Mediterranean Oil & Gas Plc / Annual Report &...,Mediterranean Oil & Gas Plc / Annual Report &...,Mediterranean Oil & Gas Plc / Annual Report &...,Mediterranean Oil & Gas Plc / Annual Report &...,Mediterranean Oil & Gas Plc / Annual Report &...,,,,2,1.707692
3,10050,ANNUAL REPORT AND \nACCOUNTS 2013 Download th...,4\nMEGGITT PLC REPORT AND ACCOUNTS 2013\nChie...,Financial highlights 2\nMeggitt’s 2013 result...,3 Chairman’s statement\nOxford University’s S...,,,,,2,1.396040
4,10051,MEGGITT PLC ANNUAL REPORT AND ACCOUNTS | 20...,“\n”\n6 MEGGITT PLC REPORT AND ACCOU...,Financial highlights\nMeggitt’s 2014 results ...,Chairman’s statement\nGrowing the Group\nIt’s...,,,,,2,1.520408
...,...,...,...,...,...,...,...,...,...,...,...
2995,9936,Mears Group PLC Annual report and accounts...,06 / Mears Group PLC / Annual report and acco...,Mears Group PLC Annual report and accounts...,02 / Mears Group PLC / Annual report and acco...,Mears is... a leading provider \nof integrate...,,,,2,2.285714
2996,9937,Making \na positive \ndifference...\nMears Gr...,08 / Mears Group PLC / Annual report and acco...,01 / Mears Group PLC / Annual report and acco...,04 / Mears Group PLC / Annual report and acco...,,,,,2,1.214286
2997,9944,Annual report and accounts 2008\nMaking conte...,14 Mecom Group plc Annual report and accounts...,13\nOverview\nGroup\nhighlights\nEquity issue...,12 Mecom Group plc Annual report and accounts...,,,,,3,1.244681
2998,9977,report and financial \nstatements 2004\nUK\nM...,chief executive’s report\nThe year ended 30 S...,MEDIASURFACE PLC Report and Financial Stateme...,chairman’s statement\nThe year 2004 included ...,,,,,3,1.219251


In [113]:
df_combined['best_summary_index'] = df_combined.progress_apply(
    lambda row: best_summary[str(row.id)][0],
    axis=1
)

  0%|          | 0/3000 [00:00<?, ?it/s]

In [115]:
df_combined['best_summary_score'] = df_combined.progress_apply(
    lambda row: best_summary[str(row.id)][1],
    axis=1
)

  0%|          | 0/3000 [00:00<?, ?it/s]

In [12]:
df_combined.to_csv('train_v1.csv')