In [1]:
import pandas as pd
import numpy as np
import os
import re
from sentence_transformers import SentenceTransformer, util
import torch
from collections import defaultdict
from transformers import pipeline

## Phase 1: Load Dataset & Generate Embedding


In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

folder_path = '/content/drive/My Drive/IC/'

file_names = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

frames = []
for name in file_names:

    year_match = re.search(r'(\d{4})', name)

    year = int(year_match.group(1))
    temp_df = pd.read_csv(os.path.join(folder_path, name))
    temp_df['Year'] = year
    frames.append(temp_df)


df = pd.concat(frames, ignore_index=True)
print(f"Successfully loaded {len(frames)} CSV files. Total comments: {len(df)}")
print(f"Years found: {sorted(df['Year'].unique())}")


Mounted at /content/drive


  temp_df = pd.read_csv(os.path.join(folder_path, name))
  temp_df = pd.read_csv(os.path.join(folder_path, name))
  temp_df = pd.read_csv(os.path.join(folder_path, name))
  temp_df = pd.read_csv(os.path.join(folder_path, name))
  temp_df = pd.read_csv(os.path.join(folder_path, name))
  temp_df = pd.read_csv(os.path.join(folder_path, name))


Successfully loaded 6 CSV files. Total comments: 375038
Years found: [np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022)]


In [3]:
hcf_dimensions = {
    "Direct Management": """A supervisor-subordinate relationship is inherently hierarchal, which also
    means that it is inherently complex with multiple potential points of failure. Getting instructions
    about what to do and how to do it are never easy as it decreases the sense of autonomy in
    multiple ways. In this dimension we assess the positive qualities of these relationships with an
    emphasis on the working relationship, the support of employee development and on general
    caring for the employee. In general, positive manager-employee relationships encourages
    productivity and collaboration among teams. When there’s mutual respect, care, and
    communications between a manager and an employee, there’s more willingness on both ends to
    offer support and perform well.""",

    "Organizational Alignment": """Organizational alignment is a shared understanding of the positive
    mission, philosophy and approaches that underlie the path and methods of any company. It
    allows all members of an organization, from entry-level positions to executive managers, to
    share common goals and vision for the organization and to be proud of their joint mission. In
    this dimension we assess the connection employees have with the values of the company, the
    sense of mission, and the meaning that they get from their workplace. In general, good
    organizational alignment helps with both motivation toward common organizational goals, a
    shared understanding of how to treat each other to accomplish goals, and the coordination of all
    actions and actors toward that goal.""",

    "Engagement": """Engagement is the level of commitment and emotional investment that employees
    have toward their job and their organization. Engagement is not just about job satisfaction or
    happiness, it is also about the level of involvement with their job, colleagues, and organization.
    Engaged employees are enthusiastic about their work, are willing to go above and beyond what
    is expected of them and are more likely to be loyal to their employer. In this dimension we
    assess motivation, connection with the company, and the ability of the employee to reach their
    potential to address the difficulties present in any job. In general, high employee engagement is
    crucial for creating a positive and productive workplace culture that fosters growth and success
    for both employees and the company.""",

    "Innovation": """Innovation is the process of creating something new or improved. It involves
    finding, testing, and improving novel and creative solutions to problems or challenges. It also
    involves developing new ideas, products, or services. Innovation is an important driver of
    progress and growth for all companies. In this dimension we assess the company’s approach to
    innovation and the degree to which it implicitly and explicitly encourages or discourages it, from
    a company’s treatment of mistakes to the outright acceptance of new ideas. In general, the more
    innovative a company is, the more likely it is to be successful over time.""",

    "Organizational Effectiveness": """Organizational effectiveness is the ability of a group to achieve
    its goals and objectives efficiently and with little waste (inputs, effort, time, energy, attention).
    Organizational effectiveness is a reflection of how well an organization uses its human and non-human
    resources to achieve its mission and objectives. In this dimension we assess whether
    people are given the tools they need for their jobs, the degree of collaboration, and the level of
    bureaucratic burden. In general, organizations with a high degree of organizational
    effectiveness are like well-oiled machines, providing a high level of output with inputs available.""",

    "Emotional Connection": """Emotional connection is the feeling of being emotionally attached,
    invested, and engaged with the workplace and co-workers. When employees feel emotionally
    connected, they are more likely to invest time, energy, and resources. Emotional connection can
    also foster trust, loyalty, and commitment. In this dimension we assess the desire to stay at the
    workplace, see their futures as intertwined, and the desire to recommend the workplace to others.
    In general, high emotional connection brings a sense of connection, satisfaction, productivity,
    purpose, and fulfillment.""",

    "Extrinsic Rewards": """Extrinsic rewards refer to rewards that are external to an individual such as
    money, prizes, or recognition. These rewards are intended to motivate the individual to perform
    better or continue engaging in a certain behavior, but do they?
    Extrinsic rewards have been shown to be effective in motivating individuals in the short term.
    However, extrinsic rewards have some limitations and are often ineffective in motivating
    individuals in the long term, as they do not address intrinsic motivation or interest in the work
    itself. In some cases, extrinsic rewards can have a negative effect on intrinsic motivation as
    individuals become over-focused on the reward itself. In this dimension we assess
    compensation, opportunities for advancement, and benefits. In general, when we think about
    rewards, we usually think about a set of extrinsic rewards."""
}

In [4]:
print("\nLoading Sentence-BERT model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded.")

# embeddings for HCF
print("Generating embeddings for HCF dimensions")
dimension_names = list(hcf_dimensions.keys())
dimension_descriptions = list(hcf_dimensions.values())
hcf_embeddings = model.encode(dimension_descriptions, convert_to_tensor=True)
print("HCF embeddings generated.")

df.dropna(subset=['PROs', 'CONs'], how='all', inplace=True)
df['PROs'] = df['PROs'].fillna('')
df['CONs'] = df['CONs'].fillna('')


# embedding for all comments (PROs and CONs)
print("\nGenerating embeddings for all comments.")
# combine all comments into one list
all_pros = df['PROs'][df['PROs'] != ''].tolist()
all_cons = df['CONs'][df['CONs'] != ''].tolist()

# encode PROs and CONs separately to map them back easily
pro_embeddings = model.encode(all_pros, convert_to_tensor=True, show_progress_bar=True)
con_embeddings = model.encode(all_cons, convert_to_tensor=True, show_progress_bar=True)

# create dictionaries to map comments back to their embeddings
pro_embedding_map = {comment: emb for comment, emb in zip(all_pros, pro_embeddings)}
con_embedding_map = {comment: emb for comment, emb in zip(all_cons, con_embeddings)}
print("All comment embeddings generated and mapped.")


Loading Sentence-BERT model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded.
Generating embeddings for HCF dimensions...


  return forward_call(*args, **kwargs)


HCF embeddings generated.

Generating embeddings for all comments. This may take some time...


Batches:   0%|          | 0/11720 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Batches:   0%|          | 0/11720 [00:00<?, ?it/s]

All comment embeddings generated and mapped.


## Phase 2: Analysis Loop & Helper Function

In [5]:
print("\nLoading abstractive summarization models")
summarizer_t5 = pipeline("summarization", model="t5-small", tokenizer="t5-small")
print("T5 model loaded.")
summarizer_bart = pipeline("summarization", model="facebook/bart-large-cnn")
print("BART model loaded.")


Loading abstractive summarization models


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cuda:0


T5 model loaded.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


BART model loaded.


In [6]:
def map_comment_to_dimension(comment_embedding, hcf_embeddings, dimension_names):
    """Maps a single comment embedding to the most similar HCF dimension."""
    cosine_scores = util.cos_sim(comment_embedding, hcf_embeddings)
    best_match_index = torch.argmax(cosine_scores).item()
    return dimension_names[best_match_index]

In [15]:
def summarize_text_abstractive(text_list, summarizer_pipeline):
    """
    Generates a true abstractive summary for a list of comments.
    """
    # combine all comments
    full_text = ". ".join(text_list)
    if len(full_text) < 100:
      return full_text # return original text if too short

    max_input_length = 1024
    truncated_text = full_text[:max_input_length]

    summary_result = summarizer_pipeline(truncated_text, max_new_tokens=40, min_length=15, do_sample=False)
    return summary_result[0]['summary_text']

In [8]:
def generate_hcf_informed_paragraph(comment_list, embedding_map, summarizer_pipeline):
    """
    Analyzes a list of comments and generates a single summary paragraph
    structured by HCF themes, using the provided summarizer.
    """
    if not comment_list:
        return "No comments provided."

    # map each comment to an HCF dimension
    hcf_buckets = defaultdict(list)
    for comment in comment_list:
        if comment in embedding_map:
            comment_embedding = embedding_map[comment]
            dimension = map_comment_to_dimension(comment_embedding, hcf_embeddings, dimension_names)
            hcf_buckets[dimension].append(comment)

    if not hcf_buckets:
        return "Could not map any comments to HCF dimensions."

    # generate an abstractive "micro-summary" for each HCF dimension bucket
    micro_summaries = []
    for dimension in dimension_names: # Iterate in a fixed order
        if dimension in hcf_buckets:
            comments_in_bucket = hcf_buckets[dimension]
            # summary for this bucket
            bucket_summary = summarize_text_abstractive(comments_in_bucket, summarizer_pipeline)
            micro_summaries.append(f"{dimension}: {bucket_summary}")

    # assemble the final paragraph
    return " ".join(micro_summaries)

In [16]:
print("\nStarting T5 and BART summarization...")
analysis_results = []


#group ticker
grouped = df.groupby(['Ticker Symbol', 'Year'])

for (ticker, year), group in grouped:
    print(f"Processing {ticker} for year {year}...")

    # segregate comments for the current group
    pros_list = group['PROs'][group['PROs'] != ''].tolist()
    cons_list = group['CONs'][group['CONs'] != ''].tolist()

    # analyze and summarize each side using BOTH models
    pro_summary_t5 = generate_hcf_informed_paragraph(pros_list, pro_embedding_map, summarizer_t5)
    con_summary_t5 = generate_hcf_informed_paragraph(cons_list, con_embedding_map, summarizer_t5)

    pro_summary_bart = generate_hcf_informed_paragraph(pros_list, pro_embedding_map, summarizer_bart)
    con_summary_bart = generate_hcf_informed_paragraph(cons_list, con_embedding_map, summarizer_bart)

    # store result
    analysis_results.append({
        'Ticker': ticker,
        'Year': year,
        'PROs_Summary_T5': pro_summary_t5,
        'CONs_Summary_T5': con_summary_t5,
        'PROs_Summary_BART': pro_summary_bart,
        'CONs_Summary_BART': con_summary_bart,
    })
print("\nAnalysis complete.")



Starting final analysis loop with T5 and BART summarization...
Processing nasdaq:aapl for year 2017...
Processing nasdaq:aapl for year 2018...
Processing nasdaq:aapl for year 2019...
Processing nasdaq:aapl for year 2020...
Processing nasdaq:aapl for year 2021...
Processing nasdaq:aapl for year 2022...
Processing nasdaq:adsk for year 2017...


Your max_length is set to 200, but your input_length is only 196. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=98)
Your max_length is set to 200, but your input_length is only 193. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=96)
Your max_length is set to 200, but your input_length is only 158. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=79)


Processing nasdaq:adsk for year 2018...


Your max_length is set to 200, but your input_length is only 123. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)
Your max_length is set to 200, but your input_length is only 167. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=83)
Your max_length is set to 142, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


Processing nasdaq:adsk for year 2019...


Your max_length is set to 200, but your input_length is only 161. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=80)


Processing nasdaq:adsk for year 2020...


Your max_length is set to 200, but your input_length is only 131. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=65)
Your max_length is set to 142, but your input_length is only 121. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)


Processing nasdaq:adsk for year 2021...
Processing nasdaq:adsk for year 2022...
Processing nasdaq:amzn for year 2017...
Processing nasdaq:amzn for year 2018...
Processing nasdaq:amzn for year 2019...
Processing nasdaq:amzn for year 2020...
Processing nasdaq:amzn for year 2021...
Processing nasdaq:amzn for year 2022...
Processing nasdaq:cprt for year 2017...


Your max_length is set to 200, but your input_length is only 79. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)
Your max_length is set to 200, but your input_length is only 31. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)
Your max_length is set to 200, but your input_length is only 158. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=79)
Your max_length is set to 200, but your input_length is only 147. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=73)
Yo

Processing nasdaq:cprt for year 2018...


Your max_length is set to 200, but your input_length is only 177. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=88)
Your max_length is set to 200, but your input_length is only 125. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=62)
Your max_length is set to 200, but your input_length is only 34. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=17)
Your max_length is set to 200, but your input_length is only 125. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=62)
Y

Processing nasdaq:cprt for year 2019...


Your max_length is set to 200, but your input_length is only 31. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)
Your max_length is set to 200, but your input_length is only 144. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=72)
Your max_length is set to 200, but your input_length is only 191. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=95)
Your max_length is set to 200, but your input_length is only 121. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
Y

Processing nasdaq:cprt for year 2020...


Your max_length is set to 200, but your input_length is only 178. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=89)
Your max_length is set to 200, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
Your max_length is set to 200, but your input_length is only 70. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Your max_length is set to 200, but your input_length is only 130. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=65)
Y

Processing nasdaq:cprt for year 2021...


Your max_length is set to 200, but your input_length is only 157. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=78)


Processing nasdaq:cprt for year 2022...


Your max_length is set to 200, but your input_length is only 84. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)
Your max_length is set to 142, but your input_length is only 73. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)


Processing nasdaq:csgp for year 2017...
Processing nasdaq:csgp for year 2018...
Processing nasdaq:csgp for year 2019...


Your max_length is set to 200, but your input_length is only 183. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=91)
Your max_length is set to 200, but your input_length is only 91. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)
Your max_length is set to 200, but your input_length is only 109. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)
Your max_length is set to 142, but your input_length is only 85. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)
Yo

Processing nasdaq:csgp for year 2020...


Your max_length is set to 200, but your input_length is only 168. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=84)
Your max_length is set to 200, but your input_length is only 79. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)
Your max_length is set to 142, but your input_length is only 75. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)


Processing nasdaq:csgp for year 2021...
Processing nasdaq:csgp for year 2022...
Processing nasdaq:fox for year 2018...


Your max_length is set to 200, but your input_length is only 53. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)
Your max_length is set to 200, but your input_length is only 183. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=91)
Your max_length is set to 142, but your input_length is only 57. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)


Processing nasdaq:fox for year 2019...


Your max_length is set to 200, but your input_length is only 74. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)
Your max_length is set to 200, but your input_length is only 109. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)
Your max_length is set to 200, but your input_length is only 129. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=64)
Your max_length is set to 200, but your input_length is only 84. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)
Yo

Processing nasdaq:fox for year 2020...


Your max_length is set to 200, but your input_length is only 152. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=76)
Your max_length is set to 200, but your input_length is only 104. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)
Your max_length is set to 200, but your input_length is only 100. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 200, but your input_length is only 177. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=88)


Processing nasdaq:fox for year 2021...


Your max_length is set to 200, but your input_length is only 182. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=91)


Processing nasdaq:fox for year 2022...


Your max_length is set to 200, but your input_length is only 165. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=82)


Processing nasdaq:goog for year 2017...
Processing nasdaq:goog for year 2018...
Processing nasdaq:goog for year 2019...
Processing nasdaq:goog for year 2020...
Processing nasdaq:goog for year 2021...
Processing nasdaq:goog for year 2022...
Processing nasdaq:msft for year 2017...
Processing nasdaq:msft for year 2018...
Processing nasdaq:msft for year 2019...
Processing nasdaq:msft for year 2020...
Processing nasdaq:msft for year 2021...
Processing nasdaq:msft for year 2022...
Processing nasdaq:nws for year 2017...
Processing nasdaq:nws for year 2018...


Your max_length is set to 200, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 200, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
Your max_length is set to 142, but your input_length is only 108. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)
Your max_length is set to 142, but your input_length is only 109. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)


Processing nasdaq:nws for year 2019...


Your max_length is set to 200, but your input_length is only 151. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=75)
Your max_length is set to 142, but your input_length is only 139. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=69)


Processing nasdaq:nws for year 2020...


Your max_length is set to 200, but your input_length is only 139. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=69)
Your max_length is set to 142, but your input_length is only 127. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)


Processing nasdaq:nws for year 2021...


Your max_length is set to 200, but your input_length is only 192. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=96)
Your max_length is set to 200, but your input_length is only 90. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)
Your max_length is set to 142, but your input_length is only 82. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=41)


Processing nasdaq:nws for year 2022...


Your max_length is set to 200, but your input_length is only 193. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=96)
Your max_length is set to 200, but your input_length is only 166. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=83)
Your max_length is set to 200, but your input_length is only 91. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)
Your max_length is set to 200, but your input_length is only 198. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=99)
Y

Processing nasdaq:sbux for year 2017...
Processing nasdaq:sbux for year 2018...
Processing nasdaq:sbux for year 2019...


Token indices sequence length is longer than the specified maximum sequence length for this model (518 > 512). Running this sequence through the model will result in indexing errors


Processing nasdaq:sbux for year 2020...
Processing nasdaq:sbux for year 2021...
Processing nasdaq:sbux for year 2022...
Processing nasdaq:wtw for year 2017...
Processing nasdaq:wtw for year 2018...
Processing nasdaq:wtw for year 2019...
Processing nasdaq:wtw for year 2020...
Processing nasdaq:wtw for year 2021...
Processing nasdaq:wtw for year 2022...
Processing nasdaq:z for year 2017...


Your max_length is set to 200, but your input_length is only 135. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=67)
Your max_length is set to 142, but your input_length is only 131. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=65)


Processing nasdaq:z for year 2018...


Your max_length is set to 200, but your input_length is only 96. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
Your max_length is set to 200, but your input_length is only 126. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)
Your max_length is set to 142, but your input_length is only 88. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)
Your max_length is set to 142, but your input_length is only 114. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)


Processing nasdaq:z for year 2019...


Your max_length is set to 200, but your input_length is only 55. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)
Your max_length is set to 142, but your input_length is only 53. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=26)


Processing nasdaq:z for year 2020...


Your max_length is set to 200, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 200, but your input_length is only 195. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=97)
Your max_length is set to 200, but your input_length is only 179. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=89)
Your max_length is set to 142, but your input_length is only 104. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)


Processing nasdaq:z for year 2021...
Processing nasdaq:z for year 2022...
Processing nyse:acn for year 2017...
Processing nyse:acn for year 2018...
Processing nyse:acn for year 2019...
Processing nyse:acn for year 2020...
Processing nyse:acn for year 2021...
Processing nyse:acn for year 2022...


Your max_length is set to 200, but your input_length is only 195. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=97)


Processing nyse:bki for year 2017...


Your max_length is set to 200, but your input_length is only 109. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=54)
Your max_length is set to 142, but your input_length is only 101. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


Processing nyse:bki for year 2018...


Your max_length is set to 200, but your input_length is only 186. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=93)
Your max_length is set to 200, but your input_length is only 146. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=73)
Your max_length is set to 142, but your input_length is only 130. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=65)


Processing nyse:bki for year 2019...


Your max_length is set to 200, but your input_length is only 189. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=94)
Your max_length is set to 200, but your input_length is only 184. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=92)


Processing nyse:bki for year 2020...


Your max_length is set to 200, but your input_length is only 135. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=67)
Your max_length is set to 200, but your input_length is only 137. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=68)
Your max_length is set to 200, but your input_length is only 56. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=28)
Your max_length is set to 200, but your input_length is only 81. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)
Yo

Processing nyse:bki for year 2021...


Your max_length is set to 200, but your input_length is only 80. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=40)
Your max_length is set to 200, but your input_length is only 169. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=84)
Your max_length is set to 142, but your input_length is only 75. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)


Processing nyse:bki for year 2022...


Your max_length is set to 200, but your input_length is only 174. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=87)


Processing nyse:dis for year 2017...
Processing nyse:dis for year 2018...
Processing nyse:dis for year 2019...
Processing nyse:dis for year 2020...
Processing nyse:dis for year 2021...
Processing nyse:dis for year 2022...
Processing nyse:nke for year 2017...
Processing nyse:nke for year 2018...
Processing nyse:nke for year 2019...
Processing nyse:nke for year 2020...
Processing nyse:nke for year 2021...
Processing nyse:nke for year 2022...
Processing nyse:pfe for year 2017...
Processing nyse:pfe for year 2018...
Processing nyse:pfe for year 2019...
Processing nyse:pfe for year 2020...
Processing nyse:pfe for year 2021...
Processing nyse:pfe for year 2022...
Processing nyse:qsr for year 2017...
Processing nyse:qsr for year 2018...
Processing nyse:qsr for year 2019...
Processing nyse:qsr for year 2020...
Processing nyse:qsr for year 2021...
Processing nyse:qsr for year 2022...
Processing nyse:wwe for year 2017...


Your max_length is set to 200, but your input_length is only 195. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=97)
Your max_length is set to 200, but your input_length is only 72. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=36)
Your max_length is set to 200, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
Your max_length is set to 200, but your input_length is only 142. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=71)
Yo

Processing nyse:wwe for year 2018...


Your max_length is set to 200, but your input_length is only 75. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)
Your max_length is set to 200, but your input_length is only 44. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=22)
Your max_length is set to 200, but your input_length is only 66. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)
Your max_length is set to 200, but your input_length is only 39. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=19)
Your

Processing nyse:wwe for year 2019...


Your max_length is set to 200, but your input_length is only 65. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=32)
Your max_length is set to 200, but your input_length is only 43. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)
Your max_length is set to 200, but your input_length is only 43. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)
Your max_length is set to 200, but your input_length is only 147. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=73)
You

Processing nyse:wwe for year 2020...


Your max_length is set to 200, but your input_length is only 54. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)
Your max_length is set to 200, but your input_length is only 87. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=43)
Your max_length is set to 200, but your input_length is only 65. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=32)
Your max_length is set to 200, but your input_length is only 59. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=29)
Your

Processing nyse:wwe for year 2021...


Your max_length is set to 200, but your input_length is only 175. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=87)
Your max_length is set to 200, but your input_length is only 98. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=49)
Your max_length is set to 200, but your input_length is only 79. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)
Your max_length is set to 200, but your input_length is only 74. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=37)
You

Processing nyse:wwe for year 2022...


Your max_length is set to 200, but your input_length is only 51. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)
Your max_length is set to 200, but your input_length is only 114. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 200, but your input_length is only 61. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=30)
Your max_length is set to 200, but your input_length is only 79. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)
You


Analysis complete.


In [17]:
# create dataframe for results
if analysis_results:
    results_df = pd.DataFrame(analysis_results)
else:
    results_df = pd.DataFrame()

In [19]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
results_df

Unnamed: 0,Ticker,Year,PROs_Summary_T5,CONs_Summary_T5,PROs_Summary_BART,CONs_Summary_BART
0,nasdaq:aapl,2017,"Direct Management: good environment, almost ev...",Direct Management: trabalha-se 44h semanais o ...,"Direct Management: Good environment, almost ev...",Direct Management: The CEO and CFO of finance ...
1,nasdaq:aapl,2018,Direct Management: Sehr gute Bezahlung im Verg...,Direct Management: midlevel managers believe i...,"Direct Management: Stable leadership, great wo...",Direct Management: Midlevel managers are terri...
2,nasdaq:aapl,2019,"Direct Management: otima experiencia, pessoas ...",Direct Management: management tendiert dazu ma...,Direct Management: Gutes Gehalt- Gute Arbeitsb...,Direct Management: The senior managers have no...
3,nasdaq:aapl,2020,Direct Management: molti telefoni anche a bass...,"Direct Management: nog niet opgemerkt, denk ni...",Direct Management: Great product (but needs TL...,Direct Management: Cult and brain washing cult...
4,nasdaq:aapl,2021,Direct Management: a gran empresa que cuida de...,Direct Management: eu acho que o salario dever...,Direct Management: The company offers good ben...,Direct Management: There seemed to be some fav...
5,nasdaq:aapl,2022,Direct Management: the people you meet and get...,Direct Management: contract workers are kept s...,Direct Management: Great work environment and ...,Direct Management: It's perfect and the owners...
6,nasdaq:adsk,2017,"Direct Management: nice salary, good training,...",Direct Management: the company retrenched empl...,"Direct Management: Good benefits, flexibility,...",Direct Management: India team is going through...
7,nasdaq:adsk,2018,Direct Management: great colleagues - encourag...,Direct Management: the role of people manager ...,"Direct Management: Good pay, good work life ba...",Direct Management: HR is just there to help ma...
8,nasdaq:adsk,2019,Direct Management: empresa que cuida muco de s...,Direct Management: good people leave and peopl...,Direct Management: Work life balance. Location...,Direct Management: Good people leave and Peopl...
9,nasdaq:adsk,2020,Direct Management: a lot of recognition is giv...,Direct Management: ninguna hasta el da de hoy ...,Direct Management: The people are just amazing...,Direct Management: Autodesk is slow to innovat...


In [20]:
def get_summary_for_company(ticker_symbol, results_dataframe):
    company_report = results_dataframe[results_dataframe['Ticker'] == ticker_symbol].copy()
    company_report.sort_values('Year', inplace=True)
    return company_report


ticker_to_find = 'nasdaq:aapl'
company_summary = get_summary_for_company(ticker_to_find, results_df)

print(f"\n\n--- Comparing BART and T5 {ticker_to_find} ---")


# display full text
for index, row in company_summary.iterrows():
    print(f"\n{'='*15} Year: {row['Year']} {'='*15}")

    # PROs
    print("\n--- PROs ---")
    print("\nT5 Summary:")
    print(str(row['PROs_Summary_T5']).replace(". ", ".\n"))
    print("\nBART Summary:")
    print(str(row['PROs_Summary_BART']).replace(". ", ".\n"))

    # CONs
    print("\n--- CONs ---")
    print("\nT5 Summary:")
    print(str(row['CONs_Summary_T5']).replace(". ", ".\n"))
    print("\nBART Summary:")
    print(str(row['CONs_Summary_BART']).replace(". ", ".\n"))
    print("-" * 40)




--- Comparison Report for nasdaq:aapl ---


--- PROs ---

T5 Summary:
Direct Management: good environment, almost everybody is nice, can work independently .
partweise nettes Team Interessante Kunden Benefits sind nicht schlecht Relativ entspannte Arbeit Null Entwicklungsmöglichkeiten Organizational Alignment: a cultura é super forte e você melhora como ser humano .
uma empresa muito conceituada Engagement: apple heeft het work-life balance principe volledig geimplementeerd .
krijgen gemiddeld 9 weken vakantie Innovation: a great environment with vibrant brilliant people .
nette Kollegen, tolles Produkt, apple-Tochter, zahlen gut .
Organizational Effectiveness: otimos benefcios e ótimo salário.
Ambiente bem mantido pela empresa .
Fun Emotional Connection: timo ambiente de trabalho, funcionários que prezam pela cultura da empresa e sempre pelo melh Extrinsic Rewards: the student gets all benefits as a full-timer, very good people .
good benefits like gym and medical.
job security wage