# N01-Transcript_to_facts
*Purpose*: Convert the raw extracted Shark Tank episode transcripts (from Derrick) into a facts dictionary with 5 sections. These 5 sections can then be used as our dataset for downstream models:

    1. Facts about the company or product that are hard quantifiable facts
    2. Product description
    3. Summary of investor pitch. This should contain information about how the pitch was delivered, the sentiment, the story, and other aspects essential to the pitch.
    4. Entrepreneur initial offer
    5. Final agreed offer by the shark investor

In [None]:
# Colab-only setup (kaizhuo G drive): run this cell only when using Google Colab.
# It mounts Google Drive and then switches the working directory to the project folder.
from google.colab import drive
drive.mount('/content/gdrive/')
# The relative paths used later in this notebook (./Prelim_Data_Split_Clean, ./facts)
# are resolved against the directory selected here.
%cd /content/gdrive/MyDrive/MITB/CS6xx\ LLM/LLM\ Project

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).
/content/gdrive/MyDrive/MITB/CS6xx LLM/LLM Project


In [3]:
pip install transformers datasets evaluate torch accelerate datasets together openai

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting together
  Downloading together-1.4.1-py3-none-any.whl.metadata (12 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvi

In [4]:
# Import required libraries
import json
import os
import re
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import torch
from datasets import load_dataset
from openai import OpenAI
from together import Together
from torch.nn import functional as F
from transformers import AutoTokenizer
from transformers import pipeline

In [5]:
# The DeepSeek API key is read from the environment so that it never lives in the
# notebook file (notebooks are routinely shared and committed).
# NOTE(review): the key that used to be hard-coded on this line has been exposed and
# should be revoked and replaced.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY")
if not DEEPSEEK_API_KEY:
    raise RuntimeError(
        "DEEPSEEK_API_KEY is not set. Export it (or set os.environ['DEEPSEEK_API_KEY']) "
        "before running this cell."
    )

# DeepSeek exposes an OpenAI-compatible endpoint, so the OpenAI client is pointed at it.
client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com")

# Matches a Markdown code fence (optionally tagged ```json) and captures what is inside it.
_FENCED_BLOCK = re.compile(r"```(?:json)?\s*(.*?)```", re.DOTALL | re.IGNORECASE)


def _parse_json_reply(reply_text):
    """Decode the JSON payload of a model reply, tolerating code fences and stray prose."""
    if not reply_text or not reply_text.strip():
        raise ValueError("Model returned an empty reply; there is no JSON to parse.")

    text = reply_text.strip()

    # Try progressively looser interpretations of the reply:
    #   1. the contents of a ``` fence, 2. the whole reply, 3. the outermost {...} span.
    candidates = []
    fenced = _FENCED_BLOCK.search(text)
    if fenced:
        candidates.append(fenced.group(1).strip())
    candidates.append(text)
    first_brace, last_brace = text.find("{"), text.rfind("}")
    if 0 <= first_brace < last_brace:
        candidates.append(text[first_brace:last_brace + 1])

    last_error = None
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as exc:
            last_error = exc
    raise last_error


def transcript_to_facts(transcript, api_client=None):
    """Ask the chat model to extract the five fact categories from one pitch transcript.

    Parameters
    ----------
    transcript : str
        Text of a single Shark Tank pitch transcript; sent as the user message.
    api_client : optional
        Any object exposing ``chat.completions.create(...)`` in the OpenAI style.
        Defaults to the notebook-level ``client``.

    Returns
    -------
    The Python value decoded from the JSON found in the model's reply.

    Raises
    ------
    ValueError
        If the reply is empty. ``json.JSONDecodeError`` (a ``ValueError`` subclass)
        is raised when no JSON can be decoded from the reply.
    """
    active_client = client if api_client is None else api_client

    completion = active_client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system",
            "content":
            """
    In this transcript of a shark tank episode, can you extract out these 5 categories of information?

    1. Facts about the company or product that are hard quantifiable facts
    2. Product description
    3. Summary of investor pitch. This should contain information about how the pitch was delivered, the sentiment, the story, and other aspects essential to the pitch.
    4. Entrepreneur initial offer
    5. Final agreed offer by the shark investor

    Convert it into a json format.

    The transcript is below:
    """},
            {"role": "user", "content": transcript},
        ],
        stream=False
    )

    reply_text = completion.choices[0].message.content
    # str.strip("```json\n") removes a *set of characters*, not a prefix, and breaks as soon
    # as the reply has any text or spaces around the fence, so the reply is parsed properly.
    return _parse_json_reply(reply_text)

In [10]:
# Read every available transcript into memory, keyed by file name.
# The glob result is sorted so that the processing order is the same on every run.
folder_path = Path("./Prelim_Data_Split_Clean")
transcript_store = {}
for file in sorted(folder_path.glob("*.txt")):
    with file.open("r", encoding="utf-8") as f:
        transcript_store[file.name] = f.read()

# Read all facts that have already been produced, so those transcripts are not sent
# to the API a second time.
facts_file_path = './facts'
folder_path = Path(facts_file_path)
# Create the output folder up front: without it the first run would pay for an API call
# and only then fail when trying to write the result.
folder_path.mkdir(parents=True, exist_ok=True)
facts_store = {}
for file in sorted(folder_path.glob("*.txt")):
    with file.open("r", encoding="utf-8") as f:
        facts_store[file.name] = f.read()

# Process each transcript that does not yet have a facts file.
# A failure on one transcript is reported and recorded instead of aborting the whole batch;
# re-running the cell retries only the transcripts that still have no facts file.
failed_transcripts = {}
for transcript_name, transcript in transcript_store.items():
    print(f'Processing: {transcript_name}')
    if f'facts_{transcript_name}' in facts_store:
        print(f'{transcript_name} has already been processed, skipping ... ...')
        continue

    timenow = time.time()
    try:
        facts = transcript_to_facts(transcript)
    except Exception as exc:  # API/network errors or an unparseable reply
        failed_transcripts[transcript_name] = repr(exc)
        print(f'FAILED: {transcript_name}: {exc!r}')
        continue

    print(f'Saving facts: {transcript_name}. Latency: {time.time()-timenow}')
    # Facts files keep the transcript's own file name (including its .txt suffix), prefixed
    # with "facts_", because the skip check above looks them up under exactly that name.
    with open(f'{facts_file_path}/facts_{transcript_name}', "w", encoding="utf-8") as file:
        json.dump(facts, file, indent=4)  # `indent=4` makes it readable

if failed_transcripts:
    print(f'{len(failed_transcripts)} transcript(s) failed and were not saved: {sorted(failed_transcripts)}')


Processing: shark_tank_transcript_0_TouchUp Cup.txt
shark_tank_transcript_0_TouchUp Cup.txt has already been processed, skipping ... ...
Processing: shark_tank_transcript_0_Roadie.txt
shark_tank_transcript_0_Roadie.txt has already been processed, skipping ... ...
Processing: shark_tank_transcript_0_GarmaGuard.txt
shark_tank_transcript_0_GarmaGuard.txt has already been processed, skipping ... ...
Processing: shark_tank_transcript_1_BootayBag.txt
shark_tank_transcript_1_BootayBag.txt has already been processed, skipping ... ...
Processing: shark_tank_transcript_1_GoOats.txt
shark_tank_transcript_1_GoOats.txt has already been processed, skipping ... ...
Processing: shark_tank_transcript_3_Electra.txt
shark_tank_transcript_3_Electra.txt has already been processed, skipping ... ...
Processing: shark_tank_transcript_3_All33.txt
shark_tank_transcript_3_All33.txt has already been processed, skipping ... ...
Processing: shark_tank_transcript_3_His & Her Bar.txt
shark_tank_transcript_3_His & Her

In [11]:
# Sanity check: number of transcript files loaded into `transcript_store`.
len(transcript_store)

119