## DATA COLLECTION AND PROCESSING

In [50]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import re

def clean_text(text):

    text = re.sub(r'^Q\.\s*', '', text.strip())

    text = re.sub(r'^[A-Z\s]+:', '', text.strip())

    text = re.sub(r'[^\w\s.?!\'"]', '', text)

    return text

def scrape_interview(interview_id):
    url = f"https://www.asapsports.com/show_interview.php?id={interview_id}"

    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        q = []
        a = []

        #print(soup.get_text())

        lines = soup.get_text().split('\n')
        question = False
        for line in lines:
          if 'Q.' in line:
            q.append(clean_text(line))
            question = True
          elif question == True and (':' in line):
            a.append(clean_text(line))
            question = False

        min_length = min(len(q), len(a))
        a = a[:min_length]
        q = q[:min_length]

        return q, a

    else:
        print(f"Failed ID: {interview_id}")
        return [], []

interview_ids = range(193074, 193174)  # Example list of interview IDs

all_questions = []
all_answers = []

for interview_id in tqdm(interview_ids):
    questions, answers = scrape_interview(interview_id)
    all_questions.extend(questions)
    all_answers.extend(answers)

for i in range(len(all_questions)):
    print(f"Question {i + 1}: {all_questions[i]}")
    print(f"Answer {i + 1}: {all_answers[i]}")
    print()

100%|██████████| 100/100 [00:52<00:00,  1.91it/s]


Question 1: Just your thoughts on the court out there the venue and your thoughts on playing these games the final few games in Las Vegas.
Answer 1:    My thoughts on the court is it looks like a stage.  That looks dope.  This whole experience I think its a good steppingstone for us so were excited for this journey.  Were excited to go out there and compete and hopefully get the win.

Question 2: BI you just mentioned how you guys arent on national TV a lot.  CJ referenced that earlier.  How would you describe this group of guys to people that arent familiar with the Pelicans in terms of how much fun you guys have and kind of the bond and some of the entertaining stuff that you do throughout the season?
Answer 2:    Goofy.  We come to work we all have fun.  Off the court on the court we all have fun.  Were all around the same age.

Question 3: You mentioned that you guys like to have fun.  Have you thought about how youre going to celebrate if youre able to pull this off?
Answer 3:    

## Tokenizing for GPT-2 Input

In [None]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

formatted_data = []
for i in range(len(all_questions)):
    q_tokens = tokenizer.encode(all_questions[i], add_special_tokens=False)
    a_tokens = tokenizer.encode(all_answers[i], add_special_tokens=False)
    formatted_data.append(q_tokens + [tokenizer.eos_token_id] + a_tokens)

# Save formatted data to a file or use it for training
with open('formatted_data.txt', 'w') as file:
    for tokens in formatted_data:
        text = tokenizer.decode(tokens)
        file.write(text + '\n')

## Tokenizing for BERT Input

In [None]:
from transformers import BertTokenizer

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

bert_formatted_data = []
for i in range(len(all_questions)):
    q_tokens = tokenizer.encode(all_questions[i], add_special_tokens=True, truncation=True, padding='max_length', max_length=128)
    a_tokens = tokenizer.encode(all_answers[i], add_special_tokens=True, truncation=True, padding='max_length', max_length=128)

    # Create segment IDs: 0 for the question segment, 1 for the answer segment
    segment_ids = [0] * len(q_tokens) + [1] * len(a_tokens)

    # Combine tokens and segment IDs
    combined_ids = q_tokens + a_tokens
    bert_formatted_data.append((combined_ids, segment_ids))

## Tokenizing for t5 Input

In [None]:
t5_formatted_data = []
for i in range(len(all_questions)):
    # Formulate the data as text-to-text by concatenating question and answer
    input_text = "question: " + all_questions[i] + " context: " + all_answers[i]
    target_text = all_answers[i]  # Set the target text to the answer

    t5_formatted_data.append((input_text, target_text))


## Textual Preprocessing for RNN

In [None]:
def maxi(sents, others):
    for x in range(100):
        nmax = 0
        to_remove = None
        for sent in sents:
            if len(sent) > nmax:
                nmax = len(sent)
                to_remove = sents.index(sent)
        sents.remove(sents[to_remove])
        others.remove(sents[to_remove])
    return others, sents

def bracket_sentence(sent):
    sent = ['<s>'] + sent + ['</s>']
    return sent

all_questions, all_answers = maxi(all_answers, all_questions)

for x in range(len(all_questions)):
    all_questions[x] = bracket_sentence(all_questions[x])
    all_answers[x] = bracket_sentence(all_answers[x])