## DATA COLLECTION AND PROCESSING

In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import re

def clean_text(text):
    """Normalise one transcript line.

    Steps, in order:
      1. trim the line and drop a leading ``Q.`` marker (plus following spaces);
      2. trim again and drop a leading run of uppercase letters/whitespace that
         ends in a colon (a speaker label such as ``JOHN SMITH:``);
      3. delete every character that is not a word character, whitespace or
         one of ``. ? ! ' "``.

    The result is not trimmed after step 2, so whitespace that followed a
    removed speaker label is kept.

    Args:
        text: raw line of text.

    Returns:
        The cleaned string.
    """
    without_marker = re.sub(r'^Q\.\s*', '', text.strip())
    without_speaker = re.sub(r'^[A-Z\s]+:', '', without_marker.strip())
    return re.sub(r'[^\w\s.?!\'"]', '', without_speaker)

def _pair_questions_and_answers(lines):
    """Pair every question line with the first speaker line that follows it.

    A line containing ``Q.`` is treated as a question; the next line that
    contains ``:`` is treated as its answer. Both are passed through
    ``clean_text``.

    Returns:
        ``(questions, answers)``: two lists of equal length where
        ``answers[k]`` belongs to ``questions[k]``.
    """
    questions = []
    answers = []
    pending_question = None
    for line in lines:
        if 'Q.' in line:
            # A new question replaces a still-unanswered one, so an answer is
            # never attached to an earlier, unrelated question.
            pending_question = clean_text(line)
        elif pending_question is not None and ':' in line:
            questions.append(pending_question)
            answers.append(clean_text(line))
            pending_question = None
    return questions, answers


def scrape_interview(interview_id, timeout=10):
    """Download one ASAP Sports interview and extract question/answer pairs.

    Args:
        interview_id: value substituted into the ``id`` query parameter.
        timeout: seconds to wait for the server before giving up, so a single
            stalled request cannot hang the whole crawl.

    Returns:
        ``(questions, answers)``: two equally long lists of cleaned strings.
        Both are empty when the request fails or the status code is not 200.
    """
    url = f"https://www.asapsports.com/show_interview.php?id={interview_id}"

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report network errors the same way as a bad status code instead of
        # aborting the surrounding loop.
        print(f"Failed ID: {interview_id} ({exc})")
        return [], []

    if response.status_code != 200:
        print(f"Failed ID: {interview_id}")
        return [], []

    soup = BeautifulSoup(response.content, 'html.parser')
    return _pair_questions_and_answers(soup.get_text().split('\n'))

interview_ids = range(193074, 193174)  # Example list of interview IDs

# Flat, index-aligned collections: all_answers[k] answers all_questions[k].
all_questions = []
all_answers = []

for interview_id in tqdm(interview_ids):
    questions, answers = scrape_interview(interview_id)
    all_questions += questions
    all_answers += answers

# Dump every collected pair, numbered from 1.
for number, (question_text, answer_text) in enumerate(zip(all_questions, all_answers), start=1):
    print(f"Question {number}: {question_text}")
    print(f"Answer {number}: {answer_text}")
    print()

100%|██████████| 100/100 [00:45<00:00,  2.22it/s]


Question 1: Just your thoughts on the court out there the venue and your thoughts on playing these games the final few games in Las Vegas.
Answer 1:    My thoughts on the court is it looks like a stage.  That looks dope.  This whole experience I think it's a good steppingstone for us so we're excited for this journey.  We're excited to go out there and compete and hopefully get the win.

Question 2: BI you just mentioned how you guys aren't on national TV a lot.  CJ referenced that earlier.  How would you describe this group of guys to people that aren't familiar with the Pelicans in terms of how much fun you guys have and kind of the bond and some of the entertaining stuff that you do throughout the season?
Answer 2:    Goofy.  We come to work we all have fun.  Off the court on the court we all have fun.  We're all around the same age.

Question 3: You mentioned that you guys like to have fun.  Have you thought about how you're going to celebrate if you're able to pull this off?
Answe

## Tokenizing for GPT-2 Input

In [None]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# One token-id list per pair: question ids, the EOS id as separator, answer ids.
formatted_data = [
    tokenizer.encode(question_text, add_special_tokens=False)
    + [tokenizer.eos_token_id]
    + tokenizer.encode(answer_text, add_special_tokens=False)
    for question_text, answer_text in zip(all_questions, all_answers)
]

# Save formatted data to a file or use it for training
with open('formatted_data.txt', 'w') as file:
    # Each id list is decoded back to text and written on its own line.
    file.writelines(tokenizer.decode(tokens) + '\n' for tokens in formatted_data)

## Tokenizing for BERT Input

In [None]:
from transformers import BertTokenizer

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Each entry is (combined_ids, segment_ids): the question ids followed by the
# answer ids, plus a parallel list marking which segment each id belongs to.
bert_formatted_data = []
for question_text, answer_text in zip(all_questions, all_answers):
    # Encode with the BERT tokenizer created in this cell; the bare
    # `tokenizer` name refers to the GPT-2 tokenizer of the previous section.
    q_tokens = bert_tokenizer.encode(question_text, add_special_tokens=True, truncation=True, padding='max_length', max_length=128)
    a_tokens = bert_tokenizer.encode(answer_text, add_special_tokens=True, truncation=True, padding='max_length', max_length=128)

    # Create segment IDs: 0 for the question segment, 1 for the answer segment
    segment_ids = [0] * len(q_tokens) + [1] * len(a_tokens)

    # Combine tokens and segment IDs
    # NOTE(review): both halves are padded and given special tokens separately
    # before being concatenated; confirm the downstream model expects this
    # layout rather than a single sentence-pair encoding.
    combined_ids = q_tokens + a_tokens
    bert_formatted_data.append((combined_ids, segment_ids))

## Tokenizing for T5 Input

In [None]:
# Text-to-text pairs: (input_text, target_text). The input concatenates the
# question and the answer behind "question: " / " context: " prefixes; the
# target is the answer itself.
# NOTE(review): the answer therefore appears in both input and target; confirm
# that this is the intended training setup.
t5_formatted_data = [
    ("question: " + question_text + " context: " + answer_text, answer_text)
    for question_text, answer_text in zip(all_questions, all_answers)
]


## Textual Preprocessing for RNN

In [7]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Word-tokenise every question and every answer with NLTK; token_q[k] and
# token_a[k] hold the token lists of the k-th pair.
token_q = [word_tokenize(question_text) for question_text in all_questions]
token_a = [word_tokenize(answer_text) for answer_text in all_answers]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Show the first tokenised question followed by its tokenised answer.
for first_sample in (token_q[0], token_a[0]):
    print(first_sample)

['Just', 'your', 'thoughts', 'on', 'the', 'court', 'out', 'there', 'the', 'venue', 'and', 'your', 'thoughts', 'on', 'playing', 'these', 'games', 'the', 'final', 'few', 'games', 'in', 'Las', 'Vegas', '.']
['My', 'thoughts', 'on', 'the', 'court', 'is', 'it', 'looks', 'like', 'a', 'stage', '.', 'That', 'looks', 'dope', '.', 'This', 'whole', 'experience', 'I', 'think', 'it', "'s", 'a', 'good', 'steppingstone', 'for', 'us', 'so', 'we', "'re", 'excited', 'for', 'this', 'journey', '.', 'We', "'re", 'excited', 'to', 'go', 'out', 'there', 'and', 'compete', 'and', 'hopefully', 'get', 'the', 'win', '.']


In [9]:
def maxi(sents, others, n_remove=100):
    """Drop the longest entries of ``sents`` and the matching entries of ``others``.

    Both lists are modified in place. On each round the first entry of
    ``sents`` with the greatest length is deleted, together with the entry at
    the same position in ``others``. Removal stops early once ``sents`` is
    empty, so short inputs no longer raise.

    Args:
        sents: list of sequences whose lengths decide what is removed.
        others: list aligned index-by-index with ``sents``.
        n_remove: maximum number of entries to delete (default 100).

    Returns:
        ``(others, sents)``. Note the order: the companion list comes first.

    Raises:
        ValueError: if the two lists differ in length, since positions would
            no longer correspond.
    """
    if len(sents) != len(others):
        raise ValueError(
            f"sents and others must have the same length, got {len(sents)} and {len(others)}"
        )

    for _ in range(min(n_remove, len(sents))):
        # max() returns the first position among ties, i.e. the earliest
        # longest sentence.
        longest = max(range(len(sents)), key=lambda pos: len(sents[pos]))
        del sents[longest]
        del others[longest]
    return others, sents

def bracket_questions(sent):
    """Return a new token list: ``sent`` wrapped in ``<q>`` ... ``</q>`` markers."""
    opening, closing = ['<q>'], ['</q>']
    return opening + sent + closing

def bracket_answer(sent):
    """Return a new token list: ``sent`` wrapped in ``<a>`` ... ``</a>`` markers."""
    opening, closing = ['<a>'], ['</a>']
    return opening + sent + closing

# Pair counts before filtering.
print(len(token_q))
print(len(token_a))
# maxi() ranks by its first argument (the answers) and returns the lists in
# swapped order, hence (token_a, token_q) in and (token_q, token_a) out.
token_q, token_a = maxi(token_a, token_q)
# Pair counts after filtering.
print(len(token_q))
print(len(token_a))

# Wrap each remaining question/answer in its start/end markers, in place.
for position, (question_tokens, answer_tokens) in enumerate(zip(token_q, token_a)):
    token_q[position] = bracket_questions(question_tokens)
    token_a[position] = bracket_answer(answer_tokens)

1040
1040
940
940


In [10]:
from google.colab import drive
# Mount Google Drive at /content/drive inside the Colab runtime.
drive.mount('/content/drive')

# Relative directory prefix that later cells prepend to data file names.
# NOTE(review): being relative, it presumably relies on the working directory
# being /content; confirm.
data_dir = 'drive/MyDrive/'

Mounted at /content/drive


In [11]:
# Vocabulary: lowercase symbol -> integer id, numbered in order of first
# appearance (all questions first, then all answers).
# The entries of all_questions / all_answers are strings, so iterating over
# one of them yields single characters: this is a character-level vocabulary.
vocab = {}
for corpus in (all_questions, all_answers):
    for sent in corpus:
        for symbol in sent:
            # len(vocab) is evaluated before insertion, i.e. the next free id.
            vocab.setdefault(symbol.lower(), len(vocab))

# Reverse lookup: integer id -> symbol.
idx_to_token = {}
for symbol, symbol_id in vocab.items():
    idx_to_token[symbol_id] = symbol

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

def load_glove_embeddings(glove_file):
    """Read a GloVe text file and build an embedding matrix for ``vocab``.

    Every line of the file is split on whitespace: the first field is the
    entry, the remaining fields are its vector components.

    Args:
        glove_file: path of the GloVe text file (opened as UTF-8).

    Returns:
        ``(weights_matrix, embeddings)`` where ``weights_matrix`` is a
        ``len(vocab) x 100`` float tensor whose row ``vocab[w]`` holds the
        GloVe vector of ``w`` when the file contains it, and ``embeddings``
        maps every entry of the file to its vector as a NumPy array.
        Rows that are still all-zero after reading the file are filled with
        samples from a normal distribution (mean 0.0, std 0.6).
    """
    weights_matrix = torch.zeros(len(vocab), 100)
    embeddings = {}

    with open(glove_file, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.split()
            entry = fields[0]
            components = fields[1:]
            embeddings[entry] = np.array([float(component) for component in components])

            row = vocab.get(entry)
            if row is not None:
                weights_matrix[row] = torch.tensor(np.asarray(components, "float32"))

    # Vocabulary entries without a GloVe vector get a random initialisation.
    for row in vocab.values():
        if not torch.any(weights_matrix[row]):
            weights_matrix[row] = torch.normal(0.0, 0.6, size=(100, ))

    return weights_matrix, embeddings

# Build the embedding weight matrix for `vocab` and the raw GloVe lookup from
# the GloVe file stored under data_dir.
weights_matrix, glove_embeddings = load_glove_embeddings(data_dir + './glove.6B.100d.txt')

In [13]:
# Flatten the corpus into one stream of lowercase characters: for each pair,
# the question's characters followed by the answer's characters.
all_tokens = [
    symbol.lower()
    for question_text, answer_text in zip(all_questions, all_answers)
    for text in (question_text, answer_text)
    for symbol in text
]

# Map every symbol of the stream to its vocabulary id.
encode = np.vectorize(vocab.__getitem__)
encoded = encode(all_tokens)

# Sliding windows of 100 ids; each goal is its window shifted right by one
# position, i.e. the next-symbol target for every step of the window.
window_starts = range(len(encoded) - 101)
sequences = [encoded[start:start + 100] for start in window_starts]
goals = [encoded[start + 1:start + 101] for start in window_starts]
print(sequences[0])
print(goals[0])

[ 0  1  2  3  4  5  6  1  7  4  3  8  6  1  9  8  3  2  4  6 10  4  3  8
 11  4 12  6  1  7  3  4  6  1  3  4  3  8 11  7 11  4  3  8 11  4 13 11
 10  1 11  4 14 10 15  4  5  6  1  7  4  3  8  6  1  9  8  3  2  4  6 10
  4 16 17 14  5 18 10  9  4  3  8 11  2 11  4  9 14 19 11  2  4  3  8 11
  4 20 18 10]
[ 1  2  3  4  5  6  1  7  4  3  8  6  1  9  8  3  2  4  6 10  4  3  8 11
  4 12  6  1  7  3  4  6  1  3  4  3  8 11  7 11  4  3  8 11  4 13 11 10
  1 11  4 14 10 15  4  5  6  1  7  4  3  8  6  1  9  8  3  2  4  6 10  4
 16 17 14  5 18 10  9  4  3  8 11  2 11  4  9 14 19 11  2  4  3  8 11  4
 20 18 10 14]


In [14]:
class NEW_DS(Dataset):
    """Dataset of aligned (input, target) sequences.

    ``X`` and ``Y`` are stored as given; conversion to tensors happens per
    item in ``__getitem__``.
    """

    def __init__(self, X, Y):
        # Keep references only; nothing is copied or converted here.
        self.X, self.Y = X, Y

    def __len__(self):
        """Number of samples, taken from ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``idx``-th input and target, each as a new tensor."""
        sample = torch.tensor(self.X[idx])
        target = torch.tensor(self.Y[idx])
        return sample, target


# Number of (sequence, goal) pairs per mini-batch.
batch_size = 32
dataset = NEW_DS(sequences, goals)

# Pass the variable rather than a second literal so that changing batch_size
# above actually takes effect. Samples are served in their original order.
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [17]:
class MyLSTM(nn.Module):
    """Embedding -> single LSTM -> linear projection onto the vocabulary.

    Args:
        vocab_size: number of embedding rows and of output logits per step.
        embedding_size: width of each embedding vector.
        hidden_size: width of the LSTM hidden state.
        embedding_weights: tensor used as the initial, trainable embedding
            matrix (it replaces the randomly initialised one).
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, embedding_weights):
        super().__init__()
        embedding_layer = nn.Embedding(vocab_size, embedding_size)
        # Swap in the supplied weights and keep them trainable.
        embedding_layer.weight = nn.Parameter(embedding_weights)
        embedding_layer.weight.requires_grad = True
        self.embedding = embedding_layer
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return logits over the vocabulary for every position of ``x``.

        ``x`` holds integer ids with the batch dimension first.
        """
        sequence_features, _final_state = self.lstm(self.embedding(x))
        return self.linear(sequence_features)

In [None]:
# Model dimensions: one output logit per vocabulary entry; the embedding width
# matches the 100 columns of weights_matrix.
vocab_size = len(vocab)
embedding_size = 100
hidden_size = 256

model = MyLSTM(vocab_size, embedding_size, hidden_size, weights_matrix)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch_idx, batch in enumerate(data_loader):
        inputs = batch[0].to(device)
        targets = batch[1].to(device)

        optimizer.zero_grad()

        # The model emits (batch, step, vocab); the loss wants the class
        # dimension second, so swap the last two axes.
        logits = model(inputs).transpose(1, 2)
        loss = criterion(logits, targets)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Progress line every 100 batches.
        if batch_idx % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(data_loader)}], Loss: {loss.item():.4f}')

    # Mean per-batch loss over the epoch.
    average_loss = total_loss / len(data_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Average Loss: {average_loss:.4f}')

Epoch [1/10], Batch [1/13718], Loss: 3.7962
Epoch [1/10], Batch [101/13718], Loss: 2.7664
Epoch [1/10], Batch [201/13718], Loss: 2.2699
Epoch [1/10], Batch [301/13718], Loss: 2.2106
Epoch [1/10], Batch [401/13718], Loss: 2.1118
Epoch [1/10], Batch [501/13718], Loss: 1.9540
Epoch [1/10], Batch [601/13718], Loss: 1.7230


In [None]:
model.eval()

# Define a starting sequence for prediction.
# The vocabulary is keyed by single lowercase characters, so the seed text is
# split into characters; whole words are not vocabulary keys.
start_sequence = list('this is a')

# Encode the starting sequence to integers
encoded_start_sequence = [vocab[char] for char in start_sequence]

# Convert the encoded start sequence to a PyTorch tensor
inputs = torch.tensor(encoded_start_sequence).unsqueeze(0)  # Add batch dimension

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
inputs = inputs.to(device)

# Greedy generation: repeatedly append the most likely next character.
with torch.no_grad():
    for _ in range(50):  # Predict the next 50 characters
        # The whole sequence so far is fed again on every step.
        outputs = model(inputs)
        # Get the output logits for the last character in the sequence
        last_output = outputs[:, -1, :]
        # Get the index of the predicted character
        predicted_index = torch.argmax(last_output, dim=1).item()
        # Map the index back to the character
        predicted = idx_to_token[predicted_index]
        # Stop marker check; the character vocabulary contains no such entry,
        # so this only matters if an end-of-sequence symbol is added later.
        if predicted == '</s>':
            break
        # Characters are printed back to back so the output reads as text.
        print(predicted, end='')

        # Update the input sequence for the next prediction
        inputs = torch.cat((inputs, torch.tensor([[predicted_index]]).to(device)), dim=1)