In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file found in it
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install transformers

In [None]:
import torch
import pandas as pd
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from tqdm import tqdm

In [None]:
import pandas as pd
from transformers import BertTokenizer
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Custom function to split text into sentences
def split_into_sentences(text):
    """Split ``text`` into sentences at every '.', '!' or '?' character.

    Each terminator closes the current sentence and is kept as its last
    character. Whatever follows the final terminator becomes one more
    sentence. Every sentence is stripped of surrounding whitespace, and
    pieces that are empty after stripping (for example the trailing space
    or newline after the last full stop) are dropped instead of being
    returned as empty strings.

    Args:
        text: The string to split.

    Returns:
        A list of non-empty sentence strings in their original order.
        Consecutive terminators ("...") each form their own one-character
        sentence.
    """
    sentences = []
    start = 0
    for position, char in enumerate(text):
        if char in '.!?':
            # Slice instead of growing a string character by character.
            piece = text[start:position + 1].strip()
            if piece:
                sentences.append(piece)
            start = position + 1
    # Text after the last terminator; skipped when it is only whitespace.
    remainder = text[start:].strip()
    if remainder:
        sentences.append(remainder)
    return sentences

# Function to chunk text by custom sentences while respecting the token limit
def chunk_text_by_sentences(text, tokenizer, max_length):
    """Yield sentence-aligned chunks of ``text`` that stay under a token budget.

    Sentences (from ``split_into_sentences``) are appended to the current
    chunk while the chunk's token count, including the tokenizer's special
    tokens, plus the next sentence's token count is strictly below
    ``max_length``. Otherwise the current chunk is yielded and a new one is
    started with that sentence.

    The chunk's token count is kept as a running sum of per-sentence counts
    instead of re-tokenizing the whole chunk for every sentence. This assumes
    that tokenizing space-joined sentences gives the concatenation of the
    per-sentence tokens, which holds for whitespace-pre-tokenizing tokenizers
    such as BERT WordPiece.

    Args:
        text: The text to chunk.
        tokenizer: Object with ``encode(text, add_special_tokens=...)``
            returning a sequence of token ids.
        max_length: Token limit a chunk (with special tokens) must stay below.

    Yields:
        Chunk strings: the chunk's sentences, each followed by one space.
        A single sentence that alone reaches the limit is still yielded as
        its own chunk; it is not split further here.
    """
    # Tokens the tokenizer adds around any input (e.g. [CLS]/[SEP]); measured once.
    special_token_count = len(tokenizer.encode(""))
    chunk_sentences = []
    chunk_token_count = 0
    for sentence in split_into_sentences(text):
        sentence_token_count = len(tokenizer.encode(sentence, add_special_tokens=False))
        if special_token_count + chunk_token_count + sentence_token_count < max_length:
            chunk_sentences.append(sentence)
            chunk_token_count += sentence_token_count
        else:
            if chunk_sentences:
                yield " ".join(chunk_sentences) + " "
            chunk_sentences = [sentence]
            chunk_token_count = sentence_token_count
    if chunk_sentences:
        yield " ".join(chunk_sentences) + " "

# Worker function for processing each row
def process_row(row, tokenizer, max_length):
    """Expand one DataFrame row into one record per text chunk.

    Args:
        row: A row exposing ``row['text']``, ``row['score']`` and ``row.name``.
        tokenizer: Tokenizer forwarded to ``chunk_text_by_sentences``.
        max_length: Token limit forwarded to ``chunk_text_by_sentences``.

    Returns:
        A list of dicts with keys ``'text'`` (the chunk), ``'score'`` (the
        row's score, repeated for every chunk) and ``'chunk_index'`` (the
        row's ``name``, which ties each chunk back to its source row).
    """
    chunk_records = []
    for chunk in list(chunk_text_by_sentences(row['text'], tokenizer, max_length)):
        chunk_records.append(dict(text=chunk, score=row['score'], chunk_index=row.name))
    return chunk_records

# Preprocess function with multithreading
def preprocess_data(data, tokenizer, max_length, num_threads=10):
    """Chunk every row of ``data`` and return the chunks as a new DataFrame.

    Each row is handed to ``process_row`` on a thread pool. Results are
    collected in submission order, not completion order, so the output row
    order is the same on every run and follows the row order of ``data``.

    Args:
        data: DataFrame whose rows provide ``'text'`` and ``'score'``.
        tokenizer: Tokenizer forwarded to ``process_row``.
        max_length: Token limit forwarded to ``process_row``.
        num_threads: Maximum number of worker threads.

    Returns:
        A DataFrame with columns ``'text'``, ``'score'`` and ``'chunk_index'``,
        one row per chunk. The columns are present even when no chunk was
        produced, so downstream column access does not fail on empty input.
    """
    new_rows = []
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Create future tasks for each row
        futures = [executor.submit(process_row, row, tokenizer, max_length) for _, row in data.iterrows()]
        # Iterating the futures in submission order keeps the output deterministic;
        # result() blocks until that particular row is done.
        for future in tqdm(futures, total=len(futures), desc="Preprocessing Data"):
            new_rows.extend(future.result())
    return pd.DataFrame(new_rows, columns=['text', 'score', 'chunk_index'])

# Load datasets
# NOTE(review): the "validation" frame is read from sample_submission.csv, the same file the
# inference cell later loads as the submission data. Confirm that its 'score' column holds
# real labels before treating results on it as validation.
train_data = pd.read_csv('/kaggle/input/sahaj-stat/train.csv')
validation_data = pd.read_csv('/kaggle/input/sahaj-stat/sample_submission.csv')

# Initialize tokenizer (must match the checkpoint loaded for the model later on)
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
max_length = 512  # Token limit for BERT

# Preprocess training and validation data: every text is split into sentence-aligned
# chunks; each chunk row keeps the source row's score and its index as 'chunk_index'
train_data_processed = preprocess_data(train_data, tokenizer, max_length)
validation_data_processed = preprocess_data(validation_data, tokenizer, max_length)


In [None]:
# Chunk texts from the processed training and validation frames
train_texts = train_data_processed['text'].tolist()
val_texts = validation_data_processed['text'].tolist()

# Regression targets: training scores become a float32 tensor right away,
# validation scores stay a NumPy array and are converted when the dataset is built
train_scores = torch.tensor(train_data_processed['score'].values, dtype=torch.float32)
val_scores = validation_data_processed['score'].values

# Tokenize both splits, truncating over-long chunks and padding to the longest sequence
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors='pt')

# Datasets of (input_ids, attention_mask, score) triples
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_scores)
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(val_scores, dtype=torch.float32))

# Only the training loader shuffles; validation keeps its row order
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4)

In [None]:

# Load BERT-Large for sequence classification; num_labels=1 gives a single-output
# head, so the logits are used directly as the regression prediction
model = BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=1)
model.train()

# Define loss function and optimizer with weight decay (L2 regularization)
criterion = nn.MSELoss()
weight_decay = 0.1  # Adjust the weight decay hyperparameter as needed
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps=1e-6 keeps the
# numerical default of the transformers implementation.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=weight_decay, eps=1e-6)

# The scheduler must be sized for the number of optimizer steps that will actually run
# (batches per epoch * epochs). It was previously built for 5 epochs while training runs 3,
# so the learning rate never completed its linear decay. Keep `epochs` in sync with the
# training loop.
epochs = 3
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=len(train_dataloader) * epochs)


In [None]:
# Training loop
# Keep `epochs` in sync with the num_training_steps the scheduler was created with.
epochs = 3

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

for epoch in range(epochs):
    model.train()
    # Accumulate the summed squared error and the sample count so that the epoch
    # metric is a true RMSE (sqrt of the mean squared error over all samples),
    # not the mean of per-batch RMSE values.
    squared_error_sum = 0.0
    sample_count = 0
    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{epochs}'):
        optimizer.zero_grad()
        input_ids, attention_mask, targets = batch
        input_ids, attention_mask, targets = input_ids.to(device), attention_mask.to(device), targets.to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits.view(-1), targets)
        # The small epsilon keeps the gradient of sqrt finite when the batch MSE is exactly 0.
        rmse = torch.sqrt(loss + 1e-8)
        rmse.backward()
        optimizer.step()
        scheduler.step()
        # criterion returns the batch mean, so multiply back by the batch size.
        batch_size = targets.size(0)
        squared_error_sum += loss.item() * batch_size
        sample_count += batch_size

    # Calculate and print RMSE for this epoch
    avg_rmse = (squared_error_sum / max(sample_count, 1)) ** 0.5
    print(f'Epoch {epoch+1}/{epochs}, RMSE: {avg_rmse:.4f}')



In [None]:
# Persist the fine-tuned model to the Kaggle working directory; the inference cell
# reloads it from this same path.
model.save_pretrained('/kaggle/working/bert_large_regression_model')


In [None]:
import torch
import pandas as pd
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Load the submission data from sample_submission.csv
submission_data = pd.read_csv('/kaggle/input/sahaj-stat/sample_submission.csv')

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

# Split every submission text into sentence-aligned chunks, exactly as was done for the
# training data. Each chunk row carries 'chunk_index' (the source row), which the
# aggregation step groups on. Predicting on the raw texts instead would silently truncate
# long texts and leave nothing to aggregate per chunk.
preprocessed_submission_data = preprocess_data(submission_data, tokenizer, max_length)

# Tokenize the chunked submission texts
submission_encodings = tokenizer(preprocessed_submission_data['text'].tolist(), truncation=True, padding=True, return_tensors='pt')

# Create a DataLoader for submission data (no shuffling, so predictions stay aligned with rows)
submission_dataset = TensorDataset(submission_encodings['input_ids'], submission_encodings['attention_mask'])
submission_dataloader = DataLoader(submission_dataset, batch_size=4)

# Load the fine-tuned model saved by the training section
model = BertForSequenceClassification.from_pretrained('/kaggle/working/bert_large_regression_model')

# Use GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Make predictions on the submission data
model.eval()  # Set the model to evaluation mode for inference

# Create a list to store the predicted scores (one per chunk)
predicted_scores = []

# Iterate over the submission data with tqdm progress bar
with torch.no_grad():
    for batch in tqdm(submission_dataloader, desc="Inferencing"):
        input_ids, attention_mask = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predicted_scores.extend(outputs.logits.view(-1).cpu().tolist())

# One prediction per chunk row, in the same order as the chunk frame
assert len(predicted_scores) == len(preprocessed_submission_data), "prediction/chunk count mismatch"
preprocessed_submission_data['predicted_score'] = predicted_scores




In [None]:
# Function to concatenate texts and average scores
def concatenate_and_average(data):
    """Collapse chunk-level rows back to one row per ``chunk_index``.

    Args:
        data: DataFrame with columns ``'chunk_index'``, ``'text'`` and
            ``'predicted_score'``.

    Returns:
        A DataFrame with one row per distinct ``chunk_index`` (in groupby
        order) and two columns: ``'text'``, the group's texts joined with a
        single space, and ``'average_score'``, the mean of the group's
        predicted scores.
    """
    per_group = [
        (' '.join(chunk_rows['text']), chunk_rows['predicted_score'].mean())
        for _, chunk_rows in data.groupby('chunk_index')
    ]
    return pd.DataFrame({
        'text': [joined_text for joined_text, _ in per_group],
        'average_score': [mean_score for _, mean_score in per_group],
    })

# Apply the function to the preprocessed submission data: chunk rows sharing a
# 'chunk_index' are merged into a single row
final_results = concatenate_and_average(preprocessed_submission_data)

# final_results now has one row per chunk_index with columns 'text' (the joined chunk
# texts) and 'average_score' (the mean predicted score of those chunks)

In [None]:
# Save the aggregated predictions (columns 'text' and 'average_score') to a new CSV file
# in the Kaggle working directory; the DataFrame index is not written
final_results.to_csv('/kaggle/working/predicted_submission_512_token.csv', index=False)

In [None]:
# Display the submission frame loaded from sample_submission.csv for a visual check
submission_data

In [None]:
# Largest value in the 'score' column of the loaded submission file
submission_data.score.max()