<a href="https://colab.research.google.com/github/shoond/AIEarthHack_TeamTerminal/blob/main/AI_Earthhack_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset




In [None]:
# Load the AI EarthHack training CSV (Latin-1 encoded) that the fine-tuning cell reads from.
SA90_CSV_URL = 'https://raw.githubusercontent.com/shoond/portfolio/datasets/AI%20EarthHack%20Dataset_SA90.csv'
df_90 = pd.read_csv(SA90_CSV_URL, encoding='Latin-1')

In [None]:
# Fine-tune BERT using the cleaned-up AI EarthHack dataset

# Seed PyTorch so the classifier-head initialisation and batch shuffling are repeatable.
torch.manual_seed(42)

# Train on the GPU when one is present; otherwise fall back to the CPU instead of crashing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rows with a missing text or target can be neither tokenized nor regressed on, so skip them.
train_df = df_90.dropna(subset=['solution', 'solution_sentiment'])

# Tokenize 'solution' text using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [
    tokenizer.encode(text, add_special_tokens=True, max_length=512, truncation=True)
    for text in train_df['solution']
]

# Pad sequences to the same length, using the tokenizer's own padding id
pad_token_id = tokenizer.pad_token_id
max_len = max(len(token_ids) for token_ids in tokenized_texts)
padded_texts = torch.tensor(
    [token_ids + [pad_token_id] * (max_len - len(token_ids)) for token_ids in tokenized_texts]
)

# Convert labels (sentiment scores) to tensor
labels = torch.tensor(train_df['solution_sentiment'].tolist())

# Split data into train and test sets 70/30.
train_inputs, test_inputs, train_labels, test_labels = train_test_split(padded_texts, labels, random_state=42, test_size=0.3)

# Create DataLoader for train and test sets
train_data = TensorDataset(train_inputs, train_labels)
train_dataloader = DataLoader(train_data, batch_size=4, shuffle=True)

test_data = TensorDataset(test_inputs, test_labels)
test_dataloader = DataLoader(test_data, batch_size=4)

# Load pre-trained BERT model for sequence classification.
# num_labels=1 gives a single-output head; combined with float targets the model
# computes a regression (MSE) loss rather than a classification loss.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
model.to(device)

# Set optimizer and learning rate
optimizer = AdamW(model.parameters(), lr=2e-5)

# Fine-tuning the model
model.train()
for epoch in range(5):  # Train for 5 epochs (you can adjust this)
    # Batch-local names keep the full `labels` tensor above from being overwritten.
    for batch_inputs, batch_labels in train_dataloader:
        batch_inputs = batch_inputs.to(device)
        # Mask out padding so the model does not attend to filler tokens;
        # the inference path passes an attention mask as well.
        attention_mask = (batch_inputs != pad_token_id).long()
        # Regression targets: float, shaped (batch, 1) to line up with the single logit.
        batch_labels = batch_labels.float().unsqueeze(1).to(device)

        optimizer.zero_grad()
        outputs = model(batch_inputs, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average Accuracy on Test Set: 0.0


In [None]:
# `model` and `tokenizer` are the fine-tuned BERT model and its tokenizer from the cell above.
# Write both into the local Colab filesystem so they can be reloaded later.
save_dir = '/content/'
model.save_pretrained(save_dir)  # model weights and configuration
tokenizer.save_pretrained(save_dir)  # tokenizer files


('/content/tokenizer_config.json',
 '/content/special_tokens_map.json',
 '/content/vocab.txt',
 '/content/added_tokens.json')

In [None]:
#Run model against new dataset
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from google.colab import files

def load_model(model_path):
    """Load a fine-tuned BERT sequence-classification model.

    Args:
        model_path: Location handed to ``BertForSequenceClassification.from_pretrained``,
            e.g. the directory the model was saved to with ``save_pretrained``.

    Returns:
        The loaded ``BertForSequenceClassification`` instance.
    """
    # Honour the caller-supplied location; previously the argument was ignored
    # and '/content/' was always used.
    return BertForSequenceClassification.from_pretrained(model_path)

def preprocess_text(text, tokenizer):
    """Encode one text into fixed-length model inputs.

    The text is wrapped in special tokens, truncated to at most 512 tokens and
    padded up to that same length; an attention mask is requested alongside.

    Args:
        text: The raw string to encode.
        tokenizer: Object exposing ``encode_plus`` (a BERT tokenizer in this notebook).

    Returns:
        Whatever ``tokenizer.encode_plus`` returns for these options.
    """
    encoding_options = dict(
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
    )
    return tokenizer.encode_plus(text, **encoding_options)

def predict_sentiment(text, model, tokenizer):
    """Predict the sentiment score of a single text.

    Args:
        text: The raw string to score.
        model: Sequence-classification model with a single output logit.
        tokenizer: Tokenizer forwarded to ``preprocess_text``.

    Returns:
        The model's raw output as a Python float.

    Note:
        The model in this notebook is created with ``num_labels=1`` and trained
        on float targets, i.e. as a regressor on the raw logit. The logit is
        therefore returned unchanged; squashing it through a sigmoid would put
        predictions on a different scale from the training targets.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    encoded = preprocess_text(text, tokenizer)

    # Add a leading batch dimension of 1 before sending the inputs to the device.
    input_ids = torch.tensor([encoded['input_ids']]).to(device)
    attention_mask = torch.tensor([encoded['attention_mask']]).to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # .item() requires exactly one element, which holds for a single text and one logit.
    return outputs.logits.item()


def read_csv(file_path):
    """Read a Latin-1 encoded CSV and keep only rows that have both texts.

    Args:
        file_path: Path or URL of the CSV file.

    Returns:
        A DataFrame without the rows whose 'problem' or 'solution' value is NaN.
    """
    required_columns = ['problem', 'solution']
    frame = pd.read_csv(file_path, encoding='Latin-1')
    return frame.dropna(subset=required_columns)


def main():
    """Score every 'solution' in the AI EarthHack dataset and export the result.

    Loads the fine-tuned model from '/content', reads the dataset CSV, adds a
    'predicted_sentiment' column holding one prediction per row, writes the
    frame to 'Sentiment_Results.csv' and triggers a Colab download of that file.
    """
    # Load the fine-tuned model
    model_path = '/content'
    model = load_model(model_path)

    # Load the BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Read CSV file
    csv_file_path = 'https://raw.githubusercontent.com/shoond/portfolio/datasets/AI%20EarthHack%20Dataset.csv'
    data_to_test = read_csv(csv_file_path)

    # Perform sentiment analysis on each row. Scores are collected in row order
    # and attached in one step via assign(), which returns a new frame instead
    # of writing cell by cell into the filtered frame.
    predicted_scores = [
        predict_sentiment(text, model, tokenizer)
        for text in data_to_test['solution']
    ]
    scored = data_to_test.assign(predicted_sentiment=predicted_scores)

    # Saving the updated DataFrame to a new CSV locally
    updated_csv_filename = 'Sentiment_Results.csv'  # Modify the filename as needed
    scored.to_csv(updated_csv_filename, index=False)
    # Download the same file that was just written, so renaming it above
    # cannot leave the download pointing at a stale name.
    files.download(updated_csv_filename)
    print(f"Updated DataFrame and saved as {updated_csv_filename} to downloads folder.")


if __name__ == "__main__":
    main()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Updated DataFrame and saved as Sentiment_Results.csv to downloads folder.
