In [49]:
import pandas as pd

# Load the data from the provided Excel file
data = pd.read_excel('/kaggle/input/new-weed-tweets-with-bert-xlsx/new_weed_tweets_with_bert .xlsx')
data.head()

Unnamed: 0,new_weed_tweets_with_bert,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,Key word,Post Date,Username,Account id,account page,Post content,Post id,# of replies,# of retweets,# of likes,...,tweet link,# of follower,# of following,Geo location,County,State,Cleaned_Content,VADER_Sentiment_Score,BERT_Sentiment,Manual Annotation
1,weed,2020-12-31 07:01:21,Tanya Martin,geekgoodgirl,https://twitter.com/geekgoodgirl,Breakfast. Late. Need cash ðŸ¤‘ and weed. https:/...,1344418323627460096,0,0,0,...,https://twitter.com/Olympics/status/1344418323...,884,785,"Glenn, California",Glenn,California,breakfast late need cash and weed,0,3 stars,Positive
2,weed,2020-12-31 06:59:42,Alice Moon,thealicemoon,https://twitter.com/thealicemoon,@JasonRBradwell @ThatChristinaG Cannabis brand...,1344417909787879936,0,0,0,...,https://twitter.com/Olympics/status/1344417909...,6557,1508,"Los Angeles, California",Los Angeles,California,jasonrbradwell thatchristinag cannabis brands ...,-0.6249,1 star,Negative
3,weed,2020-12-31 06:52:30,Will Cunningham,wcunningham11,https://twitter.com/wcunningham11,@RyanAFournier But are you going to take it ba...,1344416096628000000,0,0,0,...,https://twitter.com/Olympics/status/1344416096...,16,169,"Shasta, California",Shasta,California,ryanafournier but are you going to take it bac...,0.3724,3 stars,Neutral
4,weed,2020-12-31 06:35:00,420,420,https://twitter.com/420,I'm old enough to remember when the headline w...,1344411692969540096,0,0,0,...,https://twitter.com/Olympics/status/1344411692...,112764,1034,"San Francisco, California",San Francisco,California,im old enough to remember when the headline wa...,0.8516,5 stars,Negative


In [50]:
# Set the first row as the header
data.columns = data.iloc[0]
data = data.drop(0)

# Extract relevant columns
relevant_data = data[["VADER_Sentiment_Score", "BERT_Sentiment", "Manual Annotation"]].copy()

# Convert VADER_Sentiment_Score to numeric
relevant_data["VADER_Sentiment_Score"] = pd.to_numeric(relevant_data["VADER_Sentiment_Score"], errors='coerce')

relevant_data.head()

Unnamed: 0,VADER_Sentiment_Score,BERT_Sentiment,Manual Annotation
1,0.0,3 stars,Positive
2,-0.6249,1 star,Negative
3,0.3724,3 stars,Neutral
4,0.8516,5 stars,Negative
5,0.802,5 stars,Positive


In [51]:
# Assign sentiment labels based on the VADER_Sentiment_Score
relevant_data["VADER_Label"] = "Neutral"
relevant_data.loc[relevant_data["VADER_Sentiment_Score"] > 0.05, "VADER_Label"] = "Positive"
relevant_data.loc[relevant_data["VADER_Sentiment_Score"] < -0.05, "VADER_Label"] = "Negative"

# Assign sentiment labels based on the BERT_Sentiment
bert_mapping = {"1 star": "Negative", "2 stars": "Negative",
                "3 stars": "Neutral", "4 stars": "Positive", "5 stars": "Positive"}
relevant_data["BERT_Label"] = relevant_data["BERT_Sentiment"].map(bert_mapping)

relevant_data[["VADER_Sentiment_Score", "VADER_Label", "BERT_Sentiment", "BERT_Label", "Manual Annotation"]].head()

Unnamed: 0,VADER_Sentiment_Score,VADER_Label,BERT_Sentiment,BERT_Label,Manual Annotation
1,0.0,Neutral,3 stars,Neutral,Positive
2,-0.6249,Negative,1 star,Negative,Negative
3,0.3724,Positive,3 stars,Neutral,Neutral
4,0.8516,Positive,5 stars,Positive,Negative
5,0.802,Positive,5 stars,Positive,Positive


In [52]:
from sklearn.metrics import accuracy_score

# Calculate accuracy for VADER and BERT
vader_accuracy = accuracy_score(relevant_data["Manual Annotation"], relevant_data["VADER_Label"])
bert_accuracy = accuracy_score(relevant_data["Manual Annotation"], relevant_data["BERT_Label"])

vader_accuracy, bert_accuracy

(0.46546546546546547, 0.3843843843843844)

Improve Accuracy

In [53]:
from sklearn.model_selection import train_test_split

In [54]:
from sklearn.metrics import accuracy_score

In [55]:
# Extract relevant columns from the original dataset
X = data["Cleaned_Content"]
y = relevant_data["Manual Annotation"]

# Split data into training and validation sets (80-20 split)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train.shape, X_val.shape

((799,), (200,))

In [56]:
'''
!pip install transformers
!pip install torch
!pip install tqdm
!pip install sklearn
'''

'\n!pip install transformers\n!pip install torch\n!pip install tqdm\n!pip install sklearn\n'

In [57]:

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import Trainer, TrainingArguments

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 1. Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create a mapping for sentiment labels to integers
label_mapping = {"Negative": 0, "Neutral": 1, "Positive": 2}
y_train_mapped = [label_mapping[label] for label in y_train]
y_val_mapped = [label_mapping[label] for label in y_val]

train_dataset = SentimentDataset(X_train.tolist(), y_train_mapped, tokenizer)
val_dataset = SentimentDataset(X_val.tolist(), y_val_mapped, tokenizer)


In [58]:
'''
# Initialize a BERT model for sequence classification with three labels (Negative, Neutral, Positive)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
'''

"\n# Initialize a BERT model for sequence classification with three labels (Negative, Neutral, Positive)\nmodel = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)\n"

In [59]:
#!pip install transformers[torch]

In [60]:
#!pip install accelerate

In [61]:
!pip install accelerate -U



In [62]:
'''
# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Define a function to compute metrics (accuracy in our case)
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy,
    }

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()
'''

'\n# Set up training arguments\ntraining_args = TrainingArguments(\n    output_dir=\'./results\',\n    num_train_epochs=5,\n    per_device_train_batch_size=8,\n    per_device_eval_batch_size=8,\n    logging_dir=\'./logs\',\n    logging_steps=10,\n    evaluation_strategy="epoch",\n    save_strategy="epoch",\n    load_best_model_at_end=True,\n    metric_for_best_model="accuracy",\n)\n\n# Define a function to compute metrics (accuracy in our case)\nfrom sklearn.metrics import accuracy_score\n\ndef compute_metrics(pred):\n    labels = pred.label_ids\n    preds = pred.predictions.argmax(-1)\n    accuracy = accuracy_score(labels, preds)\n    return {\n        \'accuracy\': accuracy,\n    }\n\n# Initialize the Trainer\ntrainer = Trainer(\n    model=model,\n    args=training_args,\n    train_dataset=train_dataset,\n    eval_dataset=val_dataset,\n    compute_metrics=compute_metrics,\n    tokenizer=tokenizer,\n)\n\n# Train the model\ntrainer.train()\n'

In [63]:
#trainer.save_model('final')

In [64]:
'''
# Evaluate the model on the validation set
results = trainer.evaluate()

print(results)
'''

'\n# Evaluate the model on the validation set\nresults = trainer.evaluate()\n\nprint(results)\n'

In [65]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained("/kaggle/working/results/checkpoint-500")
model = BertForSequenceClassification.from_pretrained("/kaggle/working/results/checkpoint-500")

# Load the dataset
new_data = pd.read_csv('/kaggle/input/new-weed-tweets-with-bert-csv/new_weed_tweets_with_bert.csv')
'''
# Ensure the content is in string format
new_data['Cleaned_Content'] = new_data['Cleaned_Content'].astype(str)

# Tokenize the content
inputs = tokenizer(new_data['Cleaned_Content'].tolist(), return_tensors="pt", padding=True, truncation=True, max_length=512)

# Make predictions
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=1)

# Add predictions to the dataframe
new_data['Predictions'] = predictions.tolist()

# Save or display the results
new_data.to_csv('path_to_save_predictions.csv', index=False)
'''

'\n# Ensure the content is in string format\nnew_data[\'Cleaned_Content\'] = new_data[\'Cleaned_Content\'].astype(str)\n\n# Tokenize the content\ninputs = tokenizer(new_data[\'Cleaned_Content\'].tolist(), return_tensors="pt", padding=True, truncation=True, max_length=512)\n\n# Make predictions\nwith torch.no_grad():\n    outputs = model(**inputs)\n    predictions = torch.argmax(outputs.logits, dim=1)\n\n# Add predictions to the dataframe\nnew_data[\'Predictions\'] = predictions.tolist()\n\n# Save or display the results\nnew_data.to_csv(\'path_to_save_predictions.csv\', index=False)\n'

In [68]:
chunk_size = 10  # Adjust based on your memory capacity
num_chunks = len(new_data) // chunk_size + (1 if len(new_data) % chunk_size != 0 else 0)
all_predictions = []

# Process in chunks
for i in range(num_chunks):
    start_idx = i * chunk_size
    end_idx = (i + 1) * chunk_size
    data_chunk = new_data.iloc[start_idx:end_idx].copy()  # Here's the change
    
    # Ensure the content is in string format
    data_chunk['Cleaned_Content'] = data_chunk['Cleaned_Content'].astype(str)
    
    # Tokenize the content
    inputs = tokenizer(data_chunk['Cleaned_Content'].tolist(), return_tensors="pt", padding=True, truncation=True, max_length=512)
    
    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)
        all_predictions.extend(predictions.tolist())

# Add predictions to the dataframe
new_data['Predictions'] = all_predictions

In [70]:
new_data.to_csv('new_sentiment_score')

In [71]:
# Save the DataFrame to a CSV file
new_data.to_csv('new_weed_tweets_with_predictions.csv', index=False)


In [66]:
chunk_size = 10  # Adjust based on your memory capacity
num_chunks = len(new_data) // chunk_size + (1 if len(new_data) % chunk_size != 0 else 0)
all_predictions = []

# Process in chunks
for i in range(num_chunks):
    start_idx = i * chunk_size
    end_idx = (i + 1) * chunk_size
    data_chunk = new_data.iloc[start_idx:end_idx]
    
    # Ensure the content is in string format
    data_chunk['Cleaned_Content'] = data_chunk['Cleaned_Content'].astype(str)
    
    # Tokenize the content
    inputs = tokenizer(data_chunk['Cleaned_Content'].tolist(), return_tensors="pt", padding=True, truncation=True, max_length=512)
    
    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)
        all_predictions.extend(predictions.tolist())

# Add predictions to the dataframe
new_data['Predictions'] = all_predictions


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_chunk['Cleaned_Content'] = data_chunk['Cleaned_Content'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_chunk['Cleaned_Content'] = data_chunk['Cleaned_Content'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_chunk['Cleaned_Content'] = data_chunk['Cleaned_Conte

KeyboardInterrupt: 

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the model and tokenizer
model = BertForSequenceClassification.from_pretrained("./results")  # Assuming model is saved in "results" directory
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def predict_sentiment(text):
    # Tokenize the input text
    # Ensure the content is in string format
new_data['Cleaned_Content'] = new_data['Cleaned_Content'].astype(str)

# Tokenize the content
inputs = tokenizer(new_data['Cleaned_Content'].tolist(), return_tensors="pt", padding=True, truncation=True, max_length=512)

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the predicted class
    predicted_class = torch.argmax(outputs.logits, dim=1).item()

    return predicted_class

# Example usage
sentiment = predict_sentiment("I love this product!")
print(f"The predicted sentiment is: {sentiment}")
