In [None]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import numpy as np
import torch


In [None]:
# Read the training tweets; the file is decoded as latin-1.
df = pd.read_csv(
    'path/to/train.csv',
    encoding='latin-1',
)


In [None]:
# Class balance of the `target` column: label 1 is reported as positive, label 0 as negative.
label_counts = df['target'].value_counts()
positive_count, negative_count = (label_counts.get(cls, 0) for cls in (1, 0))

print(f"Positive samples: {positive_count}")
print(f"Negative samples: {negative_count}")

Positive samples: 10500
Negative samples: 12498


In [None]:
# Summarise the frame: row count, per-column non-null counts and dtypes.
df.info()



RangeIndex: 22998 entries, 0 to 22997
Data columns (total 5 columns):
 # Column Non-Null Count Dtype 
--- ------ -------------- ----- 
 0 id 7598 non-null float64
 1 keyword 7538 non-null object 
 2 location 5070 non-null object 
 3 text 22998 non-null object 
 4 target 22998 non-null int64 
dtypes: float64(1), int64(1), object(3)
memory usage: 898.5+ KB


In [None]:
# Split the loaded frame into the feature frame `x` and the target series `labels`.
data_ = pd.DataFrame(df)
labels = df['target']
x = data_.drop(columns='target')

In [None]:
# Derive the cleaned `text_` column and a `hashtag` column, then drop the raw inputs.
x = (
    x
    # Strip URLs and @mentions from the raw tweet.
    .assign(text=lambda frame: frame['text'].str.replace(r'http\S+|www\S+|@\S+', '', regex=True))
    # Collect hashtag words as one comma-separated string (NaN when a tweet has none).
    .assign(
        hashtag=lambda frame: frame['text']
        .str.findall(r'#(\w+)')
        .apply(lambda hashtags: ', '.join(hashtags) if hashtags else np.nan)
    )
    # Keep the hashtag words in the text but remove the '#' marker itself.
    .assign(text=lambda frame: frame['text'].str.replace('#', '', regex=False))
    # Remove '.', ',' and '!' and lowercase the result.
    .assign(text_=lambda frame: frame['text'].str.replace("[.,!]", "", regex=True).str.lower())
    .assign(hashtag=lambda frame: frame['hashtag'].str.lower())
    .drop(columns=['id', 'text', 'location'])
)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
# Fetch the NLTK resources needed by the preprocessing cells.
nltk.download('stopwords')  # stop-word list read via stopwords.words('english')
nltk.download('wordnet')  # presumably the corpus backing WordNetLemmatizer -- confirm
nltk.download('punkt_tab')  # presumably the tokenizer data used by nltk.word_tokenize -- confirm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Lowercase and tokenize ``text``, keep alphabetic non-stop-word tokens, and lemmatize them.

    Returns the surviving lemmas joined by single spaces.
    """
    kept = []
    for token in nltk.word_tokenize(text.lower()):
        # Skip anything that is not purely alphabetic, and English stop words.
        if not token.isalpha() or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)


In [None]:
# Run every cleaned tweet through the tokenize / stop-word / lemmatize step.
x['text_'] = x['text_'].map(preprocess_text)


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Sentence-embedding model read by extract_keyword_using_embeddings through the global name `model`.
# NOTE(review): a later cell rebinds `model` to the BERTweet classifier, so the keyword-extraction
# cells only work if they run before that cell.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


In [None]:
import re

def extract_keyword_using_embeddings(text, encoder=None):
    """Return the word of ``text`` whose embedding is most similar to the whole text.

    Args:
        text: The tweet text. Anything that is not a ``str`` (None, NaN, pd.NA, ...)
            is treated as "no text".
        encoder: Optional object exposing ``encode(list_of_str)``. Defaults to the
            module-level ``model``. Pass it explicitly when ``model`` may have been
            rebound (a later cell assigns the classifier to that same name).

    Returns:
        The best-matching word, or ``None`` when ``text`` is not a string or holds
        no words once punctuation is removed.
    """
    # Type check first: evaluating the truthiness of non-string values such as
    # pd.NA or a numpy array raises instead of returning the fallback.
    if not isinstance(text, str):
        return None

    # Clean the text to retain only words
    text = re.sub(r'[^\w\s]', '', text).strip()

    # Split before encoding so empty input never reaches the encoder.
    words = text.split()
    if not words:
        return None

    if encoder is None:
        encoder = model

    # Embed the whole tweet and each of its words.
    tweet_embedding = encoder.encode([text])[0]
    word_embeddings = encoder.encode(words)

    # Similarity of every word to the tweet; shape is (1, len(words)), so the
    # flattened argmax is directly an index into `words`.
    similarities = cosine_similarity([tweet_embedding], word_embeddings)
    best_word_index = int(np.argmax(similarities))

    return words[best_word_index]


In [None]:
#2 - Replacing all rows
# Every value in 'keyword' is overwritten with an embedding-derived keyword, so the
# column needs no prior normalisation (any earlier assignment would be discarded).
x['keyword'] = x['text_'].apply(extract_keyword_using_embeddings)

In [None]:
pip install datasets



In [None]:
from datasets import Dataset
# Attach the target so it travels with the processed features.
x['labels'] = labels
# Convert the processed frame to a Hugging Face Dataset (previously the raw `data_`
# frame was converted, which has neither the cleaned text nor the `labels` column).
hf_dataset = Dataset.from_pandas(x)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the shuffle reproducible.
train_data, val_data = train_test_split(
    x,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)


In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments


In [None]:
# Wrap both pandas splits as Hugging Face datasets.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_data, val_data)
)


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer from the same checkpoint ("vinai/bertweet-base") that the classifier is loaded from.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")


In [None]:
def tokenize_function(example):
    """Tokenize the ``text_`` field, truncating and padding to the tokenizer's max length."""
    texts = example["text_"]
    return tokenizer(texts, truncation=True, padding="max_length")


In [None]:
# Tokenize both splits in batches.
train_dataset, val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)


In [None]:
# Have both splits return torch tensors.
for dataset_split in (train_dataset, val_dataset):
    dataset_split.set_format("torch")


In [None]:
# BERTweet encoder with a 2-class classification head; hidden-state and attention dropout are both 0.3.
model = AutoModelForSequenceClassification.from_pretrained(
    "vinai/bertweet-base",
    num_labels=2,
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3,
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#Freeze the first 6 layers of RoBERTa (BERTweet)
for frozen_layer in model.roberta.encoder.layer[:6]:  # Adjust the slice to freeze more or fewer layers
    # Turns off gradients for every parameter of this encoder layer.
    frozen_layer.requires_grad_(False)


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """
    Turn a Trainer prediction object into accuracy / precision / recall / F1.

    Reads ``pred.label_ids`` as the ground truth and takes the argmax over the
    last axis of ``pred.predictions`` as the predicted class. Precision, recall
    and F1 use binary averaging.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)  # logits -> class index

    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )
    return dict(accuracy=accuracy, precision=precision, recall=recall, f1=f1)

In [None]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs go
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Evaluate and checkpoint once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Keep at most one checkpoint on disk (the Trainer may additionally retain
    # the best one when load_best_model_at_end is set)
    save_total_limit=1,
    # Batch sizes and schedule length
    per_device_train_batch_size=24,
    per_device_eval_batch_size=32,
    num_train_epochs=12,
    # Optimisation
    learning_rate=1e-5,
    lr_scheduler_type="linear",
    warmup_steps=500,
    weight_decay=0.03,
    max_grad_norm=1.0,
    # Model selection: lowest validation loss wins and is reloaded at the end
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    load_best_model_at_end=True,
)

# Stop once the monitored metric has failed to improve for 3 consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Wire the model, data, metrics and early stopping together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)


 trainer = Trainer(


In [None]:
# Fine-tune the classifier. Early stopping is registered on the Trainer, so training may
# end before the configured number of epochs.
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print("Validation Results:", results)


Validation Results: {'eval_loss': 0.280660480260849, 'eval_accuracy': 0.9106521739130434, 'eval_precision': 0.8946378174976481, 'eval_recall': 0.9104834849210148, 'eval_f1': 0.902491103202847, 'eval_runtime': 6.4811, 'eval_samples_per_second': 709.757, 'eval_steps_per_second': 22.218, 'epoch': 9.0}


In [None]:
import os
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Target directory for the exported model and tokenizer (created if missing).
output_dir = "./results"
os.makedirs(output_dir, exist_ok=True)

# Write the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")


Model and tokenizer saved to ./results


In [None]:
import pandas as pd
import torch
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import re
import matplotlib.pyplot as plt

# Function to detect past events
# Patterns are compiled once, case-insensitively. Each one is anchored on a word
# boundary so that e.g. "blast radius" is not read as "last radius".
_PAST_TIME_PATTERNS = tuple(
    re.compile(pattern, flags=re.IGNORECASE)
    for pattern in (
        r'\b\d+\s+years? ago\b',
        r'\b\d+\s+months? ago\b',
        r'\blast\s+\w+',  # e.g., last week, last year
        r'\bin\s+\d{4}\b',  # mentions a specific year
        r'\bformer\b',  # Indicates past events
    )
)


def is_past_event(text):
    """Return True when ``text`` contains a phrase suggesting a past event.

    Args:
        text: The tweet text to scan (must be a string).

    Returns:
        ``True`` if any past-time pattern matches anywhere in ``text``,
        otherwise ``False``.
    """
    return any(pattern.search(text) for pattern in _PAST_TIME_PATTERNS)

# Function to adjust predictions
def adjust_predictions(text, prediction):
    """Downgrade a "Positive" prediction to "Negative" when the text reads as a past event.

    Any other prediction is returned unchanged.
    """
    overrule = is_past_event(text) and prediction == "Positive"
    return "Negative" if overrule else prediction

# Load dataset
text_data = pd.read_csv('path/to/final_test.csv')

# Ensure the dataset has the necessary columns
assert 'tweet' in text_data.columns, "Column 'tweet' not found in dataset"
assert 'label' in text_data.columns, "Column 'label' not found in dataset"

# Give the test tweets the same cleaning the training column `text_` received
# (URL/mention removal, '#' and '.,!' stripping, lowercasing, preprocess_text).
# The classifier was fine-tuned on that cleaned text, so feeding it raw tweets
# would evaluate it on a different input distribution.
eval_texts = (
    text_data['tweet']
    .str.replace(r'http\S+|www\S+|@\S+', '', regex=True)
    .str.replace('#', '', regex=False)
    .str.replace("[.,!]", "", regex=True)
    .str.lower()
    .apply(preprocess_text)
    .tolist()
)

# Move model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Make inference mode explicit so dropout is off no matter which cell ran last.
model.eval()

# Tokenize inputs
inputs = tokenizer(
    eval_texts,
    padding=True,
    truncation=True,
    return_tensors="pt"
).to(device)

# Get predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

predictions = torch.argmax(logits, dim=1)

# Map class indices to labels
class_labels = ["Negative", "Positive"]  # Update as per your model's label mapping
predicted_labels = [class_labels[pred] for pred in predictions.cpu().numpy()]

# Apply the past-event rule to every (tweet, predicted label) pair.
adjusted_predictions = list(map(adjust_predictions, text_data['tweet'], predicted_labels))

# Ground truth expressed with the same string labels as the predictions.
true_labels = text_data['label'].map({0: "Negative", 1: "Positive"}).tolist()

# Score the adjusted predictions, treating "Positive" as the positive class.
binary_kwargs = dict(pos_label="Positive", average='binary')
precision = precision_score(true_labels, adjusted_predictions, **binary_kwargs)
recall = recall_score(true_labels, adjusted_predictions, **binary_kwargs)
accuracy = accuracy_score(true_labels, adjusted_predictions)
f1 = f1_score(true_labels, adjusted_predictions, **binary_kwargs)

# Display results
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

# Metric values and their display names, in matching order (nothing is plotted in this cell).
metrics = [precision, recall, accuracy, f1]
metric_names = ['Precision', 'Recall', 'Accuracy', 'F1 Score']

# Every tweet whose adjusted prediction disagrees with its true label.
misclassifications = [
    (tweet, true_label, pred_label)
    for tweet, true_label, pred_label in zip(text_data['tweet'], true_labels, adjusted_predictions)
    if true_label != pred_label
]

# Display misclassifications
if not misclassifications:
    print("\nNo misclassifications found.")
else:
    print("\nMisclassified Tweets:")
    for i, (tweet, true_label, pred_label) in enumerate(misclassifications):
        print(f"\n{i+1}. Tweet: {tweet}")
        print(f"True Label: {true_label}, Predicted Label: {pred_label}")

Precision: 0.9317
Recall: 0.9983
Accuracy: 0.9637
F1 Score: 0.9639

Misclassified Tweets:

True Label: Positive, Predicted Label: Negative

2. Tweet: Lost my phone in the parking lot today. Luckily, someone turned it in! #PhoneLost #Thankful
True Label: Negative, Predicted Label: Positive

3. Tweet: Dropped my favorite mug this morning. RIP to the best coffee mug ever. #AccidentProne #MugDisaster
True Label: Negative, Predicted Label: Positive

4. Tweet: Slight delay getting to work because of some traffic. Hopefully, this won’t be a habit! #TrafficTrouble #MorningCommute
True Label: Negative, Predicted Label: Positive

5. Tweet: Locked myself out of the house today. The locksmith was very kind though! #HouseTroubles #ForgotTheKeys
True Label: Negative, Predicted Label: Positive

6. Tweet: Lost my favorite jacket in the restaurant. I hope someone finds it! #LostAndFound #JacketWoes
True Label: Negative, Predicted Label: Positive

7. Tweet: Someone cut me off in traffic this morning. I 