In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory, then print each one
input_file_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

# First corpus: the competition training file, loaded as-is
data = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")

# Second corpus: keep four columns, rename the label column to `target`,
# and encode it as 1 where the original value is "Relevant" and 0 otherwise
data2 = (
    pd.read_csv(
        '/kaggle/input/disasters-on-social-media/socialmedia-disaster-tweets-DFE.csv'
    )[["keyword", "location", "text", "choose_one"]]
    .rename(columns={"choose_one": "target"})
    .assign(target=lambda frame: (frame["target"] == "Relevant").astype("int"))
)

In [None]:
# Show the column index of each loaded frame, competition data first
for loaded_frame in (data, data2):
    print(loaded_frame.columns)

# Concatenating the two datasets

In [None]:
import pandas as pd

# `data` and `data2` must already be loaded as pandas DataFrames by the cells above.

# Keep only the two columns both sources share and stack them under a fresh 0..n-1 index.
# NOTE(review): this rebinds `data`, so running the cell a second time appends `data2` again.
# NOTE(review): the two sources may contain the same tweets; if so, duplicates can end up
# on both sides of the later train/test split -- confirm and consider de-duplicating on 'text'.
data = pd.concat([data[['text', 'target']], data2[['text', 'target']]], ignore_index=True)

# Display only the first few rows rather than the whole frame
data.head()


In [None]:
# Number of missing values in each column of the combined frame
missing_per_column = data.isna().sum()
print(missing_per_column)

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from sklearn.model_selection import train_test_split
import re
# Download NLTK data
nltk.download('punkt')
# Recent NLTK releases load the tokenizer tables used by word_tokenize from 'punkt_tab'
# instead of 'punkt'; fetch both so the notebook runs on either version.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Initialize stemmer and stopwords
# NOTE(review): neither object is referenced again in the visible cells
# (preprocess_text does not stem or remove stop words) -- confirm whether they are still needed.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Normalize one piece of raw text for the downstream vectorizers.

    Steps, in order: lowercase, strip any run of non-space characters that
    starts with 'http' or 'www', delete every character listed in
    ``string.punctuation``, tokenize with NLTK's ``word_tokenize``, and glue
    the tokens back together with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned text as a single space-separated string.
    """
    lowered = text.lower()

    # URLs go first, while their punctuation is still there to delimit them
    without_urls = re.sub(r'http\S+|www\S+', '', lowered)

    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = without_urls.translate(punctuation_table)

    return ' '.join(word_tokenize(without_punctuation))

# Store the cleaned version of every text next to the raw column
data['processed_text'] = data['text'].apply(preprocess_text)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    data['processed_text'],
    data['target'],
    **split_options,
)

# Peek at the first cleaned training texts
X_train.head()

In [None]:
import pandas as pd  

# Absolute number of training rows per class label
class_counts = y_train.value_counts()

# Same breakdown expressed as a percentage of the training set
class_percentages = y_train.value_counts(normalize=True) * 100

print(class_counts)
print(class_percentages)


In [None]:
# Show just the first training example
X_train.head(1)

In [None]:
# Length of the one-element slice of the training texts.
# NOTE(review): the [:1] slice caps this value at 1; if the size of the training
# set was intended, len(X_train) is the expression to use -- confirm intent.
len(X_train[:1])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: keep at most 5000 features and drop English stop words
tfidf_options = {"max_features": 5000, "stop_words": "english"}
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit on the training texts only, then reuse the fitted vectorizer for the test texts
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a logistic regression (library defaults) on the TF-IDF features
model_tfidf = LogisticRegression()
model_tfidf.fit(X_train_tfidf, y_train)

# Score the held-out split
y_pred = model_tfidf.predict(X_test_tfidf)
tfidf_accuracy = accuracy_score(y_test, y_pred)
tfidf_report = classification_report(y_test, y_pred)

print("Accuracy:", tfidf_accuracy)
print("Classification Report:\n", tfidf_report)

In [None]:
import pandas as pd

# 0-based positions within the test split where the TF-IDF model's prediction differs from the label
misclassified_indices = [
    position
    for position, true_label in enumerate(y_test)
    if true_label != y_pred[position]
]

# Gather text, true label and prediction for those positions
wrong_texts = X_test.iloc[misclassified_indices]
wrong_true_labels = y_test.iloc[misclassified_indices]
wrong_predictions = y_pred[misclassified_indices]
misclassified_data = pd.DataFrame({
    "Text": wrong_texts,
    "True Label": wrong_true_labels,
    "Predicted Label": wrong_predictions,
})

# Show the first 10 misclassified examples
print("\nMisclassified Examples:")
print(misclassified_data.head(10))

# Inspect the learned weights: one coefficient per TF-IDF feature
feature_names = vectorizer.get_feature_names_out()
coef = model_tfidf.coef_[0]

# Sort feature positions once, ascending by coefficient
ascending_order = coef.argsort()
top_positive_words = [feature_names[i] for i in ascending_order[-10:]]  # 10 largest coefficients
top_negative_words = [feature_names[i] for i in ascending_order[:10]]   # 10 smallest coefficients

print("\nTop Words Indicating Positive Class:", top_positive_words)
print("Top Words Indicating Negative Class:", top_negative_words)


# Reasons for misclassification
1. Loss of Context & Word Order
2. Failure to Capture Semantic Meaning
3. Presence of Ambiguous or Common Words

In [None]:
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize

import os

# Download NLTK data (if not already downloaded)
nltk.download('punkt')

# Tokenize the text
X_train_tokens = [word_tokenize(text) for text in X_train]
X_test_tokens = [word_tokenize(text) for text in X_test]

from gensim.models import KeyedVectors

# Pre-trained Word2Vec embeddings (Google News) and the native-format copy written by this cell
W2V_BINARY_PATH = '/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'
W2V_CACHE_PATH = "word2vec.model"

# Parsing the word2vec binary is the slow part, so do it only when the native-format
# copy does not exist yet; later runs in the same working directory reuse the copy.
if not os.path.exists(W2V_CACHE_PATH):
    KeyedVectors.load_word2vec_format(W2V_BINARY_PATH, binary=True).save(W2V_CACHE_PATH)

# Load the native-format copy memory-mapped read-only
word2vec_model = KeyedVectors.load(W2V_CACHE_PATH, mmap='r')

In [None]:
import numpy as np

def sentence_vector(tokens, model, vector_size=300):
    """Average the embeddings of the tokens that ``model`` knows.

    Args:
        tokens: Iterable of token strings for one sentence.
        model: Any object supporting ``token in model`` and ``model[token]``
            (e.g. gensim KeyedVectors), where the lookup yields a vector of
            length ``vector_size``.
        vector_size: Dimension of the embedding vectors; must match the model.

    Returns:
        A NumPy array of length ``vector_size``: the mean of the found vectors,
        or all zeros when no token is in the model's vocabulary.
    """
    known_vectors = [model[token] for token in tokens if token in model]

    # Accumulate in a zero vector of NumPy's default float dtype
    running_sum = np.zeros(vector_size)
    for word_vec in known_vectors:
        running_sum += word_vec

    if not known_vectors:
        return running_sum  # nothing matched: stay at zeros
    return running_sum / len(known_vectors)

# Turn each tokenized sentence into one averaged embedding, for the train and test splits in turn
X_train_vectors, X_test_vectors = (
    np.array([sentence_vector(sentence_tokens, word2vec_model) for sentence_tokens in split_tokens])
    for split_tokens in (X_train_tokens, X_test_tokens)
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a logistic regression (library defaults) on the averaged Word2Vec sentence vectors
model_w2v = LogisticRegression()
model_w2v.fit(X_train_vectors, y_train)

# Score the held-out split
y_pred_w2v = model_w2v.predict(X_test_vectors)
w2v_accuracy = accuracy_score(y_test, y_pred_w2v)
w2v_report = classification_report(y_test, y_pred_w2v)

print("Accuracy:", w2v_accuracy)
print("Classification Report:\n", w2v_report)

In [None]:
# Reset index so label-based and positional indexing agree on the test split
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Plain arrays make the element-wise comparisons below straightforward
y_pred = np.array(y_pred)
y_pred_w2v = np.array(y_pred_w2v)
y_true = y_test.to_numpy()

# Positions where the TF-IDF model was wrong but the Word2Vec model was right.
# This list was previously used without ever being defined, which raised a
# NameError on a fresh kernel.
wrong_tfidf_correct_w2v = np.flatnonzero((y_pred != y_true) & (y_pred_w2v == y_true))

# Print at most three such samples with their true labels and both predictions.
# The positions come from the test split itself, so every one of them is valid.
for idx in wrong_tfidf_correct_w2v[:3]:
    print(f"Sample Sentence: {X_test.iloc[idx]}")
    print(f"True Label: {y_test.iloc[idx]}, TF-IDF Prediction: {y_pred[idx]}, W2V Prediction: {y_pred_w2v[idx]}\n")

In [None]:
import pandas as pd
import numpy as np

# One row per test example: the text, its true label, and the TF-IDF model's prediction
tfidf_outcomes = pd.DataFrame({
    'Text': X_test,
    'True Label': y_test,
    'Predicted Label': y_pred,
})

# Keep only the rows where prediction and label disagree
is_wrong = tfidf_outcomes['True Label'].ne(tfidf_outcomes['Predicted Label'])
misclassified_df = tfidf_outcomes[is_wrong]

print("Misclassified Examples:")
print(misclassified_df)


# Reasons for misclassification
1. Failure to Capture Sentence Context 
2. Vocabulary Coverage & OOV (Out-of-Vocabulary) Words
3. Does Not Account for Sentence Structure

In [None]:
!!pip install wandb


In [None]:
import os

import wandb

# SECURITY: never commit an API key to a notebook. The key is read from the
# WANDB_API_KEY environment variable; when it is unset, `key=None` lets
# wandb fall back to its own credential lookup / interactive prompt.
# The key that used to be hard-coded in this cell is exposed and should be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))


In [None]:
!pip install transformers datasets scikit-learn


In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Tokenizer and encoder both come from the same pre-trained checkpoint
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Use the GPU when torch can see one, otherwise stay on the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
bert_model.to(device)

# Function to extract BERT embeddings using batch processing
def get_bert_embeddings(text_list, batch_size=16):
    """Encode texts with the module-level BERT model, one vector per text.

    Relies on the notebook globals ``tokenizer``, ``bert_model`` and ``device``.
    Each batch is tokenized (padded, truncated to 512 tokens), run through the
    model without gradient tracking, and the hidden state at the first token
    position (the [CLS] token) is kept as the text's embedding.

    Args:
        text_list: Iterable of texts; every element is converted with ``str``.
        batch_size: Number of texts sent through the model at once.

    Returns:
        A 2-D NumPy array with one row per input text. For empty input an
        empty array with zero rows and ``bert_model.config.hidden_size``
        columns is returned (previously ``np.vstack`` raised on an empty list).
    """
    # Ensure all inputs are strings
    texts = [str(text) for text in text_list]

    # np.vstack cannot stack an empty list, so handle "no texts" explicitly
    if not texts:
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Extracting BERT embeddings"):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)

        # Inference only: skip gradient bookkeeping
        with torch.no_grad():
            outputs = bert_model(**inputs)

        # Position 0 of every sequence is the CLS token; move it to CPU before converting
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)

    return np.vstack(embeddings)  # Stack all batches into a single array

# The embedding helper works on plain lists of strings
X_train = list(map(str, X_train))
X_test = list(map(str, X_test))

# One BERT embedding per text, for each split
X_train_embedded = get_bert_embeddings(X_train, batch_size=16)
X_test_embedded = get_bert_embeddings(X_test, batch_size=16)

# Logistic regression on top of the frozen embeddings (iteration cap raised to 500)
logreg = LogisticRegression(max_iter=500)
logreg.fit(X_train_embedded, y_train)

# Predict the held-out split
y_pred_bemb = logreg.predict(X_test_embedded)

# Positional NumPy labels avoid KeyError from label-based Series indexing later on
y_test = np.array(y_test)

# Report accuracy to four decimals
accuracy = accuracy_score(y_test, y_pred_bemb)
print(f"Test Accuracy: {accuracy:.4f}")


In [None]:
# Predict with the BERT-embedding classifier; keep predictions and labels as NumPy
# arrays so they can be indexed by position (a Series could raise KeyError here)
y_pred_bemb = np.array(logreg.predict(X_test_embedded))
y_pred_w2v = np.array(y_pred_w2v)
y_test = np.array(y_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_bemb)
print(f"Test Accuracy: {accuracy:.4f}")

# Positions where the Word2Vec-based model was wrong but the BERT-embedding model was right
wrong_w2v_correct_bemb = [
    idx for idx in range(len(y_test))
    if y_pred_w2v[idx] != y_test[idx] and y_pred_bemb[idx] == y_test[idx]
]

# Both classifiers are logistic regressions; what differs is the features, so the
# header names the BERT embeddings rather than "Logistic Regression".
print("Samples misclassified by Word2Vec but classified correctly by BERT embeddings:\n")
for idx in wrong_w2v_correct_bemb[:5]:  # Limit output to 5 samples for readability
    print(f"Sample Sentence: {X_test[idx]}")  # Use direct indexing
    print(f"True Label: {y_test[idx]}, W2V Prediction: {y_pred_w2v[idx]}, BERT/Embeddings Prediction: {y_pred_bemb[idx]}\n")



In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd

# Accuracy metric handed to the Trainer
def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into an accuracy score.

    Args:
        eval_pred: Two-element iterable unpacked as ``(logits, labels)``.

    Returns:
        A dict with the single key ``"accuracy"``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis
    predicted_classes = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predicted_classes)}

# Tokenizer matching the checkpoint that is fine-tuned below
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# The tokenizer is fed plain lists of strings
X_train = list(map(str, X_train))
X_test = list(map(str, X_test))

# Same tokenization settings for both splits: truncate to 512 tokens and pad
tokenizer_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(X_train, **tokenizer_options)
test_encodings = tokenizer(X_test, **tokenizer_options)

# Labels as plain Python lists so the dataset can index them by position
y_train, y_test = list(y_train), list(y_test)

# Torch dataset pairing tokenizer output with labels
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenized texts and their class labels.

    ``encodings`` is a mapping from field name to a per-example sequence
    (anything with ``.items()`` whose values can be indexed by position);
    ``labels`` is an indexable sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # One example per label
        return len(self.labels)

    def __getitem__(self, idx):
        # Tensor for every encoding field of example `idx` ...
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        # ... plus its label as a LongTensor
        example['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return example

import inspect

# Create datasets
train_dataset = Dataset(train_encodings, y_train)
test_dataset = Dataset(test_encodings, y_test)

# Pre-trained BERT with a fresh two-class classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers releases.
# Use whichever keyword the installed TrainingArguments accepts so the cell runs on both.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_kwarg = "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # Use accuracy to pick best model
    greater_is_better=True,  # Higher accuracy is better
    **{_eval_strategy_kwarg: "epoch"},  # Evaluate once per epoch (must match save_strategy)
)

# Trainer
# NOTE(review): the test split doubles as the evaluation set, and the best checkpoint is
# chosen by accuracy on it, so the reported test accuracy is optimistic -- consider
# carving a separate validation split out of the training data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,  # Add accuracy metric
)

# Train, then evaluate; print the metrics so they are not silently discarded mid-cell
trainer.train()
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Save the trained model and tokenizer
model.save_pretrained("./bert_text_classifier")
tokenizer.save_pretrained("./bert_text_classifier")



In [None]:
# Score the test set with the fine-tuned model; the predicted class is the arg-max over the last axis
test_output = trainer.predict(test_dataset)
predictions = test_output.predictions
predicted_labels = np.argmax(predictions, axis=-1)

# Positions where the predicted class and the true label disagree
true_labels = np.array(y_test)
misclassified_indices = np.flatnonzero(predicted_labels != true_labels)

# Print every misclassified sample, separated by a rule
separator = "-" * 50
print("\nMisclassified Samples:")
for idx in misclassified_indices:
    print(separator)
    print(f"Sample {idx+1}:")
    print(f"  Text: {X_test[idx]}")
    print(f"  True Label: {y_test[idx]}")
    print(f"  Predicted Label: {predicted_labels[idx]}")
print(separator)
