In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, TFBertForSequenceClassification, create_optimizer
import tensorflow as tf
from tqdm import tqdm

# Download the NLTK resources used by the preprocessing step.
nltk.download('stopwords')
nltk.download('punkt')
# NOTE(review): recent NLTK releases load the 'punkt_tab' resource for
# word_tokenize instead of 'punkt'. Requesting it here is expected to be
# harmless on older releases (the download is simply reported as unavailable)
# -- confirm against the installed NLTK version.
nltk.download('punkt_tab')

# Load dataset
df = pd.read_csv('/mnt/data/train.csv')

# Display basic information about the dataset
print("Dataset Preview:")
print(df.head())

# Target variable (e.g., 'category' column or any column to classify)
targets = df['category']  # Change this to the appropriate column for labels if needed

# Combine the descriptive columns into a single text feature.
#
# The label column ('category') is deliberately NOT part of the input text:
# concatenating the target into the features leaks the answer to the model, so
# validation scores would be meaningless and the model would be useless on
# real complaints, which arrive without a category.
# NOTE(review): 'sub category' may itself largely determine 'category'; drop it
# as well if it is not available at prediction time.
#
# Missing cells are replaced by empty strings first; a plain astype(str) would
# turn them into the literal token "nan".
texts = (
    df['sub category'].fillna('').astype(str)
    + ' '
    + df['crimeadditional info'].fillna('').astype(str)
)

# Preprocessing function
# Built once, when this definition is executed, instead of once per token:
# the previous version called stopwords.words('english') inside the filtering
# comprehension (one call and one linear list scan for every word) and created
# a new PorterStemmer on every invocation.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def preprocess_text(text):
    """Normalise a raw text string for the classifier.

    Steps, in order: lowercase, drop every character that is not an ASCII
    letter or whitespace, tokenize with ``nltk.word_tokenize``, remove English
    stopwords, and Porter-stem the remaining tokens.

    Args:
        text: The input string.

    Returns:
        The processed tokens joined by single spaces (an empty string when no
        token survives).
    """
    # Lowercase
    text = text.lower()

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize
    words = nltk.word_tokenize(text)

    # Remove stopwords, then stem what is left
    return ' '.join(
        _STEMMER.stem(word) for word in words if word not in _ENGLISH_STOPWORDS
    )

# Clean every document with the preprocessing routine defined above
texts = texts.apply(preprocess_text)

# Map the labels to integer class ids; the fitted encoder is kept so that
# predictions can be mapped back to the original label values
label_encoder = LabelEncoder()
targets = label_encoder.fit_transform(targets)

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    texts,
    targets,
    test_size=0.2,
    random_state=42,
)

# Load the multilingual BERT tokenizer
model_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Encode the training split first, then the validation split, with identical
# tokenizer settings (sequences truncated at 128 tokens, padding enabled)
train_encodings, val_encodings = (
    tokenizer(split.tolist(), truncation=True, padding=True, max_length=128)
    for split in (X_train, X_val)
)

# Convert to TensorFlow dataset
def create_tf_dataset(encodings, labels):
    """Wrap tokenizer output and labels in a ``tf.data.Dataset``.

    Args:
        encodings: Mapping whose ``items()`` yield ``(name, values)`` pairs;
            each ``values`` entry is converted to a tensor.
        labels: Label values, converted to a tensor.

    Returns:
        A dataset sliced along the first axis, yielding
        ``(feature_dict, label)`` pairs.
    """
    feature_tensors = {}
    for name, values in encodings.items():
        feature_tensors[name] = tf.convert_to_tensor(values)

    label_tensor = tf.convert_to_tensor(labels)
    return tf.data.Dataset.from_tensor_slices((feature_tensors, label_tensor))

# Batched input pipelines; only the training set is shuffled
train_dataset = create_tf_dataset(train_encodings, y_train).shuffle(1000).batch(16)
val_dataset = create_tf_dataset(val_encodings, y_val).batch(16)

# Load the mBERT model with one output unit per encoded class
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

# Compile the model with optimizer and loss.
#
# The loss is given explicitly instead of `model.compute_loss`: that attribute
# is a deprecated Transformers hook, and on recent TensorFlow/Keras versions
# the name resolves to a method that does not take (y_true, y_pred), so using
# it as a Keras loss fails. The labels are integer class ids and the model
# returns raw logits, hence sparse categorical cross-entropy with
# from_logits=True.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    optimizer=optimizer,
    loss=loss_fn,
    # Explicit metric object (still reported as 'accuracy') so Keras does not
    # have to guess which accuracy variant matches integer labels vs. logits.
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
)

# Fine-tune one epoch at a time so that a progress bar can be shown and a
# checkpoint written after every epoch
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    with tqdm(total=len(train_dataset), desc=f"Training Epoch {epoch + 1}") as pbar:
        # Keras' own output is silenced (verbose=0); this callback advances
        # the tqdm bar by one step after every batch instead
        advance_bar = tf.keras.callbacks.LambdaCallback(
            on_batch_end=lambda batch, logs: pbar.update(1)
        )
        history = model.fit(
            train_dataset,
            validation_data=val_dataset,
            epochs=1,
            verbose=0,
            callbacks=[advance_bar],
        )

    # Each fit() call runs a single epoch, so index 0 holds this epoch's values
    epoch_metrics = history.history
    train_loss, val_loss = epoch_metrics['loss'][0], epoch_metrics['val_loss'][0]
    print(f"Epoch {epoch + 1} - Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

    # Model and tokenizer are written to the same per-epoch directory
    checkpoint_dir = f'trained_mbert_model_epoch_{epoch + 1}'
    model.save_pretrained(checkpoint_dir)
    tokenizer.save_pretrained(checkpoint_dir)
    print(f"Model saved after epoch {epoch + 1}")

print("Training complete. Model and tokenizer saved.")



ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Predicting on new data
def predict(text):
    """Classify a single raw text string.

    The text goes through ``preprocess_text``, is encoded with the
    module-level ``tokenizer``, and is passed to the module-level ``model``.
    The first element of the model output is taken to be the logits; the
    arg-max over the last axis is mapped back to a label with
    ``label_encoder``.

    Args:
        text: Raw input string.

    Returns:
        The decoded label for the input.
    """
    cleaned = preprocess_text(text)
    model_inputs = tokenizer(
        cleaned,
        return_tensors='tf',
        truncation=True,
        padding=True,
        max_length=128,
    )
    outputs = model(model_inputs)
    class_ids = np.argmax(outputs[0], axis=-1)
    decoded = label_encoder.inverse_transform(class_ids)
    return decoded[0]

# Example usage: classify one sample complaint and show the decoded label
example_text = "This is an example complaint to classify."
predicted_class = predict(example_text)
print("Predicted class:", predicted_class)
