In [4]:
# Upload the dataset into the Colab runtime. The recorded cell output shows the
# file being saved as real_estate_utah.csv, which is the name the loading cell reads.
from google.colab import files
# NOTE(review): `uploaded` presumably maps each uploaded filename to its contents — confirm against the Colab docs.
uploaded = files.upload()


Saving real_estate_utah.csv to real_estate_utah.csv


In [5]:
!pip install transformers pandas scikit-learn -q  # Install required libraries (if not already installed)
# If using Kaggle API (assuming kaggle.json is uploaded and Kaggle CLI is configured):
# !kaggle datasets download -d kanchana1990/real-estate-data-utah-2024 -p /content
# !unzip -q /content/real-estate-data-utah-2024.zip -d /content/

import pandas as pd

df = pd.read_csv('real_estate_utah.csv')  # Use the exact name of the uploaded file
print("Total entries:", len(df))
df.head()



Total entries: 4440


Unnamed: 0,type,text,year_built,beds,baths,baths_full,baths_half,garage,lot_sqft,sqft,stories,lastSoldOn,listPrice,status
0,single_family,"Escape to tranquility with this off-grid, unfi...",2020.0,1.0,1.0,1.0,1.0,2.0,71438.0,696.0,2.0,2018-05-31,90000.0,for_sale
1,single_family,Beautiful home in the desirable Oak Hills and ...,1968.0,4.0,3.0,2.0,1.0,2.0,56628.0,3700.0,2.0,2018-05-31,799000.0,for_sale
2,single_family,"Welcome to your new home, nestled in the heart...",1985.0,4.0,3.0,3.0,1.0,1.0,10019.0,3528.0,2.0,2018-05-31,389900.0,for_sale
3,single_family,Investment Opportunity. House needs some work ...,1936.0,4.0,2.0,2.0,1.0,2.0,12632.0,2097.0,2.0,2018-04-16,300000.0,for_sale
4,land,Deer Springs Ranch is an 8000 Ac Ranch in an H...,2003.0,4.0,0.0,2.0,1.0,2.0,872071.0,2400.0,2.0,2018-05-31,70000.0,for_sale


In [6]:
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Clean the text
def clean_text(text):
    """Normalise a listing description for an uncased BERT tokenizer.

    Args:
        text: the raw description. Non-string values are converted with
            ``str``; ``None`` and float NaN are treated as "no description".

    Returns:
        The lowercased text with HTML tags removed and every run of
        whitespace collapsed to a single space, or ``''`` when the
        description is missing.
    """
    # A missing cell reaches us as None or float NaN. Passing it through str()
    # would feed the literal words "none"/"nan" to the model as if they were
    # part of the listing, so map it to an empty description instead.
    # (`text != text` is only true for NaN.)
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text)
    text = re.sub(r'<[^>]+>', ' ', text)         # remove HTML tags
    text = re.sub(r'\s+', ' ', text).strip()     # normalize whitespace
    return text.lower()                          # lowercase for BERT-uncased

# Add a cleaned copy of every listing description.
df['text_clean'] = df['text'].map(clean_text)

# Turn the property-type strings into integer class ids.
le = LabelEncoder().fit(df['type'])
df['label'] = le.transform(df['type'])

# Draw 1400 shuffled rows, then carve them into 1000 train / 200 val / 200 test.
# NOTE: `df` is rebound to the sample here, so re-running this cell works on
# the 1400 sampled rows rather than on the full dataset.
df = df.sample(n=1400, random_state=42).reset_index(drop=True)

train_df, val_df, test_df = df.iloc[:1000], df.iloc[1000:1200], df.iloc[1200:]

print("Classes:", le.classes_)
print("Train/Val/Test sizes:", *(len(split) for split in (train_df, val_df, test_df)))


Classes: ['condo' 'condo_townhome' 'condo_townhome_rowhome_coop' 'condos' 'farm'
 'land' 'mobile' 'other' 'single_family' 'townhomes' 'townhouse']
Train/Val/Test sizes: 1000 200 200


In [8]:
from transformers import BertTokenizer
import torch
from torch.utils.data import TensorDataset, DataLoader

# Shared tokenizer for the uncased BERT base checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize function
def tokenize_data(text_list, max_len=128):
    """Encode a collection of texts with the module-level `tokenizer`.

    Args:
        text_list: iterable of strings; it is materialised into a list first.
        max_len: length every sequence is padded or truncated to.

    Returns:
        Whatever the tokenizer call returns when asked for PyTorch tensors
        ('pt') with 'max_length' padding and truncation enabled.
    """
    encode_kwargs = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_len,
        'return_tensors': 'pt',
    }
    texts = list(text_list)
    return tokenizer(texts, **encode_kwargs)

# Encode the cleaned text of each split (train, val, test — in that order).
train_encodings, val_encodings, test_encodings = [
    tokenize_data(split['text_clean']) for split in (train_df, val_df, test_df)
]

# Label column of each split as a tensor.
train_labels, val_labels, test_labels = [
    torch.tensor(split['label'].values) for split in (train_df, val_df, test_df)
]

# Bundle (input_ids, attention_mask, labels) per split.
train_dataset, val_dataset, test_dataset = [
    TensorDataset(enc['input_ids'], enc['attention_mask'], lab)
    for enc, lab in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
]

# Dataloaders: only the training loader shuffles its batches.
batch_size = 16
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader, test_loader = [
    DataLoader(dataset, batch_size=batch_size) for dataset in (val_dataset, test_dataset)
]

print("Data tokenized and ready for training ✅")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Data tokenized and ready for training ✅


In [11]:
from transformers import BertForSequenceClassification
from torch.optim import AdamW  # ✅ Correct import
from sklearn.metrics import accuracy_score, f1_score
import torch

# ✅ Pick the GPU when one is visible to PyTorch, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("🖥️ Using device:", device)

# ✅ One output unit per class known to the label encoder
num_labels = len(le.classes_)

# ✅ Training function
def train_model(model, optimizer, train_loader, val_loader, epochs=3):
    """Fine-tune `model` and print validation metrics after every epoch.

    The model is moved to the module-level `device` before training.

    Args:
        model: model whose forward pass accepts `input_ids`, `attention_mask`
            and (for training) `labels`, and returns an object exposing
            `.loss` and `.logits`.
        optimizer: optimizer already bound to `model`'s parameters.
        train_loader: iterable of (input_ids, attention_mask, labels) batches.
        val_loader: iterable of batches in the same layout, used for
            evaluation only.
        epochs: number of passes over `train_loader`.

    Returns:
        Tuple ``(model, acc, f1)``: the trained model plus the accuracy and
        macro-averaged F1 measured on `val_loader` after the LAST epoch
        (not the best epoch). Both metrics are 0.0 when `epochs` is 0.
    """
    model.to(device)

    # Defined up front so that `epochs=0` returns cleanly instead of raising
    # UnboundLocalError at the return statement.
    acc, f1 = 0.0, 0.0

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0

        for batch in train_loader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]

            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # max(..., 1) keeps the average defined when the loader yields no batches.
        avg_train_loss = total_loss / max(len(train_loader), 1)
        print(f"\n📘 Epoch {epoch+1} | Avg Train Loss: {avg_train_loss:.4f}")

        # ✅ Validation: no gradients, no label argument (only logits are needed)
        model.eval()
        val_preds, val_true = [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, labels = [x.to(device) for x in batch]
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs.logits, dim=1)
                val_preds.extend(preds.cpu().numpy())
                val_true.extend(labels.cpu().numpy())

        acc = accuracy_score(val_true, val_preds)
        f1 = f1_score(val_true, val_preds, average='macro')
        print(f"📊 Val Accuracy: {acc:.4f} | Val F1 Score: {f1:.4f}")

    return model, acc, f1

# ✅ Learning Rate Tuning
# Fine-tune a fresh model per learning rate and keep the one with the highest
# final-epoch validation macro F1.
learning_rates = [1e-5, 3e-5, 1e-4]
best_f1 = 0
best_model = None
best_lr = None

for lr in learning_rates:
    print(f"\n🔁 Fine-tuning with Learning Rate = {lr}")

    # Re-seed before every run so each candidate starts from the same random
    # state (classifier-head initialisation, dropout, batch shuffling).
    # Without this the sweep mixes learning-rate effects with seed noise and
    # the notebook is not reproducible on a re-run.
    torch.manual_seed(42)

    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=num_labels
    ).to(device)

    optimizer = AdamW(model.parameters(), lr=lr)

    trained_model, val_acc, val_f1 = train_model(model, optimizer, train_loader, val_loader)

    # `best_model is None` guarantees a model is always selected, even if every
    # run ends with an F1 of 0; later cells call methods on `best_model`.
    if best_model is None or val_f1 > best_f1:
        best_f1 = val_f1
        best_model = trained_model
        best_lr = lr

print(f"\n✅ Best Learning Rate: {best_lr} with Val F1 Score: {best_f1:.4f}")


🖥️ Using device: cuda

🔁 Fine-tuning with Learning Rate = 1e-05


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



📘 Epoch 1 | Avg Train Loss: 1.4968
📊 Val Accuracy: 0.6850 | Val F1 Score: 0.1162

📘 Epoch 2 | Avg Train Loss: 0.9194
📊 Val Accuracy: 0.8150 | Val F1 Score: 0.2634

📘 Epoch 3 | Avg Train Loss: 0.6655
📊 Val Accuracy: 0.8400 | Val F1 Score: 0.3422

🔁 Fine-tuning with Learning Rate = 3e-05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



📘 Epoch 1 | Avg Train Loss: 1.1875
📊 Val Accuracy: 0.8150 | Val F1 Score: 0.2634

📘 Epoch 2 | Avg Train Loss: 0.6377
📊 Val Accuracy: 0.8400 | Val F1 Score: 0.3551

📘 Epoch 3 | Avg Train Loss: 0.4388
📊 Val Accuracy: 0.8700 | Val F1 Score: 0.4615

🔁 Fine-tuning with Learning Rate = 0.0001


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



📘 Epoch 1 | Avg Train Loss: 1.0769
📊 Val Accuracy: 0.8000 | Val F1 Score: 0.2532

📘 Epoch 2 | Avg Train Loss: 0.6087
📊 Val Accuracy: 0.8500 | Val F1 Score: 0.3617

📘 Epoch 3 | Avg Train Loss: 0.5475
📊 Val Accuracy: 0.8450 | Val F1 Score: 0.3583

✅ Best Learning Rate: 3e-05 with Val F1 Score: 0.4615


In [12]:
# 📊 Score the selected model on the held-out test split
best_model.eval()
test_preds = []
test_true = []

with torch.no_grad():
    for ids_batch, mask_batch, label_batch in test_loader:
        logits = best_model(
            input_ids=ids_batch.to(device),
            attention_mask=mask_batch.to(device),
        ).logits
        # argmax over the class dimension gives the predicted class id per row
        test_preds.extend(logits.argmax(dim=1).cpu().numpy())
        test_true.extend(label_batch.cpu().numpy())

# Accuracy and macro-averaged F1 over the collected predictions
test_acc = accuracy_score(test_true, test_preds)
test_f1 = f1_score(test_true, test_preds, average='macro')
print(f"🧪 Test Accuracy: {test_acc:.4f}")
print(f"🧪 Test F1 Score: {test_f1:.4f}")


🧪 Test Accuracy: 0.8600
🧪 Test F1 Score: 0.3781


In [13]:
# Class id -> class name lookup, in the label encoder's order
id2label = dict(enumerate(le.classes_))

# Fresh 0..n-1 index, plus readable predicted / true labels as new columns
test_df = test_df.reset_index(drop=True).assign(
    predicted_label=[id2label[idx] for idx in test_preds],
    true_label=[id2label[idx] for idx in test_true],
)

# Keep only the rows where the prediction disagrees with the ground truth
is_wrong = test_df['predicted_label'].ne(test_df['true_label'])
misclassified = test_df.loc[is_wrong]
print(f"❌ Total Misclassified Samples: {len(misclassified)} / {len(test_df)}")

# Show the first five mistakes
misclassified[['text_clean', 'true_label', 'predicted_label']].head()


❌ Total Misclassified Samples: 28 / 200


Unnamed: 0,text_clean,true_label,predicted_label
4,reduced lot rent for the 1st year or 4 months ...,mobile,single_family
11,"starter home, very spacious , great community.",mobile,single_family
28,feel at home in the grace luxury townhome show...,condos,townhomes
30,back on market! move in ready. this home is de...,mobile,single_family
48,enjoy 6 months free rent (if you get approved ...,mobile,single_family


In [14]:
def predict_property_type(text, max_len=128):
    """Predict the property type of a single listing description.

    Relies on module-level state defined in other cells: `clean_text`,
    `tokenizer`, `best_model`, `device` and `id2label`.

    Args:
        text: raw listing description.
        max_len: length the tokenized input is padded or truncated to;
            the default matches the value used when tokenizing the
            training data.

    Returns:
        The class name that `id2label` maps the highest-scoring logit to.
    """
    # Apply the same preprocessing the training texts went through (HTML
    # stripping, whitespace normalisation, lowercasing) so inference inputs
    # match what the model was fine-tuned on.
    cleaned = clean_text(text)

    # Reuse the tokenizer loaded once at module level instead of re-loading it
    # from the hub on every call.
    inputs = tokenizer(
        cleaned,
        return_tensors='pt',
        truncation=True,
        padding='max_length',
        max_length=max_len,
    ).to(device)

    best_model.eval()
    with torch.no_grad():
        logits = best_model(**inputs).logits
    pred_id = torch.argmax(logits, dim=1).item()
    return id2label[pred_id]

# Smoke-test the helper on a made-up listing description
sample = "Newly renovated 2-bedroom apartment in a secure community with pool and gym access."
predicted_type = predict_property_type(sample)
print("🏠 Predicted Property Type:", predicted_type)


🏠 Predicted Property Type: townhomes


In [15]:
# Save model and tokenizer
# Store the class names in the model config before saving. Otherwise the saved
# model only carries generic LABEL_<n> names and its predictions cannot be
# mapped back to property types without re-fitting the label encoder.
best_model.config.id2label = {int(i): str(name) for i, name in enumerate(le.classes_)}
best_model.config.label2id = {name: i for i, name in best_model.config.id2label.items()}

best_model.save_pretrained("bert-real-estate-model")
tokenizer.save_pretrained("bert-real-estate-model")


('bert-real-estate-model/tokenizer_config.json',
 'bert-real-estate-model/special_tokens_map.json',
 'bert-real-estate-model/vocab.txt',
 'bert-real-estate-model/added_tokens.json')