In [None]:
import torch

In [None]:
import torch.nn as nn
from transformers import BertTokenizer, BertModel, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the input CSV inside the mounted Drive folder.
dataset_loc = "/content/drive/MyDrive/kmeans_binary_labeled_helpfulness_data_final.csv"

In [None]:
# Load the CSV at `dataset_loc` into a DataFrame.
df = pd.read_csv(dataset_loc)

In [None]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,apartmentName,url,rating,review,zip,propertyManager,minRent,maxRent,minSqft,maxSqft,avgRating,helpfulness,days_since_review,state,city,calculated_score,kmeans_label
0,Union 505,https://www.apartments.com/union-505-albuquerq...,5.0,"A very welcoming community, convenient locatio...",NM 87102,https://www.apartments.com/pmc/greystar/dv0gtnl/,850,2662,324.0,755.0,4.0,0,67.0,NM,ALBUQUERQUE,0.0,0
1,Union 505,https://www.apartments.com/union-505-albuquerq...,3.0,Union 505 is in a great location for Albuquerq...,NM 87102,https://www.apartments.com/pmc/greystar/dv0gtnl/,850,2662,324.0,755.0,4.0,0,71.0,NM,ALBUQUERQUE,0.0,0
2,Union 505,https://www.apartments.com/union-505-albuquerq...,4.0,My Experience here at union505 is good only th...,NM 87102,https://www.apartments.com/pmc/greystar/dv0gtnl/,850,2662,324.0,755.0,4.0,2,124.0,NM,ALBUQUERQUE,1.0,1
3,Union 505,https://www.apartments.com/union-505-albuquerq...,4.0,I love the apartment. Its very cozy and straig...,NM 87102,https://www.apartments.com/pmc/greystar/dv0gtnl/,850,2662,324.0,755.0,4.0,2,129.0,NM,ALBUQUERQUE,1.0,1
4,Union 505,https://www.apartments.com/union-505-albuquerq...,4.0,The apartments are very spacious. The leasing ...,NM 87102,https://www.apartments.com/pmc/greystar/dv0gtnl/,850,2662,324.0,755.0,4.0,2,132.0,NM,ALBUQUERQUE,1.0,1


In [None]:
# Remove the 'Unnamed: 0' column (presumably a saved index from an earlier
# export). Rebinding `df` with errors='ignore' instead of using inplace=True
# keeps this cell idempotent: re-running it no longer raises KeyError once
# the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint, the same
# checkpoint name that BertWithExtraFeatures loads for its encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# One-hot encode the 'state' column. Dense output is requested so the result
# can be wrapped in a DataFrame later; categories not seen while fitting are
# ignored rather than raising.
hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
cat_features = hot_encoder.fit(df[['state']]).transform(df[['state']])
cat_feature_names = hot_encoder.get_feature_names_out(input_features=['state'])

In [None]:
# Wrap the one-hot matrix in a DataFrame that shares `df`'s index.
# pd.concat(axis=1) aligns rows on index labels, so giving `cat_df` a fresh
# RangeIndex would misalign rows (and introduce NaNs) whenever `df` does not
# carry a plain 0..n-1 index, e.g. after filtering rows.
cat_df = pd.DataFrame(cat_features, columns=cat_feature_names, index=df.index)
df_combined = pd.concat([df, cat_df], axis=1)

In [None]:
# Preview the frame with the one-hot state columns appended.
df_combined.head()

Unnamed: 0,apartmentName,url,rating,review,zip,propertyManager,minRent,maxRent,minSqft,maxSqft,...,state_OK,state_OR,state_SC,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI
0,Union 505,https://www.apartments.com/union-505-albuquerq...,5.0,"A very welcoming community, convenient locatio...",NM 87102,https://www.apartments.com/pmc/greystar/dv0gtnl/,850,2662,324.0,755.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Union 505,https://www.apartments.com/union-505-albuquerq...,3.0,Union 505 is in a great location for Albuquerq...,NM 87102,https://www.apartments.com/pmc/greystar/dv0gtnl/,850,2662,324.0,755.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Union 505,https://www.apartments.com/union-505-albuquerq...,4.0,My Experience here at union505 is good only th...,NM 87102,https://www.apartments.com/pmc/greystar/dv0gtnl/,850,2662,324.0,755.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Union 505,https://www.apartments.com/union-505-albuquerq...,4.0,I love the apartment. Its very cozy and straig...,NM 87102,https://www.apartments.com/pmc/greystar/dv0gtnl/,850,2662,324.0,755.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Union 505,https://www.apartments.com/union-505-albuquerq...,4.0,The apartments are very spacious. The leasing ...,NM 87102,https://www.apartments.com/pmc/greystar/dv0gtnl/,850,2662,324.0,755.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Hold out 20% of the rows as the test split; random_state is fixed at 42 so
# the same split is produced on every run.
train_df, test_df = train_test_split(df_combined, test_size=0.2, random_state=42)

In [None]:
# Tokenize the review text of both splits with identical settings: each
# review is truncated or padded to exactly 256 tokens.
train_encodings, test_encodings = (
    tokenizer(list(split['review']), truncation=True, padding='max_length', max_length=256)
    for split in (train_df, test_df)
)

In [None]:
# Pull the one-hot state columns out of each split as float32 tensors.
train_extra_feats, test_extra_feats = (
    torch.tensor(split[cat_feature_names].to_numpy(), dtype=torch.float32)
    for split in (train_df, test_df)
)

In [None]:
class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with extra features and labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from field name to
            a per-sample indexable sequence.
        extra_feats: Per-sample indexable container of additional features;
            the element at ``idx`` is returned as-is.
        labels: Sequence of per-sample labels; its length is the dataset size.
    """

    def __init__(self, encodings, extra_feats, labels):
        self.labels = labels
        self.extra_feats = extra_feats
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Assemble the sample at ``idx`` as a dict.

        Every encoding field and the label are converted with
        ``torch.tensor``; the extra features are indexed without conversion.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['extra_feats'] = self.extra_feats[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Bundle encodings, one-hot state features and the 'kmeans_label' targets
# into one dataset per split.
train_dataset = ReviewDataset(
    encodings=train_encodings,
    extra_feats=train_extra_feats,
    labels=train_df['kmeans_label'].tolist(),
)
test_dataset = ReviewDataset(
    encodings=test_encodings,
    extra_feats=test_extra_feats,
    labels=test_df['kmeans_label'].tolist(),
)

In [None]:
class BertWithExtraFeatures(nn.Module):
    """BERT encoder whose pooled output is concatenated with extra features and classified.

    Args:
        num_labels: Number of output classes (size of the logits' last dimension).
        extra_feat_dim: Width of the extra feature vector appended to the
            pooled BERT output.
        pretrained_name: Checkpoint name passed to ``BertModel.from_pretrained``.
            Defaults to ``'bert-base-uncased'``, the checkpoint used before
            this parameter existed.
    """

    def __init__(self, num_labels, extra_feat_dim, pretrained_name='bert-base-uncased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        # Take the encoder width from the loaded config rather than
        # hard-coding 768, so checkpoints of other sizes work as well.
        hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size + extra_feat_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_labels)
        )

    def forward(self, input_ids, attention_mask, extra_feats, labels=None):
        """Compute logits and, when labels are given, the cross-entropy loss.

        Args:
            input_ids: Token ids forwarded to the BERT encoder.
            attention_mask: Attention mask forwarded to the BERT encoder.
            extra_feats: Extra features concatenated to the pooled output
                along dim 1.
            labels: Optional targets for ``nn.CrossEntropyLoss``.

        Returns:
            dict with ``'loss'`` (``None`` when ``labels`` is ``None``) and
            ``'logits'``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.pooler_output
        combined = torch.cat((cls_output, extra_feats), dim=1)
        # Dropout is applied to the whole concatenated vector, i.e. to the
        # extra features as well as the pooled text representation.
        logits = self.classifier(self.dropout(combined))

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)

        return {'loss': loss, 'logits': logits}

In [None]:
# Size the classifier input from the number of extra feature columns, then
# build the two-class model.
extra_feat_dim = train_extra_feats.size(1)
model = BertWithExtraFeatures(extra_feat_dim=extra_feat_dim, num_labels=2)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def collate_fn(batch):
    """Stack per-sample tensors into batch tensors.

    Args:
        batch: Sequence of sample dicts, each holding tensors under
            'input_ids', 'attention_mask', 'extra_feats' and 'labels'.

    Returns:
        dict with exactly those four keys, each value being the samples'
        tensors stacked along a new leading dimension. Any other keys present
        on the samples are not carried over.
    """
    batched = {}
    for key in ('input_ids', 'attention_mask', 'extra_feats', 'labels'):
        batched[key] = torch.stack([sample[key] for sample in batch])
    return batched

In [None]:
training_args = TrainingArguments(
    # Output locations.
    output_dir='./',
    logging_dir='./logs',
    # Optimisation schedule: three epochs, 500 learning-rate warmup steps,
    # weight decay of 0.01.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Log every 10 steps.
    logging_steps=10,
)

In [None]:
# Wire the model, arguments, both datasets and the custom collator (which
# batches the 'extra_feats' field) into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Run training with the configuration above; the returned TrainOutput is
# displayed as the cell result.
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkars1[0m ([33mkars1-university-of-nevada-las-vegas[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,0.6814
20,0.6944
30,0.7334
40,0.7057
50,0.6972
60,0.7213
70,0.6926
80,0.6915
90,0.7056
100,0.6672


TrainOutput(global_step=12213, training_loss=0.6540850642505778, metrics={'train_runtime': 6039.676, 'train_samples_per_second': 16.176, 'train_steps_per_second': 2.022, 'total_flos': 0.0, 'train_loss': 0.6540850642505778, 'epoch': 3.0})

In [None]:
# Score the held-out split: pick the highest-scoring class for each row and
# report per-class precision/recall against the true 'kmeans_label' values.
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
print(classification_report(y_true=test_df['kmeans_label'].tolist(), y_pred=preds))

              precision    recall  f1-score   support

           0       0.74      0.60      0.66      4139
           1       0.65      0.78      0.71      4003

    accuracy                           0.69      8142
   macro avg       0.69      0.69      0.69      8142
weighted avg       0.70      0.69      0.69      8142



In [None]:
# Toy example: combine RoBERTa text embeddings with a one-hot encoded
# categorical column. NOTE: this cell rebinds the notebook-level names `df`
# and `tokenizer`.
#
# The RoBERTa classes are imported here because no earlier cell imports them;
# without this the cell raises NameError on a fresh kernel.
from transformers import RobertaModel, RobertaTokenizer

# Example DataFrame. 'rating' has 4 distinct categories in this toy frame;
# the same approach is meant to scale to more (e.g. up to 25).
df = pd.DataFrame({
    'text': ["Great place", "Not clean", "Loved it", "Too noisy"],
    'rating': ['Excellent', 'Poor', 'Very Good', 'Terrible'],  # Assume up to 25 unique categories
    'label': [1, 0, 1, 0]
})

# Step 1: Encode text with RoBERTa
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta = RobertaModel.from_pretrained('roberta-base')

def tokenize_text(text_list, max_len=32):
    """Tokenize a list of strings with the module-level `tokenizer`.

    Sequences are truncated to ``max_len`` tokens and padded to the longest
    sequence in the batch; the result is returned as PyTorch tensors.
    """
    return tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors='pt'
    )

inputs = tokenize_text(df['text'].tolist())
# The encoder is only used as a feature extractor here, so no gradients are
# tracked.
with torch.no_grad():
    roberta_output = roberta(**inputs)
    # Hidden state at position 0 of every sequence (the CLS-style token).
    text_embeddings = roberta_output.last_hidden_state[:, 0, :]

# Step 2: One-hot encode the categorical 'rating' column
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
rating_encoded = encoder.fit_transform(df[['rating']])

# Convert to tensor
rating_tensor = torch.tensor(rating_encoded, dtype=torch.float)

# Step 3: Concatenate embeddings
combined_input = torch.cat((text_embeddings, rating_tensor), dim=1)

# Step 4: Define model
class RobertaWithCategoricalFeatures(nn.Module):
    """Two-layer MLP head over concatenated text and categorical features.

    Args:
        text_dim: Width of the text-embedding part of the input.
        cat_dim: Width of the categorical part of the input.
        hidden_dim: Width of the hidden layer (default 64).
    """

    def __init__(self, text_dim, cat_dim, hidden_dim=64):
        super().__init__()
        input_dim = text_dim + cat_dim
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            # Single output unit: one logit per row for binary classification.
            nn.Linear(hidden_dim, 1),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return one logit per row of ``x``."""
        scores = self.fc(x)
        return scores

# Build the head for 768-wide text embeddings plus the one-hot rating width,
# then run a single forward pass. NOTE: this rebinds the notebook-level `model`.
model = RobertaWithCategoricalFeatures(cat_dim=rating_tensor.size(1), text_dim=768)
logits = model(combined_input)

print("Logits:", logits.squeeze())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Logits: tensor([-0.1135, -0.1070, -0.1190, -0.1195], grad_fn=<SqueezeBackward0>)


In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive (repeats the mount done earlier in
# this notebook).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Same input CSV path as the one defined earlier in this notebook.
dataset_loc = "/content/drive/MyDrive/kmeans_binary_labeled_helpfulness_data_final.csv"

In [None]:
# Reload the CSV, replacing whatever `df` currently holds. index_col=False
# asks pandas not to use the first column as the index; per the preview
# below, 'Unnamed: 0' is still loaded as a regular column.
df = pd.read_csv(dataset_loc, index_col=False)

In [None]:
# Preview the reloaded frame.
df.head()

Unnamed: 0,apartmentName,url,rating,review,zip,propertyManager,minRent,maxRent,minSqft,maxSqft,avgRating,helpfulness,days_since_review,state,city,calculated_score,kmeans_label
0,Union 505,https://www.apartments.com/union-505-albuquerq...,5.0,"A very welcoming community, convenient locatio...",NM 87102,https://www.apartments.com/pmc/greystar/dv0gtnl/,850,2662,324.0,755.0,4.0,0,67.0,NM,ALBUQUERQUE,0.0,0
1,Union 505,https://www.apartments.com/union-505-albuquerq...,3.0,Union 505 is in a great location for Albuquerq...,NM 87102,https://www.apartments.com/pmc/greystar/dv0gtnl/,850,2662,324.0,755.0,4.0,0,71.0,NM,ALBUQUERQUE,0.0,0
2,Union 505,https://www.apartments.com/union-505-albuquerq...,4.0,My Experience here at union505 is good only th...,NM 87102,https://www.apartments.com/pmc/greystar/dv0gtnl/,850,2662,324.0,755.0,4.0,2,124.0,NM,ALBUQUERQUE,1.0,1
3,Union 505,https://www.apartments.com/union-505-albuquerq...,4.0,I love the apartment. Its very cozy and straig...,NM 87102,https://www.apartments.com/pmc/greystar/dv0gtnl/,850,2662,324.0,755.0,4.0,2,129.0,NM,ALBUQUERQUE,1.0,1
4,Union 505,https://www.apartments.com/union-505-albuquerq...,4.0,The apartments are very spacious. The leasing ...,NM 87102,https://www.apartments.com/pmc/greystar/dv0gtnl/,850,2662,324.0,755.0,4.0,2,132.0,NM,ALBUQUERQUE,1.0,1


In [None]:
# Remove the 'Unnamed: 0' column (presumably a saved index from an earlier
# export). Rebinding `df` with errors='ignore' instead of using inplace=True
# keeps this cell idempotent: re-running it no longer raises KeyError once
# the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')