#### Importing the dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import json
import pandas as pd

path_reviews = '/content/review-Alaska_10.json'
file=[]
with open(path_reviews, 'r', encoding='utf-8') as g:
    for line in g:
        try:
            file.append(json.loads(line))
        except json.JSONDecodeError:
            continue

df= pd.DataFrame(file)
display(df.head())

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,109129804842686204152,Nicki Gore,1566331951619,5,We always stay here when in Valdez for silver ...,,,0x56b646ed2220b77f:0xd8975e316de80952
1,113240926405758965692,Allen Ratliff,1504917982385,5,Great campground for the price. Nice hot unlim...,,,0x56b646ed2220b77f:0xd8975e316de80952
2,113044837891141253355,Jonathan Tringali,1474765901185,4,We tent camped here for 2 nights while explori...,,,0x56b646ed2220b77f:0xd8975e316de80952
3,110329155147592031570,S Blad,1472858535682,4,"This place is just a few miles outside Valdez,...",,,0x56b646ed2220b77f:0xd8975e316de80952
4,108989634908602011119,Daniel Formoso,1529649811341,5,Probably the nicest and cleanest campground we...,,,0x56b646ed2220b77f:0xd8975e316de80952


In [13]:
col=df.columns
print(col)

Index(['user_id', 'name', 'time', 'rating', 'text', 'pics', 'resp', 'gmap_id'], dtype='object')


In [22]:
import numpy as np

numeric_cols = df.select_dtypes(include=[np.number, np.int32, np.int64, np.float32, np.float64]).columns.tolist()


numeric_cols = ['time', 'rating', 'num_images', 'review_len', 'response_present','review_year', 'review_month', 'review_day', 'review_weekday','review_hour', 'user_review_count', 'user_reviews_at_place']

print (numeric_cols)

['time', 'rating', 'num_images', 'review_len', 'response_present', 'review_year', 'review_month', 'review_day', 'review_weekday', 'review_hour', 'user_review_count', 'user_reviews_at_place']


### Helper Columns

In [19]:
# Count number of pictures
df['pics'] = df['pics'].apply(lambda x: x if isinstance(x, list) else [])
df['num_images'] = df['pics'].apply(len)

# Length of review
df['review_len'] = df['text'].fillna('').apply(lambda x: len(x.split()))

# If there is response
df['response_present'] = df['resp'].apply(lambda x: 0 if x is None else 1)

# Time of review
df['review_time'] = pd.to_datetime(df['time'], unit='ms')

df['review_year'] = df['review_time'].dt.year
df['review_month'] = df['review_time'].dt.month
df['review_day'] = df['review_time'].dt.day
df['review_weekday'] = df['review_time'].dt.weekday
df['review_hour'] = df['review_time'].dt.hour

# Number of reviews a person makes
user_counts = df['user_id'].value_counts()
df['user_review_count'] = df['user_id'].map(user_counts)

# Number of reviews user makes at the same place
place_counts = df.groupby(['user_id', 'gmap_id']).size().reset_index(name='user_reviews_at_place')
df = df.merge(place_counts, on=['user_id', 'gmap_id'], how='left')

df.head()

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id,num_images,review_len,response_present,review_time,review_year,review_month,review_day,review_weekday,review_hour,user_review_count,user_reviews_at_place
0,109129804842686204152,Nicki Gore,1566331951619,5,We always stay here when in Valdez for silver ...,[],,0x56b646ed2220b77f:0xd8975e316de80952,0,30,0,2019-08-20 20:12:31.619,2019,8,20,1,20,38,2
1,113240926405758965692,Allen Ratliff,1504917982385,5,Great campground for the price. Nice hot unlim...,[],,0x56b646ed2220b77f:0xd8975e316de80952,0,21,0,2017-09-09 00:46:22.385,2017,9,9,5,0,43,2
2,113044837891141253355,Jonathan Tringali,1474765901185,4,We tent camped here for 2 nights while explori...,[],,0x56b646ed2220b77f:0xd8975e316de80952,0,137,0,2016-09-25 01:11:41.185,2016,9,25,6,1,16,2
3,110329155147592031570,S Blad,1472858535682,4,"This place is just a few miles outside Valdez,...",[],,0x56b646ed2220b77f:0xd8975e316de80952,0,61,0,2016-09-02 23:22:15.682,2016,9,2,4,23,23,2
4,108989634908602011119,Daniel Formoso,1529649811341,5,Probably the nicest and cleanest campground we...,[],,0x56b646ed2220b77f:0xd8975e316de80952,0,11,0,2018-06-22 06:43:31.341,2018,6,22,4,6,49,2


### Loading trained model

In [20]:
import torch
from transformers import AutoTokenizer, AutoModel
from torch import nn

# Paths to saved models and tokenizer
bert_numeric_model_path = "/content/drive/MyDrive/tiktok/best_model.pt"
bert_tokenizer_path = "/content/drive/MyDrive/tiktok/bert_tokenizer.pth"

#loading the pretrained model
tokenizer = AutoTokenizer.from_pretrained(bert_tokenizer_path)
class BertWithNumeric(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased', num_numeric_features=12, num_labels=3):
        super().__init__()
        self.bert = AutoModel.from_pretrained(bert_model_name)
        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size + num_numeric_features, 128),
            nn.ReLU(),
            nn.Linear(128, num_labels)
        )

    def forward(self, input_ids, attention_mask, numeric_features):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        combined = torch.cat([cls_embedding, numeric_features], dim=1)
        logits = self.classifier(combined)
        return logits

# Load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertWithNumeric(num_numeric_features=12, num_labels=3)
model.load_state_dict(torch.load(bert_numeric_model_path, map_location=device))
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertWithNumeric(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

#### Identifying ads and rants for a more balanced dataset

In [23]:
import re
from textblob import TextBlob


def check_advertisement(text):
    text = str(text)
    text_lower = text.lower()
    WHITELIST = ['bomb.com']
    if any(domain in text_lower for domain in WHITELIST):
        return 0
    if re.search(r"(https?://\S+|www\.\S+|\b\S+\.(com|net|org|io|co|biz)\b)", text_lower):
        return 1
    return 0

from textblob import TextBlob
import re

def check_rant(text):
    text = str(text)
    text_lower = text.lower()
    rant_patterns = [
        r"never been (here|there|to)",
        r"haven't (visited|gone)",
        r"didn't visit",
        r"never went",
        r"never gone",
        r"not visited"
    ]
    positive_context = [
        r"i had a wonderful time",
        r"i would highly recommend",
        r"enjoyed",
        r"fun",
        r"amazing",
        r"great experience"
    ]
    if any(re.search(p, text_lower) for p in rant_patterns):
        if any(re.search(p, text_lower) for p in positive_context):
            return 0
        # Only calculate sentiment if it might be a rant based on patterns
        sentiment = TextBlob(text).sentiment.polarity
        if sentiment < -0.1:
            return 1
    return 0


df['rule_ad'] = df['text'].apply(check_advertisement)
df['rule_rant'] = df['text'].apply(check_rant)

print(df['rule_ad'].value_counts())
print(df['rule_rant'].value_counts())

rule_ad
0    521484
1        31
Name: count, dtype: int64
rule_rant
0    521505
1        10
Name: count, dtype: int64


In [24]:
ads=df[df['rule_ad']==1]
rants=df[df['rule_rant']==1]
df_random=df.sample(n=500-41, random_state=0).reset_index(drop=True)
df_random=pd.concat([ads, rants, df_random]).reset_index(drop=True)

#### Prediction of labels

In [25]:
import torch
import pandas as pd



#preprocessing
numeric_data=df_random[numeric_cols].fillna(0).values
numeric_features=torch.tensor(numeric_data, dtype=torch.float32)

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(bert_tokenizer_path)
encoded = tokenizer(
    df_random['text'].astype(str).tolist(),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

input_ids,attention_mask=encoded["input_ids"],encoded["attention_mask"]


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
numeric_features=numeric_features.to(device)
input_ids=input_ids.to(device)
attention_mask=attention_mask.to(device)


model.eval()
with torch.no_grad():
    logits = model(input_ids, attention_mask, numeric_features)
    predictions = torch.argmax(logits, dim=1)


label_map = {0: "ad", 1: "rant", 2: "normal"}
predicted_output= [label_map[p.item()] for p in predictions]

df_random["predicted_label"] = predicted_output

#output
df_random[["text", "predicted_label"]].head()
print(df_random['predicted_label'].value_counts())


predicted_label
normal    500
Name: count, dtype: int64


#### Manual Ground truth labelling

In [26]:
df_sample_truth = df_random.sample(50, random_state=42).copy()
df_sample_truth['truth_label'] = None

for i, row in df_sample_truth.iterrows():
    print(f"Review: {row['text']}\n")
    label = input("Enter label (ad/rant/normal): ").strip().lower()
    df_sample_truth.at[i, 'truth_label'] = label


Review: Very good burgers, decent beer selection, although weak on imports. Service attentive, but food slow to come out of the kitchen.

Enter label (ad/rant/normal): normal
Review: Always awesome but too freaking hot

Enter label (ad/rant/normal): normal
Review: Without a doubt the most poorly managed HD store ive ever been in, and ive been in quite a few. Listless employees who view your question as an irritation, give you half an answer,  or negative body language if you interrupt their conversation with a co worker. This is not just from one visit, but nearly every time i go. I have sinced learned over multiple visits to seek out and sometimes wait a considerable time to engage those few that do actually want to be helpful,  sometimes on the other end of the store. Home Depot if you are listening send some secret shoppers after 5pm on a weeday and see that if what i say is true.

Enter label (ad/rant/normal): normal
Review: We love Boom! Come here almost everyday, love that they a

In [27]:
df_sample_truth['predicted_label']= df_random.loc[df_sample_truth.index, 'predicted_label']
df_sample_truth.head()

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id,num_images,review_len,...,review_month,review_day,review_weekday,review_hour,user_review_count,user_reviews_at_place,rule_ad,rule_rant,predicted_label,truth_label
361,117749820110390509871,Tom Reale,1532142422690,3,"Very good burgers, decent beer selection, alth...",[],,0x56c899d6f8747fb1:0xa09590480fd69f18,0,21,...,7,21,5,3,76,1,0,0,normal,normal
73,116493309364864889323,Lynda Parker,1596773486750,5,Always awesome but too freaking hot,[],,0x56c89787c10200e5:0xc7409e59ca0db14d,0,6,...,8,7,4,4,50,1,0,0,normal,normal
374,105484017921869108041,A101stNCO,1566056426943,1,Without a doubt the most poorly managed HD sto...,[],,0x5400e04416caf75b:0x104b7e77025fc898,0,119,...,8,17,5,15,22,1,0,0,normal,normal
155,117511613153564121711,Brady Beers,1581738944113,5,"We love Boom! Come here almost everyday, love ...",[],,0x56c89792b5a8770b:0xbbd478ec813d7c26,0,19,...,2,15,5,3,29,1,0,0,normal,normal
104,114838479273269236095,mike bruner,1542358167600,2,,[],,0x51324fe830814b1d:0x69d0b12fe5d04894,0,0,...,11,16,4,8,52,1,0,0,normal,normal


#### Classification Report and Conclusion

In [28]:
from sklearn.metrics import classification_report
labels=['ad', 'rant', 'normal']
print(classification_report(df_sample_truth['truth_label'], df_sample_truth['predicted_label'], labels=labels))

              precision    recall  f1-score   support

          ad       0.00      0.00      0.00         3
        rant       0.00      0.00      0.00         1
      normal       0.92      1.00      0.96        46

    accuracy                           0.92        50
   macro avg       0.31      0.33      0.32        50
weighted avg       0.85      0.92      0.88        50



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


This notebook takes in another dataset as input to the trained model. After comparing the predicted output of the inputs as well as the ground truths that were manually labelled, a few key observations can be made.

1. The model does not perform optimally on minority classes. While oversampling of minority classes was used in the training data, the model did not perform optimally in predicting 'ads' and 'rants'. From the manual labelling, it can be observed that there are 'ads' and 'rants' but the model was unable to predict those.

This could be owed to the original data set itself where the data set is highly skewed and the review quality was not optimum for classficiation. Several reviews had no text and just numerical inputs. While valid, they did not provide great significance to the nature of the review. Using better datasets would have led to a better prediction of 'ads' and 'rants'.
