In [None]:
import os
import pandas as pd
import numpy as np
import torch
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Set base path - use notebook directory
BASE_PATH = os.path.dirname(os.path.abspath('__file__')) if '__file__' in dir() else os.getcwd()
DATASET_PATH = os.path.join(BASE_PATH, 'dataset', 'googleplaystore_user_reviews.csv')
MODEL_SAVE_PATH = os.path.join(BASE_PATH, 'saved_model')

print(f"Base Path: {BASE_PATH}")
print(f"Dataset Path: {DATASET_PATH}")
print(f"Model Save Path: {MODEL_SAVE_PATH}")

Base Path: C:\Users\muham\Project\nlp-ki
Dataset Path: C:\Users\muham\Project\nlp-ki\dataset\googleplaystore_user_reviews.csv
Model Save Path: C:\Users\muham\Project\nlp-ki\saved_model


In [14]:
# Load dataset
df = pd.read_csv(DATASET_PATH)

print(f"Original dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
df.head()

Original dataset shape: (64295, 5)

Columns: ['App', 'Translated_Review', 'Sentiment', 'Sentiment_Polarity', 'Sentiment_Subjectivity']

First few rows:


Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [15]:
# Check for required columns
print(f"\nChecking data quality...")
print(f"Null values per column:")
print(df.isnull().sum())
print(f"\nSentiment distribution:")
print(df['Sentiment'].value_counts())


Checking data quality...
Null values per column:
App                           0
Translated_Review         26868
Sentiment                 26863
Sentiment_Polarity        26863
Sentiment_Subjectivity    26863
dtype: int64

Sentiment distribution:
Sentiment
Positive    23998
Negative     8271
Neutral      5163
Name: count, dtype: int64


In [16]:
# Preprocessing
# Drop rows with missing reviews or sentiment
df = df.dropna(subset=['Translated_Review', 'Sentiment'])

# Convert sentiment to numeric labels
sentiment_map = {
    'Positive': 2,
    'Neutral': 1,
    'Negative': 0
}

df['label'] = df['Sentiment'].map(sentiment_map)
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)

# Rename text column for clarity
df['text'] = df['Translated_Review']

# Filter out very short reviews (less than 3 words)
df = df[df['text'].str.split().str.len() >= 3]

print(f"\nCleaned dataset shape: {df.shape}")
print(f"\nLabel distribution:")
print(df['label'].value_counts())


Cleaned dataset shape: (33222, 7)

Label distribution:
label
2    21240
0     7963
1     4019
Name: count, dtype: int64


In [17]:
# Balance dataset (optional - sample equal amounts from each class)
min_count = df['label'].value_counts().min()
print(f"\nBalancing dataset to {min_count} samples per class...")

df_balanced = df.groupby('label').sample(n=min(min_count, 10000), random_state=42)
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Balanced dataset shape: {df_balanced.shape}")
print(f"\nBalanced label distribution:")
print(df_balanced['label'].value_counts())


Balancing dataset to 4019 samples per class...
Balanced dataset shape: (12057, 7)

Balanced label distribution:
label
2    4019
0    4019
1    4019
Name: count, dtype: int64


In [18]:
# Train-validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_balanced['text'].tolist(),
    df_balanced['label'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df_balanced['label']
)

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")

Training samples: 9645
Validation samples: 2412


In [19]:
# Load tokenizer
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)

print(f"Loaded tokenizer: {model_name}")

Loaded tokenizer: roberta-base


In [20]:
# Tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=128
    )

# Create datasets
train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels})
val_dataset = Dataset.from_dict({'text': val_texts, 'label': val_labels})

# Tokenize
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set format
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

print("Datasets tokenized and formatted successfully!")

Map:   0%|          | 0/9645 [00:00<?, ? examples/s]

Map:   0%|          | 0/2412 [00:00<?, ? examples/s]

Datasets tokenized and formatted successfully!


In [21]:
# Load model for sequence classification
print("Loading RoBERTa model (this may take a few minutes for first download)...")
model = RobertaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)
print("Model loaded successfully!")

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU detected. Training will use CPU (this will be slower).")

Loading RoBERTa model (this may take a few minutes for first download)...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully!
Using device: cpu
No GPU detected. Training will use CPU (this will be slower).


In [22]:
# Define metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted'
    )
    acc = accuracy_score(labels, predictions)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [23]:
# Training arguments
training_args = TrainingArguments(
    output_dir=os.path.join(BASE_PATH, 'training_output'),
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=os.path.join(BASE_PATH, 'logs'),
    logging_steps=100,
    eval_strategy='epoch',  # Changed from evaluation_strategy
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    report_to='none'
)

print("Training configuration set!")

Training configuration set!


In [24]:
# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print("Trainer initialized successfully!")

Trainer initialized successfully!


  trainer = Trainer(


In [25]:
# Start training
print("\n" + "="*50)
print("Starting model training...")
print("="*50 + "\n")

trainer.train()


Starting model training...





Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4645,0.4168,0.856965,0.857479,0.865075,0.856965
2,0.3026,0.318124,0.906302,0.906511,0.907487,0.906302
3,0.1723,0.321639,0.925788,0.926004,0.927106,0.925788




TrainOutput(global_step=1809, training_loss=0.40225221505805625, metrics={'train_runtime': 4210.7898, 'train_samples_per_second': 6.872, 'train_steps_per_second': 0.43, 'total_flos': 1903296685489920.0, 'train_loss': 0.40225221505805625, 'epoch': 3.0})

In [26]:
# Evaluate
print("\n" + "="*50)
print("Evaluating model on validation set...")
print("="*50 + "\n")

eval_results = trainer.evaluate()

print("\nValidation Results:")
for key, value in eval_results.items():
    print(f"{key}: {value:.4f}")



Evaluating model on validation set...






Validation Results:
eval_loss: 0.3216
eval_accuracy: 0.9258
eval_f1: 0.9260
eval_precision: 0.9271
eval_recall: 0.9258
eval_runtime: 70.6405
eval_samples_per_second: 34.1450
eval_steps_per_second: 1.0760
epoch: 3.0000


In [27]:
# Create directory if it doesn't exist
os.makedirs(MODEL_SAVE_PATH, exist_ok=True)

# Save model and tokenizer
print(f"\nSaving model to: {MODEL_SAVE_PATH}")

trainer.save_model(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)

print("\n" + "="*50)
print("Model and tokenizer saved successfully!")
print("="*50)

# Verify saved files
saved_files = os.listdir(MODEL_SAVE_PATH)
print(f"\nSaved files: {saved_files}")


Saving model to: C:\Users\muham\Project\nlp-ki\saved_model

Model and tokenizer saved successfully!

Saved files: ['config.json', 'merges.txt', 'model.safetensors', 'special_tokens_map.json', 'tokenizer_config.json', 'training_args.bin', 'vocab.json']

Model and tokenizer saved successfully!

Saved files: ['config.json', 'merges.txt', 'model.safetensors', 'special_tokens_map.json', 'tokenizer_config.json', 'training_args.bin', 'vocab.json']


In [28]:
# Test with sample predictions
test_texts = [
    "This app is amazing! I love it so much!",
    "The app keeps crashing. Very disappointed.",
    "It's okay, nothing special but works fine."
]

# Tokenize
inputs = tokenizer(test_texts, padding=True, truncation=True, return_tensors='pt')

# Move to device
if device.type == 'cuda':
    model = model.to(device)
    inputs = {k: v.to(device) for k, v in inputs.items()}

# Predict
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)

# Map predictions back to labels
label_map = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}

print("\nTest Predictions:")
print("="*50)
for text, pred in zip(test_texts, predictions):
    print(f"Text: {text}")
    print(f"Predicted Sentiment: {label_map[pred.item()]}")
    print("-" * 50)


Test Predictions:
Text: This app is amazing! I love it so much!
Predicted Sentiment: Positive
--------------------------------------------------
Text: The app keeps crashing. Very disappointed.
Predicted Sentiment: Negative
--------------------------------------------------
Text: It's okay, nothing special but works fine.
Predicted Sentiment: Positive
--------------------------------------------------


In [29]:
# Test Google Play Scraper
from google_play_scraper import app, search, Sort, reviews

# Test search
print("Testing app search...")
try:
    results = search("instagram", lang='en', country='us', n_hits=3)
    print(f"Found {len(results)} apps")
    for r in results[:3]:
        print(f"- {r['title']} ({r['appId']})")
except Exception as e:
    print(f"Search error: {e}")

# Test review scraping
print("\nTesting review scraping...")
try:
    app_id = "com.instagram.android"  # Instagram app ID
    result, token = reviews(
        app_id,
        lang='en',
        country='us',
        sort=Sort.NEWEST,
        count=10
    )
    print(f"Fetched {len(result)} reviews")
    if result:
        print(f"First review: {result[0]['content'][:100]}...")
except Exception as e:
    print(f"Scraping error: {e}")

Testing app search...
Found 3 apps
- Instagram (None)
- Threads (com.instagram.barcelona)
- Edits: Video Editor (com.instagram.basel)

Testing review scraping...
Found 3 apps
- Instagram (None)
- Threads (com.instagram.barcelona)
- Edits: Video Editor (com.instagram.basel)

Testing review scraping...
Fetched 10 reviews
First review: exhalent for Instagram thanks...
Fetched 10 reviews
First review: exhalent for Instagram thanks...
