In [None]:
# data Cleaning Cell
import pandas as pd
import re


# Rebuild one record per review from a CSV whose review text contains raw
# newlines and commas. Each physical line is classified as either the start of
# a new record ("<digits>,<text>") or a continuation of the record being built.
cleaned_data = []
current_id = None      # ID of the record currently being assembled (None until the first match)
current_text = []      # line fragments collected so far for that record


# A record starts with: start of line (^), one or more digits (\d+), then a comma (,).
# Group 1 is the ID, group 2 is whatever follows the first comma on that line.
# NOTE(review): a continuation line that itself begins with "<digits>," is
# indistinguishable from a new record with this pattern - confirm that is acceptable.
new_row_pattern = re.compile(r'^(\d+),(.*)')


def _store_review(rows, review_id, text_parts):
    """Append one finished review to `rows`, joining its line fragments with single spaces."""
    rows.append({'recommendationid': review_id, 'review_text': " ".join(text_parts)})


print("Starting manual parsing... this handles broken newlines and commas.")

# errors='replace' keeps parsing going when the file contains bytes that are not valid UTF-8.
with open('/content/drive/MyDrive/Steam/reviews only.csv', 'r', encoding='utf-8', errors='replace') as f:
    for line in f:
        line = line.strip()

        # Check if this line is the start of a new row (ID, Text...)
        match = new_row_pattern.match(line)

        if match:
            # 1. SAVE PREVIOUS: if a row was being built, it is complete now
            if current_id is not None:
                _store_review(cleaned_data, current_id, current_text)

            # 2. START NEW: begin a new row with its ID and the first text fragment
            current_id = match.group(1)          # ID (ex., 10000000), kept as a string
            current_text = [match.group(2)]      # start text buffer
        elif current_id is not None:
            # 3. APPEND: this line continues the previous review.
            # Lines seen before the first ID (e.g. a header line) are dropped.
            current_text.append(line)

    # Save the last row; no later ID line exists to trigger its save.
    if current_id is not None:
        _store_review(cleaned_data, current_id, current_text)

# Convert to df. The columns are pinned so the frame keeps its schema even
# when no row matched (an empty list would otherwise yield a column-less frame).
df_clean = pd.DataFrame(cleaned_data, columns=['recommendationid', 'review_text'])

print(f"Successfully reconstructed {len(df_clean)} rows.")


In [None]:
# Preview the first ten reconstructed reviews. A bare last expression lets the
# notebook render the frame as a table instead of print()'s plain-text dump.
df_clean.head(10)

In [None]:
# Independent copy of the first 100 rows of df_clean, for quick experiments.
demo = df_clean.iloc[:100].copy()

In [None]:
import pandas as pd
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
import torch
from tqdm.auto import tqdm
import os
import time

In [None]:
# Configuration for the scoring run
# CSV on Drive that receives the scored rows.
drive_path = '/content/drive/MyDrive/Steam/reviews_scored_partial.csv'
# Row counts controlling how often the run reports progress and saves to Drive.
status_update_interval = 10_000  # print status every 10k rows
batch_save_size = 50_000         # save to Drive every 50k rows

In [None]:
import pandas as pd
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
import torch
from tqdm.auto import tqdm
import os
import time


# Score every review in df_clean with a 1-5 star sentiment model and stream the
# results to `drive_path` in batches.
# Reads from other cells: df_clean, drive_path, batch_save_size, status_update_interval.

# Fail fast: check the input BEFORE touching the output file, so a missing
# df_clean cannot wipe results saved by an earlier run.
if 'df_clean' not in locals():
    raise ValueError("df_clean is missing! run the 'Data Cleaning' cell again.")

# delete previous file
if os.path.exists(drive_path):
    print(f" Deleting bad file: {drive_path}")
    os.remove(drive_path)

# Write only the header row; every batch below is appended with header=False.
pd.DataFrame(columns=['recommendationid', 'numeric_score', 'category']).to_csv(drive_path, index=False)
print("Created fresh output file.")

# model setup
device = 0 if torch.cuda.is_available() else -1   # 0 = first CUDA device, -1 = CPU
print(f"Using Device: {'GPU üöÄ' if device == 0 else 'CPU üê¢'}")

sentiment_pipeline = pipeline(
    task='sentiment-analysis',
    model='nlptown/bert-base-multilingual-uncased-sentiment',
    device=device,
    truncation=True,
    max_length=512,
    batch_size=32
)

# Maps the parsed star count to the category label written to the CSV.
star_map = {1: 'Worse', 2: 'Bad', 3: 'Neutral', 4: 'Good', 5: 'Best'}

# processing loop
print(f"Processing {len(df_clean)} rows...")

# initialize data stream: KeyDataset feeds only the "review_text" field to the pipeline
data_stream = KeyDataset(df_clean.to_dict('records'), "review_text")

# IDs by position, extracted once. The loop pairs the i-th pipeline output with
# the i-th input row, so it relies on outputs arriving in input order.
recommendation_ids = df_clean['recommendationid'].tolist()

current_batch = []
start_time = time.time()

print("-" * 50)

# iterate over pipeline results
pipeline_iterator = sentiment_pipeline(data_stream, batch_size=32)

for i, out in tqdm(enumerate(pipeline_iterator), total=len(df_clean)):

    # print the first row's raw output as a sanity check
    if i == 0:
        print(f"\n[DEBUG CHECK] Row 0 Output: {out}")
        # ex {'label': '1 star', 'score': ...}

    try:
        # The output may be a dict or a list of dicts; use the first entry of a list.
        if isinstance(out, list):
            result = out[0]
        else:
            result = out

        # The leading token of the label is the star count (e.g. '1 star' -> 1).
        star = int(result['label'].split()[0])
        category = star_map.get(star, 'Unknown')

    except Exception as e:
        # Best effort: record the row as an error instead of aborting the whole run.
        # Only failures within the first five rows are printed.
        if i < 5: print(f" Error on row {i}: {e}")
        star = -1
        category = "Error"

    # save result
    current_batch.append({
        'recommendationid': recommendation_ids[i],
        'numeric_score': star,
        'category': category
    })

    # Progress report. The original referenced an undefined `status_interval`;
    # the configured name is `status_update_interval`.
    if (i + 1) % status_update_interval == 0:
        elapsed = time.time() - start_time
        print(f"Status: {i + 1} rows done. (Last 10k took: {elapsed:.2f}s)")
        start_time = time.time()

    # Periodically flush to Drive so an interrupted run keeps the rows already saved.
    if (i + 1) % batch_save_size == 0:
        pd.DataFrame(current_batch).to_csv(drive_path, mode='a', header=False, index=False)
        print(f"   SAVED to Drive: {i + 1} rows safe.")
        current_batch = []

# Final Save: rows left over after the last full batch
if current_batch:
    pd.DataFrame(current_batch).to_csv(drive_path, mode='a', header=False, index=False)
    print("   Final Batch Saved.")

print(f"\n COMPLETE! File saved at: {drive_path}")

In [None]:
# Load a results CSV from Drive and preview its first ten rows.
# NOTE(review): this reads "reviews_scored_final.csv", while the scoring cell in
# this notebook writes to drive_path ("reviews_scored_partial.csv") - confirm the
# final file is produced or renamed elsewhere.
res1 = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/Steam/reviews_scored_final.csv")
res1.head(10)