# In [None]:
# ================================
# 1. Setup & Import
# ================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Show every column when a DataFrame is printed or displayed.
pd.set_option("display.max_columns", None)

# ================================
# 2. Load Dataset
# ================================
# on_bad_lines="skip" drops malformed rows instead of aborting the whole load.
df = pd.read_csv(
    "tokopedia-product-reviews-2019.csv",
    on_bad_lines="skip",
    encoding="utf-8",
)

print("Shape:", df.shape)
print("Columns:", df.columns)

# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell; print it so the preview is actually shown at this point.
print(df.head())

# ================================
# 3. Quick EDA
# ================================
print("\n--- Info Dataset ---")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the report.
df.info()

print("\n--- Null Values ---")
print(df.isna().sum())

# Add the review length in characters. Missing reviews are treated as empty
# text; casting NaN straight to str would give "nan" and a bogus length of 3.
df["text_len"] = df["text"].fillna("").astype(str).str.len()

print("\n--- Statistik Panjang Review ---")
print(df["text_len"].describe())

# Rating distribution
print("\n--- Distribusi Rating ---")
print(df["rating"].value_counts())

# ================================
# 4. Plot rating and review-length distributions
# ================================
# Casting a column that contains NaN (or unparseable text) to int raises, so
# coerce bad values to NaN and keep only finite ratings before the cast.
numeric_ratings = pd.to_numeric(df["rating"], errors="coerce")
valid_ratings = numeric_ratings[np.isfinite(numeric_ratings)].astype(int)

plt.figure()
# Bin edges on the half-integers give exactly one bar per rating value 1..5.
valid_ratings.plot(kind="hist", bins=[0.5, 1.5, 2.5, 3.5, 4.5, 5.5])
plt.title("Rating Distribution (1-5)")
plt.xlabel("Rating")
plt.ylabel("Frequency")
plt.show()

plt.figure()
# Only reviews of at most 300 characters are plotted; longer ones are left
# out of this histogram.
df.loc[df["text_len"] <= 300, "text_len"].plot(kind="hist", bins=40)
plt.title("Text Length Distribution (<=300 chars)")
plt.xlabel("Text length (characters)")
plt.ylabel("Frequency")
plt.show()

# ================================
# 5. Build sentiment labels (derived from rating)
# rule: rating <=2 = neg, 3 = neu, >=4 = pos
# ================================
def map_sentiment(r):
    """Map a review rating to a sentiment label.

    Args:
        r: Rating value. Anything numeric or a numeric string is accepted
            (e.g. ``4``, ``4.0``, ``"4"``, ``"4.0"``); fractional values are
            truncated toward zero before classification.

    Returns:
        ``"neg"`` for ratings <= 2, ``"neu"`` for exactly 3, ``"pos"`` for
        anything >= 4, or ``np.nan`` when the rating cannot be interpreted
        as a finite number (None, NaN, infinity, non-numeric text).
    """
    try:
        # Go through float() so decimal strings such as "4.0" parse as well;
        # int() then raises ValueError for NaN and OverflowError for infinity.
        rating = int(float(r))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "no usable rating". A bare except
        # would also hide KeyboardInterrupt and genuine programming errors.
        return np.nan

    # No range check: values outside 1..5 still fall into neg / pos.
    if rating <= 2:
        return "neg"
    if rating == 3:
        return "neu"
    return "pos"

# Label every row by running its rating through map_sentiment.
df["sentiment"] = df["rating"].map(map_sentiment)

# Heading and per-label counts, each on its own line.
print("\n--- Distribusi Sentiment ---", df["sentiment"].value_counts(), sep="\n")

# ================================
# 6. Save the clean version
# ================================
# Columns carried into the exported file, in output order.
keep_cols = [
    "text",
    "rating",
    "sentiment",
    "category",
    "product_name",
    "product_id",
    "sold",
    "shop_id",
    "product_url",
    "text_len",
]

# Take an independent copy so later changes to df_clean never touch df.
df_clean = df.loc[:, keep_cols].copy()
df_clean.to_csv("tokopedia_reviews_clean.csv", index=False)
print("Dataset cleaned disimpan ke tokopedia_reviews_clean.csv")

# Final expression of the cell: a notebook renders this preview of df.
df.head()