In [8]:
import pandas as pd
import numpy as np # Digunakan jika ada NaN di kolom kategorikal
import re # Import modul regular expression

# --- 1. Configuration ---
input_csv_path = '../../data/main_dataset/dataset_dm_raw.csv'  # REPLACE WITH THE PATH TO YOUR CSV FILE
output_prefix = 'transformed_reviews'

# --- 2. Load the data ---
try:
    # Read the file assuming the standard comma separator
    df = pd.read_csv(input_csv_path)
    print(f"Berhasil memuat data dari: {input_csv_path}")
    # Show the dataset info and the first rows for verification
    print("\nInfo Dataset Awal:")
    df.info()
    print("\nContoh Data Awal:")
    print(df.head())

except FileNotFoundError:
    print(f"Error: File tidak ditemukan di '{input_csv_path}'")
    # Re-raise so the cell stops with a traceback. exit() would terminate the
    # kernel/interpreter (reporting success) instead of surfacing the failure.
    raise
except Exception as e:
    print(f"Error saat membaca CSV: {e}")
    print("Pastikan file CSV valid dan path-nya benar.")
    # Same reasoning as above: propagate the original exception.
    raise

# --- 3. Preprocessing and feature extraction ---

# Work on a copy so the freshly loaded frame is never modified by accident
df_processed = df.copy()

# Add a review_id column as the first column (row index + 1)
df_processed.insert(0, 'review_id', df_processed.index + 1)

# --- 3.1 App-version grouping (revised logic) ---
print("\nMelakukan grouping versi aplikasi (Logika Diperbaiki)...")

def group_app_version(version_str):
    """Collapse an app version into a coarse 'X.Y' group label.

    Only the first digit after the first dot is kept, so 'X.Y.Z' and
    'X.YY.Z' both become 'X.Y' (e.g. '4.80.1' -> '4.8').

    Returns:
        'Unknown_Version' when the input is missing (NaN/None),
        'X.Y'             when the text starts with digits, a dot and a digit,
        'X.0'             when the text consists of digits only,
        'Other_Version'   for any other text,
        'Error_Version'   if an unexpected exception occurs while parsing.
    """
    if pd.isna(version_str):
        return 'Unknown_Version'
    try:
        text = str(version_str)

        # Leading "<major>.<single minor digit>"
        major_minor = re.match(r'^(\d+\.\d)', text)
        if major_minor is not None:
            return major_minor.group(1)

        # Major-only values such as '4' or '5' get '.0' appended for consistency
        major_only = re.match(r'^(\d+)$', text)
        if major_only is None:
            # Anything else (e.g. 'Beta') falls into a catch-all bucket
            return 'Other_Version'
        return major_only.group(1) + '.0'
    except Exception:
        # Unexpected failure while converting or matching the value
        return 'Error_Version'

# Derive the grouped-version column from appVersion
df_processed['major_minor_version'] = df_processed['appVersion'].apply(group_app_version)

print("Grouping versi selesai. Contoh hasil grouping:")
# Preview rows whose raw version contains a dot followed by two or more digits,
# i.e. the cases where the one-digit truncation actually changes the value
print(
    df_processed.loc[
        df_processed['appVersion'].astype(str).str.contains(r'\.\d{2,}', na=False),
        ['appVersion', 'major_minor_version'],
    ].head(10)
)
print("\nNilai unik hasil grouping versi (seharusnya lebih sedikit):")
# Sorted by label so the groups are easy to scan
print(
    df_processed['major_minor_version']
    .value_counts()
    .sort_index()
)

# --- 3.2 Further preprocessing (including the new time features) ---

# Convert 'at' to datetime
try:
    df_processed['at'] = pd.to_datetime(df_processed['at'])
except Exception as e:
    print(f"\nError saat mengonversi kolom 'at' ke datetime: {e}")
    print("Pastikan format tanggal di kolom 'at' konsisten.")
    # Re-raise so the cell stops with a traceback. exit() would terminate the
    # kernel/interpreter (reporting success) instead of surfacing the failure.
    raise

# --- New time features: hour_of_day and day_of_week ---
print("\nMengekstrak fitur hour_of_day dan day_of_week...")
df_processed['hour_of_day'] = df_processed['at'].dt.hour  # numeric, 0-23
df_processed['day_of_week'] = df_processed['at'].dt.strftime('%a')  # abbreviated day name ('Mon', 'Tue', ..., 'Sun')

print("Ekstraksi fitur waktu selesai.")
print(df_processed[['at', 'hour_of_day', 'day_of_week']].head())

# Make sure the columns that will be one-hot encoded contain no NaN
cols_to_check_nan = ['score', 'major_minor_version', 'pred_emosi', 'pred_sentimen', 'hour_of_day', 'day_of_week']
for col in cols_to_check_nan:
    # Skip columns that are absent or already complete
    if col not in df_processed.columns or not df_processed[col].isnull().any():
        continue

    print(f"Peringatan: Ditemukan NaN di kolom '{col}'. Mengisi dengan 'Unknown'.")
    column = df_processed[col]
    if isinstance(column.dtype, pd.CategoricalDtype):
        # The .cat accessor exists only on category dtype; the fill value must
        # be a registered category before fillna can use it.
        if 'Unknown' not in column.cat.categories:
            column = column.cat.add_categories('Unknown')
    elif not pd.api.types.is_object_dtype(column):
        # Numeric (or other typed) column: cast to object first so the string
        # placeholder can be stored without relying on implicit upcasting.
        column = column.astype(object)
    # Object columns need no preparation and are filled directly.
    df_processed[col] = column.fillna('Unknown')


# Cast every column that will be encoded to str so the dummy-column labels are consistent
cols_to_encode_str = ['score', 'major_minor_version', 'pred_emosi', 'pred_sentimen', 'hour_of_day', 'day_of_week']
for col in (name for name in cols_to_encode_str if name in df_processed.columns):
    df_processed[col] = df_processed[col].astype(str)


print("\nContoh Data Setelah Pra-pemrosesan Lengkap (dengan hour/weekday):")
print(
    df_processed[
        ['review_id', 'score', 'major_minor_version', 'hour_of_day', 'day_of_week', 'pred_emosi', 'pred_sentimen']
    ].head()
)


# --- 4. Transformation and saving helper ---
# (The transform_and_save function itself needs no changes)
def transform_and_save(df_input, columns_to_include, prefixes, output_filename):
    """
    One-hot encode the selected columns and write the result to a CSV file.

    Args:
        df_input: Source DataFrame; 'review_id' is kept as an identifier column.
        columns_to_include: List of column names to encode; names missing from
            df_input are silently dropped.
        prefixes: Passed to pandas.get_dummies as `prefix`.
        output_filename: Destination CSV path (written without the index).

    Returns:
        None. Any exception raised during encoding or saving is caught and
        reported with a printed message instead of being propagated.
    """
    print(f"\nMemproses untuk: {output_filename}...")
    wanted = ['review_id'] + columns_to_include
    df_subset = df_input[[name for name in wanted if name in df_input.columns]].copy()
    try:
        valid_columns_to_encode = [name for name in columns_to_include if name in df_subset.columns]
        if valid_columns_to_encode:
            binary_df = pd.get_dummies(
                df_subset,
                columns=valid_columns_to_encode,
                prefix=prefixes,
                prefix_sep='=',
                dtype=int,
            )
            # Keep review_id as the leading column
            has_id = 'review_id' in binary_df.columns
            if has_id and binary_df.columns[0] != 'review_id':
                remaining = [name for name in binary_df.columns if name != 'review_id']
                binary_df = binary_df[['review_id'] + remaining]
        else:
            print(f"Tidak ada kolom valid untuk di-encode di {output_filename}. Menyimpan hanya review_id.")
            binary_df = df_subset[['review_id']]

        print(f"Jumlah kolom hasil transformasi: {len(binary_df.columns)}")
        print("Contoh hasil transformasi:")
        print(binary_df.head())
        binary_df.to_csv(output_filename, index=False)
        print(f"Berhasil menyimpan ke: {output_filename}")
    except Exception as e:
        print(f"Error saat transformasi atau penyimpanan untuk {output_filename}: {e}")


# --- 5. Run the transformation for the scenario ---
# score + grouped version (fixed) + hour_of_day + day_of_week + emotion + sentiment
transform_and_save(
    df_processed,
    ['score', 'major_minor_version', 'hour_of_day', 'day_of_week', 'pred_emosi', 'pred_sentimen'],
    {
        'score': 'score',
        'major_minor_version': 'version',
        'hour_of_day': 'hour',
        'day_of_week': 'weekday',
        'pred_emosi': 'emotion',
        'pred_sentimen': 'sentiment',
    },
    f"{output_prefix}_score_groupedVersionFixed_hourWeekday_nlp.csv",  # file name reflects the feature set
)

print("\n--- Proses Selesai ---")


Berhasil memuat data dari: ../../data/main_dataset/dataset_dm_raw.csv

Info Dataset Awal:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224988 entries, 0 to 224987
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   userName       224988 non-null  object
 1   content        224986 non-null  object
 2   score          224988 non-null  int64 
 3   appVersion     224988 non-null  object
 4   at             224988 non-null  object
 5   pred_emosi     224988 non-null  object
 6   pred_sentimen  224988 non-null  object
dtypes: int64(1), object(6)
memory usage: 12.0+ MB

Contoh Data Awal:
                  userName                                            content  \
0                Yuga Edit                            akun gopay saya di blok   
1                 ff burik  Lambat sekali sekarang ini bosssku apk gojek g...   
2  Anisa Suci Rahmayuliani  Kenapa sih dari kemarin sy buka aplikasi gojek...   
3         

In [2]:
import pandas as pd

# Read the raw review dataset from CSV
file_path = '../../data/main_dataset/dataset_dm_raw.csv'  # Replace with the path to your file
df = pd.read_csv(file_path)

# Report the total number of rows
total_rows = df.shape[0]
print(f"Total row: {total_rows}")

# Disabled example: select rows matching a condition (here, neutral sentiment)
# jumlah_aktif = df[df['pred_sentimen'] == 'Neutral']
# print(f"Jumlah row dengan status 'netral': {jumlah_aktif}")
df




Total row: 224988


Unnamed: 0,userName,content,score,appVersion,at,pred_emosi,pred_sentimen
0,Yuga Edit,akun gopay saya di blok,1,4.9.3,2022-01-21 10:52:12,Sad,Negative
1,ff burik,Lambat sekali sekarang ini bosssku apk gojek g...,3,4.9.3,2021-11-30 15:40:38,Sad,Negative
2,Anisa Suci Rahmayuliani,Kenapa sih dari kemarin sy buka aplikasi gojek...,4,4.9.3,2021-11-29 22:58:12,Sad,Negative
3,naoki yakuza,Baru download gojek dan hape baru trus ditop u...,1,4.9.3,2022-09-03 15:21:17,Sad,Negative
4,Trio Sugianto,Mantap,5,4.9.3,2022-01-15 10:05:27,Happy,Positive
...,...,...,...,...,...,...,...
224983,Sad Gamer,Gofood Biaya lain2ya gak ngotak mending hujan2...,1,4.0.0,2023-02-15 09:37:58,Anger,Negative
224984,fadhil fadil,Yok lah,5,4.0.0,2021-12-11 12:28:20,Neutral,Neutral
224985,g sugiarto,Sempurna,5,4.0.0,2022-03-14 01:45:39,Neutral,Neutral
224986,J i H A D 'B E,GOJEK LAMA LAMA GAK JELAS LAGI PESEN MASA MAP...,1,4.0.0,2021-12-24 08:48:51,Sad,Negative


In [3]:
# Load three previously exported one-hot encoded review tables.
# Relies on `pd` having been imported by an earlier executed cell.
# NOTE(review): none of these file names is written by the transform cell shown
# in this notebook (it writes a single
# '<output_prefix>_score_groupedVersionFixed_hourWeekday_nlp.csv' file) —
# presumably they come from earlier runs; confirm the files exist before running.
df1 = pd.read_csv('transformed_reviews_score_version.csv')
df2 = pd.read_csv('transformed_reviews_score_version_time.csv')
df3 = pd.read_csv('transformed_reviews_score_version_time_nlp.csv')

In [4]:
# Display the table loaded from 'transformed_reviews_score_version.csv'
df1

Unnamed: 0,review_id,score=1,score=2,score=3,score=4,score=5,version=4.0.0,version=4.0.1,version=4.0.2,version=4.1.0,...,version=4.80.1,version=4.80.2,version=4.80.3,version=4.80.4,version=4.81.1,version=4.81.2,version=4.82.1,version=4.9.0,version=4.9.1,version=4.9.3
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224983,224984,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
224984,224985,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
224985,224986,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
224986,224987,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Display the table loaded from 'transformed_reviews_score_version_time.csv'
df2

Unnamed: 0,review_id,score=1,score=2,score=3,score=4,score=5,version=4.0.0,version=4.0.1,version=4.0.2,version=4.1.0,...,month=Oct,month=Sep,year=2021,year=2022,year=2023,year=2024,time=Afternoon,time=Evening,time=Morning,time=Night
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,2,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,3,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,4,1,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,1,0,0,0
4,5,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224983,224984,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
224984,224985,0,0,0,0,1,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
224985,224986,0,0,0,0,1,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
224986,224987,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [6]:
# Display the table loaded from 'transformed_reviews_score_version_time_nlp.csv'
df3

Unnamed: 0,review_id,score=1,score=2,score=3,score=4,score=5,version=4.0.0,version=4.0.1,version=4.0.2,version=4.1.0,...,time=Night,emotion=Anger,emotion=Fear,emotion=Happy,emotion=Love,emotion=Neutral,emotion=Sad,sentiment=Negative,sentiment=Neutral,sentiment=Positive
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
1,2,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
2,3,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
3,4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
4,5,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224983,224984,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
224984,224985,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
224985,224986,0,0,0,0,1,1,0,0,0,...,1,0,0,0,0,1,0,0,1,0
224986,224987,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,1,0,0
