In [None]:
# Import Library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Mount Google Drive so files stored there are reachable from Colab
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Build the dataset location from its folder and file name
folder_path = '/content/drive/My Drive/bangkit/project/dataset/food-recipes/'
file_name = 'food_recipes.csv'
file_path = folder_path + file_name

# Read the CSV into `dataset`; when the file is missing this cell does not
# assign `dataset` at all and only reports the path that was tried.
try:
    dataset = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"File tidak ditemukan di lokasi: {file_path}. Periksa kembali path file Anda!")
else:
    print("Dataset berhasil dimuat!")

# Step 3: Explore the dataset
# Only run the exploration when the load step actually defined `dataset`
if 'dataset' in locals():
    # Dataset dimensions as (rows, columns)
    print(f"Dimensi Dataset: {dataset.shape}")
    # First 5 rows
    print("5 Baris Pertama Dataset:")
    print(dataset.head())
    # Column names, followed by the row count
    print("Kolom Dataset:")
    print(dataset.columns)
    print(dataset.shape[0])
    # Per-column dtype / non-null summary
    print("Informasi Dataset:")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() would emit an extra "None" line after the report.
    dataset.info()
    # Number of missing values in each column
    print("Jumlah Nilai Kosong di Setiap Kolom:")
    print(dataset.isnull().sum())
else:
    print("Dataset tidak berhasil dimuat. Periksa kembali file CSV Anda.")


Mounted at /content/drive
Dataset berhasil dimuat!
Dimensi Dataset: (15593, 7)
5 Baris Pertama Dataset:
                      Title  \
0          Ayam Woku Manado   
1  Ayam goreng tulang lunak   
2          Ayam cabai kawin   
3               Ayam Geprek   
4               Minyak Ayam   

                                         Ingredients  \
0  1 Ekor Ayam Kampung (potong 12)--2 Buah Jeruk ...   
1  1 kg ayam (dipotong sesuai selera jangan kecil...   
2  1/4 kg ayam--3 buah cabai hijau besar--7 buah ...   
3  250 gr daging ayam (saya pakai fillet)--Secuku...   
4  400 gr kulit ayam & lemaknya--8 siung bawang p...   

                                               Steps  Loves  \
0  Cuci bersih ayam dan tiriskan. Lalu peras jeru...      1   
1  Haluskan bumbu2nya (BaPut, ketumbar, kemiri, k...      1   
2  Panaskan minyak di dalam wajan. Setelah minyak...      2   
3  Goreng ayam seperti ayam krispi--Ulek semua ba...     10   
4  Cuci bersih kulit ayam. Sisihkan--Ambil 50 ml ...     

In [None]:
# prompt: check the amount of data in the dataset

# Report how much data the dataset holds: rows, columns, and total cells.
if 'dataset' not in locals():
    print("Dataset not loaded. Please check the file path and try again.")
else:
    # shape is a (rows, columns) pair, so both counts come from one lookup
    num_rows, num_cols = dataset.shape[0], dataset.shape[1]
    print(f"Number of rows: {num_rows}")
    print(f"Number of columns: {num_cols}")
    print(f"Total number of data points: {num_rows * num_cols}")

Number of rows: 15593
Number of columns: 7
Total number of data points: 109151


In [None]:
!pip install Sastrawi



In [None]:
# Show the column labels of `dataset` (the variable must already exist from an earlier cell)
print(dataset.columns)

Index(['Title', 'Ingredients', 'Steps', 'Loves', 'URL', 'category',
       'valid_url'],
      dtype='object')


In [None]:
!pip install Sastrawi

import pandas as pd
import os
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory  # Perbaikan di sini

# Step 1: Instantiate the Sastrawi stemmer through its factory
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Step 2: Locate the input CSV and the checkpoint file, then read the CSV
file_path = '/content/drive/My Drive/bangkit/project/dataset/food-recipes/food_recipes.csv'  # adjust to where the dataset file lives in Colab
checkpoint_path = '/content/drive/My Drive/bangkit/project/dataset/food-recipes/processed_checkpoint.csv'

try:
    dataset = pd.read_csv(file_path)
except FileNotFoundError:
    # Leave an explicit None so later steps can test for a missing dataset
    dataset = None
    print(f"File not found at location: {file_path}")
else:
    print("Dataset successfully loaded!")
# Step 3: Define stemming function with checkpoint support
def stem_with_checkpoint(dataframe, text_column, checkpoint_file, start_index=0, text_stemmer=None):
    """
    Stem one text column row by row, persisting progress to a checkpoint CSV.

    If `checkpoint_file` already exists it is loaded and processing resumes
    after the rows it contains, so no row is stemmed (or appended) twice.
    The checkpoint is assumed to hold the results for the first
    `len(checkpoint)` rows of `dataframe`, in order.

    Args:
        dataframe: Original DataFrame.
        text_column: Column containing the text to stem.
        checkpoint_file: Path of the CSV used to save/restore progress.
        start_index: Positional row index to start from. When a checkpoint
            exists, processing starts at whichever is larger: this value or
            the number of rows already in the checkpoint.
        text_stemmer: Optional object exposing `stem(text)`. Defaults to the
            module-level `stemmer`.
    Returns:
        DataFrame with the columns `text_column` and 'stemmed_text',
        containing the checkpointed rows followed by the newly stemmed rows.
        If a row fails, the error is printed and the rows gathered so far
        are returned.
    """
    # Resolve lazily so the module-level `stemmer` is only needed when the
    # caller does not inject one.
    active_stemmer = stemmer if text_stemmer is None else text_stemmer
    result_columns = [text_column, 'stemmed_text']

    if os.path.exists(checkpoint_file):
        # Load checkpoint if it exists and skip the rows it already covers
        processed_data = pd.read_csv(checkpoint_file)
        resume_index = max(start_index, len(processed_data))
        print(f"Checkpoint found. Resuming from index {resume_index}.")
    else:
        # Create new DataFrame if no checkpoint exists
        processed_data = pd.DataFrame(columns=result_columns)
        resume_index = start_index
        print("No checkpoint found. Starting from scratch.")

    def _merge(done, pending_rows):
        """Return `done` extended with the buffered (original, stemmed) pairs."""
        if not pending_rows:
            return done
        fresh = pd.DataFrame(pending_rows, columns=result_columns)
        if done.empty:
            return fresh
        return pd.concat([done, fresh], ignore_index=True)

    last_index = len(dataframe) - 1
    # Rows stemmed since the last checkpoint; buffering them avoids rebuilding
    # the whole result frame once per row.
    pending = []

    for idx in range(resume_index, len(dataframe)):
        try:
            # Positional access: `idx` is a counter, not an index label
            original_text = dataframe[text_column].iloc[idx]

            if not isinstance(original_text, str) and pd.isna(original_text):
                # A missing value cannot be stemmed; record an empty stem so
                # one bad cell does not abort the run and rows stay aligned.
                stemmed_text = ""
            else:
                stemmed_text = active_stemmer.stem(original_text)  # Apply stemming

            pending.append((original_text, stemmed_text))

            # Save checkpoint every 10 rows or at the end
            if idx % 10 == 0 or idx == last_index:
                processed_data = _merge(processed_data, pending)
                pending = []
                processed_data.to_csv(checkpoint_file, index=False)
                print(f"Checkpoint saved at index {idx}.")

        except Exception as e:
            print(f"Error at index {idx}: {e}")
            break

    # Include rows stemmed after the last checkpoint (only non-empty when the
    # loop stopped early on an error).
    return _merge(processed_data, pending)

# Step 4: Run the stemming step and write the result, handling the two
# failure cases (no dataset, missing column) first.
if dataset is None:
    print("Dataset not available. Please check the dataset file.")
else:
    text_column = "Ingredients"  # change to the column that should be stemmed

    if text_column not in dataset.columns:
        print(f"Column '{text_column}' not found in the dataset.")
    else:
        processed_data = stem_with_checkpoint(dataset, text_column, checkpoint_path, start_index=0)
        print("Stemming process completed!")

        # Write the stemmed result to its final location
        final_output_path = '/content/drive/My Drive/bangkit/project/dataset/food-recipes/final_dataset.csv'
        processed_data.to_csv(final_output_path, index=False)
        print(f"Final processed dataset saved at: {final_output_path}")


Dataset successfully loaded!
Checkpoint found. Resuming from index 8222.
Checkpoint saved at index 0.
Checkpoint saved at index 10.
Checkpoint saved at index 20.
Checkpoint saved at index 30.
Checkpoint saved at index 40.
Checkpoint saved at index 50.
Checkpoint saved at index 60.
Checkpoint saved at index 70.
Checkpoint saved at index 80.
Checkpoint saved at index 90.
Checkpoint saved at index 100.
Checkpoint saved at index 110.
Checkpoint saved at index 120.
Checkpoint saved at index 130.
Checkpoint saved at index 140.
Checkpoint saved at index 150.
Checkpoint saved at index 160.
Checkpoint saved at index 170.
Checkpoint saved at index 180.
Checkpoint saved at index 190.
Checkpoint saved at index 200.
Checkpoint saved at index 210.
Checkpoint saved at index 220.
Checkpoint saved at index 230.
Checkpoint saved at index 240.
Checkpoint saved at index 250.
Checkpoint saved at index 260.
Checkpoint saved at index 270.
Checkpoint saved at index 280.
Checkpoint saved at index 290.
Checkpoi