In [30]:
!pip install langchain langchain-community OpenAI indoNLP emoji nlpaug Sastrawi sentence-transformers transformers torch



In [31]:
!pip install \
    "pinecone" \
    "langchain-pinecone" \
    "langchain-openai" \
    "langchain-text-splitters" \
    "langchain"



In [32]:
!python -m spacy download xx_ent_wiki_sm

Collecting xx-ent-wiki-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.7.0/xx_ent_wiki_sm-3.7.0-py3-none-any.whl (11.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('xx_ent_wiki_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# **Import Modules**

In [56]:
import pandas as pd
import numpy as np
import re
import nltk
import os
import time
import pinecone
import warnings
import spacy

from langchain_pinecone import PineconeVectorStore, PineconeEmbeddings
from langchain.schema import Document
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer


warnings.filterwarnings("ignore")

In [54]:
load_dotenv()

open_api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")

# **Import Data**

In [35]:
df = pd.read_csv("/content/dataset_ppn-12-persen_cnn.csv")

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      103 non-null    object
 1   content    101 non-null    object
 2   link       103 non-null    object
 3   timestamp  103 non-null    object
dtypes: object(4)
memory usage: 3.3+ KB


# **Preprocessing Data**

In [37]:
df.dropna(subset=['content'], inplace=True)

In [38]:
def data_cleaning_pipeline(text: str):
    step_1 = re.sub(r'ADVERTISEMENT SCROLL TO CONTINUE WITH CONTENT', '', text, flags=re.IGNORECASE)
    step_2 = re.sub(r'\xa0', ' ', step_1)
    step_3 = re.sub(r'\[Gambas:Video CNN\]', '', step_2)

    return step_3.strip()

In [39]:
df['content'] = df['content'].apply(data_cleaning_pipeline)

# **Feature Engineering**

In [40]:
def feature_engineering(text):
    char_count = len(text)
    word_count = len(text.split())
    sentence_count = len(text.split("."))
    average_word_length = round(sum(len(word) for word in text.split()) / word_count)
    token_count = round(len(text) / 6)

    return char_count, word_count, sentence_count, average_word_length, token_count

In [41]:
df['char_count'], df['word_count'], df['sentence_count'], df['average_word_length'], df['token_count'] = zip(*df['content'].apply(feature_engineering))

In [42]:
nlp = spacy.load("xx_ent_wiki_sm")
nlp.add_pipe('sentencizer')

def split_sentence_using_spacy(text):
    doc = nlp(text)
    data = []
    for sent in doc.sents:

        if len(sent.text) != 0:
            data.append(sent.text)

    return data

In [43]:
df['sentences'] = df['content'].apply(split_sentence_using_spacy)

In [44]:
df.describe().round(1)

Unnamed: 0,char_count,word_count,sentence_count,average_word_length,token_count
count,101.0,101.0,101.0,101.0,101.0
mean,2294.8,319.5,21.9,6.1,382.4
std,1442.0,194.3,13.5,0.4,240.3
min,196.0,29.0,4.0,5.0,33.0
25%,1559.0,224.0,15.0,6.0,260.0
50%,2095.0,289.0,19.0,6.0,349.0
75%,2571.0,365.0,25.0,6.0,428.0
max,10625.0,1440.0,86.0,7.0,1771.0


# **Split sentence into small chunks**

In [45]:
num_sentence_chunk_size = 10

def split_list(input_list: list[str],
               slice_size: int=num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

def chunks(text):
    sentence_chunks = split_list(input_list=text, slice_size=num_sentence_chunk_size)
    num_chunks = len(sentence_chunks)

    return sentence_chunks, num_chunks

df['sentence_chunks'], df['num_chunks'] = zip(*df['sentences'].apply(chunks))

In [46]:
df.iloc[0]['sentence_chunks']

[['Ketua DPRPuan Maharanimewanti-wanti potensi gelombang pemutusan hubungan kerja (PHK) buntut keputusan pemerintah yang akan memberlakukan kenaikan pajak pertambahan nilai(PPN) 12 persenmulai 1 Januari 2025.',
  'Puan meyakini kenaikan tersebut akan berdampak pada sektor usaha.',
  'Imbasnya, industri manufaktur, UMKM, dan sektor padat karya akan turun akibat penurunan daya beli masyarakat.',
  '  "Pada akhirnya roda ekonomi di sektor riil berpotensi melambat yang dikhawatirkan memicu gelombang PHK di tahun-tahun mendatang," kata Puan dalam keterangannya, Kamis (19/12).',
  'Puan menilai kenaikan PPN 12 persen juga akan berdampak pada penurunan daya beli rumah tangga.',
  'Kalkulasi Puan, angka penurunan tersebut bisa mencapai 0,37 persen atau Rp40,68 triliun.',
  'Menurut Puan kondisi itu akan menggerus produk domestik bruto (PDB) hingga Rp65,33 triliun.',
  'Puan khawatir hal itu bisa memperburuk ketimpangan ekonomi antara kelompok kaya dan miskin. "',
  'Sektor padat karya seperti 

In [47]:
df.describe().round(2)

Unnamed: 0,char_count,word_count,sentence_count,average_word_length,token_count,num_chunks
count,101.0,101.0,101.0,101.0,101.0,101.0
mean,2294.82,319.5,21.87,6.11,382.45,2.34
std,1442.02,194.34,13.54,0.37,240.31,1.24
min,196.0,29.0,4.0,5.0,33.0,1.0
25%,1559.0,224.0,15.0,6.0,260.0,2.0
50%,2095.0,289.0,19.0,6.0,349.0,2.0
75%,2571.0,365.0,25.0,6.0,428.0,3.0
max,10625.0,1440.0,86.0,7.0,1771.0,9.0


In [48]:
pages_and_chunks = []
for index, data in df.iterrows():
    for chunk in data['sentence_chunks']:
        chunk_dict = {}
        chunk_dict["title"] = data["title"]

        joined_sentence_chunk = "".join(chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4

        pages_and_chunks.append(chunk_dict)

pages_and_chunks_df = pd.DataFrame(pages_and_chunks)

In [49]:
pages_and_chunks_df.describe().round(2)

Unnamed: 0,chunk_char_count,chunk_word_count,chunk_token_count
count,236.0,236.0,236.0
mean,980.23,136.65,245.06
std,427.47,59.53,106.87
min,46.0,6.0,11.5
25%,695.5,100.75,173.88
50%,1012.5,140.0,253.12
75%,1235.0,172.0,308.75
max,2459.0,386.0,614.75


In [50]:
min_token_length = 30
for row in pages_and_chunks_df[pages_and_chunks_df["chunk_token_count"] <= min_token_length].sample().iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 13.5 | Text: Baru akan naik menjadi 12 persen mulai 1 Januari 2025.


In [51]:
pages_and_chunks_over_min_token_len = pages_and_chunks_df[pages_and_chunks_df["chunk_token_count"] > min_token_length].to_dict(orient="records")

# **Create Document**

In [52]:
documents = [Document(metadata={"Title": item['title'], "chunk_char_count" : item['chunk_char_count'],
             "chunk_word_count" : item['chunk_word_count'], "chunk_token_count" : item['chunk_token_count']},
              page_content=item['sentence_chunk']) for item in pages_and_chunks_over_min_token_len]

# **Embeddings**

In [55]:
model_name = 'multilingual-e5-large'
embeddings = PineconeEmbeddings(
    model=model_name,
    pinecone_api_key=pinecone_api_key
)

ValidationError: 1 validation error for PineconeEmbeddings
pinecone_api_key
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type

# **Pinecone Vector DB**

In [None]:
pc = Pinecone(
        api_key=os.getenv("PINECONE_API_KEY")
    )


index_name = 'rag-ppn-12'

if index_name not in pc.list_indexes().names():
    pc.create_index(
            name=index_name,
            dimension=768,
            metric='cosine',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-east-1'
            )
        )

    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

In [None]:
namespace = "PPN12%"

docsearch = PineconeVectorStore.from_documents(
    documents=documents,
    index_name=index_name,
    embedding=embeddings,
    namespace=namespace
)