In [1]:
!pip install langchain langchain-community OpenAI indoNLP emoji nlpaug Sastrawi sentence-transformers transformers torch

Collecting langchain-community
  Downloading langchain_community-0.3.13-py3-none-any.whl.metadata (2.9 kB)
Collecting indoNLP
  Downloading indoNLP-0.3.4-py3-none-any.whl.metadata (3.4 kB)
Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain
  Downloading langchain-0.3.13-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.25 (from langchain)
  Downloading langchain_core-0.3.28-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Download

In [2]:
# Download the xx_ent_wiki_sm spaCy model package via the spaCy CLI. The
# recorded output notes that the runtime may need a restart before the newly
# installed package can be loaded.
!python -m spacy download xx_ent_wiki_sm

Collecting xx-ent-wiki-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.7.0/xx_ent_wiki_sm-3.7.0-py3-none-any.whl (11.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xx-ent-wiki-sm
Successfully installed xx-ent-wiki-sm-3.7.0
✔ Download and installation successful
You can now load the package via spacy.load('xx_ent_wiki_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# **Import Modules**

In [112]:
import pandas as pd
import numpy as np
import datetime as dt
import re
import nltk
import shutil
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm.auto as tqdm
import spacy
import json
import os
import torch
import emoji
import random


from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from indoNLP.preprocessing import emoji_to_words, replace_slang, pipeline
from __future__ import print_function
from transformers import BertTokenizerFast, AutoTokenizer, AutoModelForSequenceClassification, pipeline
from transformers import TFBertModel
from transformers import RobertaTokenizerFast
from transformers import TFRobertaModel
from transformers import pipeline
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Silence every Python warning (all categories) for the rest of the session.
warnings.filterwarnings("ignore")

# **Import Data**

In [99]:
# Read the dataset CSV from a hardcoded absolute /content path into `df`.
df = pd.read_csv("/content/dataset_ppn-12-persen_cnn.csv")

In [100]:
# Print the column names, non-null counts and dtypes to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      103 non-null    object
 1   content    101 non-null    object
 2   link       103 non-null    object
 3   timestamp  103 non-null    object
dtypes: object(4)
memory usage: 3.3+ KB


# **Preprocessing Data**

In [101]:
# Drop the rows whose `content` is missing, mutating `df` in place, so the
# cleaning and feature steps that follow never receive a missing value.
df.dropna(subset=['content'], inplace=True)

In [102]:
def data_cleaning_pipeline(text: str):
    """Strip known boilerplate from one piece of article text.

    The substitutions run in order:
      1. remove the "ADVERTISEMENT SCROLL TO CONTINUE WITH CONTENT" banner
         (matched case-insensitively),
      2. turn every non-breaking space (\\xa0) into a regular space,
      3. remove the literal "[Gambas:Video CNN]" marker (case-sensitive).

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # (pattern, replacement, regex flags) applied one after another.
    substitutions = (
        (r'ADVERTISEMENT SCROLL TO CONTINUE WITH CONTENT', '', re.IGNORECASE),
        (r'\xa0', ' ', 0),
        (r'\[Gambas:Video CNN\]', '', 0),
    )

    cleaned = text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    return cleaned.strip()

In [103]:
# Overwrite each row's `content` with its cleaned version.
df['content'] = df['content'].apply(data_cleaning_pipeline)

# **Feature Engineering**

In [105]:
def feature_engineering(text):
    """Compute simple size statistics for one piece of text.

    Args:
        text: The text to measure.

    Returns:
        A 5-tuple ``(char_count, word_count, sentence_count,
        average_word_length, token_count)`` where:
          * char_count is ``len(text)``;
          * word_count is the number of whitespace-separated words;
          * sentence_count is the number of pieces obtained by splitting on
            "." (so a trailing period contributes one extra, empty piece);
          * average_word_length is the mean word length rounded to the
            nearest integer, or 0 when the text contains no words;
          * token_count is a rough estimate, ``len(text) / 6`` rounded.
    """
    words = text.split()

    char_count = len(text)
    word_count = len(words)
    sentence_count = len(text.split("."))

    # Empty or whitespace-only text has no words; report 0 rather than
    # dividing by zero.
    if word_count:
        average_word_length = round(sum(len(word) for word in words) / word_count)
    else:
        average_word_length = 0

    token_count = round(char_count / 6)

    return char_count, word_count, sentence_count, average_word_length, token_count

In [106]:
# feature_engineering returns one 5-tuple per row; zip(*...) transposes those
# tuples into five column-wise sequences, one per new feature column.
df['char_count'], df['word_count'], df['sentence_count'], df['average_word_length'], df['token_count'] = zip(*df['content'].apply(feature_engineering))

In [107]:
# Load the xx_ent_wiki_sm spaCy pipeline and append a 'sentencizer' component
# to it; the sentence spans (`doc.sents`) are consumed by
# split_sentence_using_spacy.
nlp = spacy.load("xx_ent_wiki_sm")
nlp.add_pipe('sentencizer')

def split_sentence_using_spacy(text):
    """Split a text into its sentences with the module-level spaCy pipeline.

    Args:
        text: The text to segment.

    Returns:
        A list with the text of every sentence span spaCy produced, in
        document order, skipping spans whose text is empty.
    """
    return [span.text for span in nlp(text).sents if span.text]

In [108]:
# Store each row's list of sentence strings in a new `sentences` column.
df['sentences'] = df['content'].apply(split_sentence_using_spacy)

In [109]:
# Summary statistics of the numeric feature columns, rounded to one decimal.
df.describe().round(1)

Unnamed: 0,char_count,word_count,sentence_count,average_word_length,token_count
count,101.0,101.0,101.0,101.0,101.0
mean,2294.8,319.5,21.9,6.1,382.4
std,1442.0,194.3,13.5,0.4,240.3
min,196.0,29.0,4.0,5.0,33.0
25%,1559.0,224.0,15.0,6.0,260.0
50%,2095.0,289.0,19.0,6.0,349.0
75%,2571.0,365.0,25.0,6.0,428.0
max,10625.0,1440.0,86.0,7.0,1771.0


# **Split sentences into small chunks**

In [114]:
# Number of sentences grouped into one chunk.
num_sentence_chunk_size = 10

def split_list(input_list: list[str],
               slice_size: int=num_sentence_chunk_size) -> list[list[str]]:
    """Partition a list into consecutive slices of at most `slice_size` items.

    Args:
        input_list: The items to partition.
        slice_size: Maximum number of items per slice; defaults to
            `num_sentence_chunk_size`.

    Returns:
        The slices in original order. Every slice holds `slice_size` items
        except possibly the last, which holds the remainder. An empty input
        yields an empty list.
    """
    groups = []
    for start in range(0, len(input_list), slice_size):
        groups.append(input_list[start:start + slice_size])
    return groups

def chunks(text):
    """Group a list of sentences into chunks and count the chunks.

    Args:
        text: The list of sentences to group (passed to `split_list` as its
            `input_list`).

    Returns:
        A pair ``(sentence_chunks, num_chunks)``: the list of chunks, each
        holding up to `num_sentence_chunk_size` sentences, and how many
        chunks there are.
    """
    grouped = split_list(text, num_sentence_chunk_size)
    return grouped, len(grouped)

# chunks returns a (sentence_chunks, num_chunks) pair per row; zip(*...)
# transposes those pairs into the two new columns.
df['sentence_chunks'], df['num_chunks'] = zip(*df['sentences'].apply(chunks))

In [128]:
# Summary statistics again, now including num_chunks, rounded to two decimals.
df.describe().round(2)

Unnamed: 0,char_count,word_count,sentence_count,average_word_length,token_count,num_chunks
count,101.0,101.0,101.0,101.0,101.0,101.0
mean,2294.82,319.5,21.87,6.11,382.45,2.34
std,1442.02,194.34,13.54,0.37,240.31,1.24
min,196.0,29.0,4.0,5.0,33.0,1.0
25%,1559.0,224.0,15.0,6.0,260.0,2.0
50%,2095.0,289.0,19.0,6.0,349.0,2.0
75%,2571.0,365.0,25.0,6.0,428.0,3.0
max,10625.0,1440.0,86.0,7.0,1771.0,9.0
