In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
import seaborn as sns

# DATASET NEWSROOM

In [None]:
# Load the newsroom author table from Google Drive (one row per author).
authors_df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Dataset/Newsroom_data.csv',
)

In [None]:
# Preview the first ten rows to sanity-check the columns that were loaded.
authors_df.head(10)

Unnamed: 0,Name,Copywriting,Design,Available Days,Nasional,Internasional,Health,Finance,Technology,Gaming,Sports,Entertainment
0,Aditya,4,2,"Senin, Selasa, Rabu, Kamis, Jumat, Sabtu, Minggu",4,5,5.0,2.0,3.0,0.5,0,0.5
1,Andhika Mifta Alauddin,4,3,"Senin, Selasa, Rabu, Kamis, Jumat, Sabtu, Minggu",5,4,1.0,1.0,3.0,3.0,5,3.0
2,Ni Nyoman Ayu Sintya Dewi,4,3,"Senin, Selasa, Rabu, Jumat, Sabtu",4,3,3.0,4.0,4.0,1.0,2,3.0
3,Dewa Bagus Trima Putra,4,5,"Rabu, Jumat",4,4,3.0,5.0,4.0,3.0,3,5.0
4,Patma Ari Ayu Kartini,4,4,"Selasa, Jumat",4,4,4.0,4.0,4.0,3.0,3,4.0
5,Risa Pebriyanthi,4,2,"Selasa, Sabtu",5,3,3.0,4.5,2.0,1.0,1,2.0
6,Suci Hastika Salma'aini,5,5,"Rabu, Kamis",5,4,3.0,4.5,4.5,2.0,3,3.0
7,Ni Luh Santi Wahyuni,4,2,Rabu,5,3,2.0,4.0,2.0,1.0,1,2.5
8,Andre Winata,4,4,"Senin, Rabu, Minggu",4,4,1.5,2.0,5.0,4.0,3,2.0
9,Iga Narendra Pramawijaya,4,1,"Sabtu, Minggu",3,4,2.0,4.0,5.0,3.0,3,1.0


# Modeling Content-Based Filtering

In [None]:
!pip install Sastrawi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [None]:
# import library
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt

import string
import nltk
import re
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
# Fetch the NLTK data packages: 'stopwords' backs stopwords.words(...) and
# 'punkt' is the tokenizer data (not referenced by the cells visible here).
# The return value of the last call is what the cell displays.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Predict Topics

In [None]:
def clean_text(text):
  """Normalise raw Indonesian text into a space-separated string of stems.

  Steps, in order: replace ASCII punctuation with spaces, lower-case and
  split on whitespace, drop Indonesian stop words and tokens shorter than
  three characters, replace every character outside the regex whitelist
  with a space, then stem each remaining token with Sastrawi.

  Args:
    text: The raw text (for example a news title) as a ``str``.

  Returns:
    The cleaned text as one string of stems joined by single spaces.
  """
  # Remove punctuation. str.translate needs a table built by str.maketrans;
  # passing string.punctuation itself (as this code did before) left all
  # punctuation in place and instead rewrote control characters (tab -> '*',
  # newline -> '+'). Punctuation becomes a space rather than being deleted so
  # that "a,b" does not collapse into one token.
  # NOTE(review): this changes tokenisation compared with the old no-op; if
  # model.h5 / word_index.json were produced with the old behaviour, re-check
  # predictions on punctuated input.
  punctuation_to_space = str.maketrans(
      string.punctuation, " " * len(string.punctuation))
  text = text.translate(punctuation_to_space)
  # Convert words to lower case and split them
  tokens = text.lower().split()
  # Remove stop words and tokens shorter than three characters
  stops = set(stopwords.words("indonesian"))
  tokens = [w for w in tokens if w not in stops and len(w) >= 3]
  text = " ".join(tokens)
  # Clean the text: anything outside the whitelist becomes a space
  text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
  # Stemming
  stemmer = StemmerFactory().create_stemmer()
  return " ".join(stemmer.stem(word) for word in text.split())

In [None]:
# Sequence settings handed to pad_sequences when encoding text for the model.
MAXLEN, PADDING = 120, 'post'
OOV_TOKEN = "<OOV>"

# Topic labels that get zipped against the model's output vector.
# NOTE(review): the order presumably mirrors the label encoding used at
# training time - confirm before reordering.
features = [
    "Finance",
    "Gaming",
    "Entertainment",
    "Health",
    "Sports",
    "Technology",
]

# Load the saved Keras model (HDF5 file) from Google Drive.
model = tf.keras.models.load_model('/content/drive/MyDrive/Dataset/model.h5')
# model.load_weights("/content/drive/MyDrive/Dataset/model_weights.h5")

In [None]:
import json

# Load word_index from the JSON file. It is used below to look up the value
# for each token (presumably token -> integer id; confirm against training).
# The encoding is given explicitly so the read does not depend on the
# platform default.
with open("/content/drive/MyDrive/Dataset/word_index.json", "r", encoding="utf-8") as f:
    word_index = json.load(f)

In [None]:
def get_topic_preferences(title):
  """Score a piece of news text against every topic in ``features``.

  The text is cleaned with ``clean_text``, encoded through ``word_index``
  (tokens missing from the vocabulary are skipped), padded to ``MAXLEN``
  and passed to ``model``. The best-scoring topic is printed as a side
  effect.

  Args:
    title: Raw news text or title to classify.

  Returns:
    dict mapping each name in ``features`` to the model output for it.
  """
  # Use the argument. The previous version read the notebook-level
  # ``news_text`` here, so every call scored the same text regardless of
  # what was passed in.
  tokens = clean_text(title).split()
  encoded_text = [word_index[word] for word in tokens if word in word_index]
  padded_text = pad_sequences([encoded_text], maxlen=MAXLEN, padding=PADDING)
  # The batch holds a single sequence, so take its only row of scores.
  scores = model.predict(padded_text)[0]

  category_prob = dict(zip(features, scores))

  topic = features[np.argmax(scores)]
  print(topic)
  return category_prob

In [None]:
# Example input text (Indonesian) used throughout the rest of the notebook.
news_text = "Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan"

# Per-topic scores for the example; reused later when building user_profile.
topic_preferences = get_topic_preferences(news_text)
topic_preferences

Finance


{'Finance': 0.6435556,
 'Gaming': 0.0011855286,
 'Entertainment': 0.002206102,
 'Health': 0.22767814,
 'Sports': 0.065193415,
 'Technology': 0.060181238}

In [None]:
# Inspect what clean_text produces for the example text.
clean_text(news_text)

'ekonomi indonesia tumbuh bangga'

## User Preference

In [None]:
# Columns of authors_df that make up the feature space for content-based
# filtering; the user profile is built in this same column order.
features_cbf = ['Copywriting','Design','Nasional','Internasional','Health','Finance','Technology','Gaming','Sports','Entertainment']

In [None]:
# User input: feature names requested explicitly. Each one is given a weight
# of 1 when user_profile is built.
user_inputs = ['Copywriting','Nasional']

In [None]:
# Build a single-row profile over features_cbf: explicitly requested features
# get weight 1, predicted topics get their model score, everything else 0.
user_profile = np.array([[
    1 if name in user_inputs else topic_preferences.get(name, 0)
    for name in features_cbf
]])
user_profile

array([[1.        , 0.        , 1.        , 0.        , 0.22767814,
        0.64355558, 0.06018124, 0.00118553, 0.06519341, 0.0022061 ]])

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale each author feature column to the [0, 1] range so the columns are
# comparable with the user profile.
scaler = MinMaxScaler().fit(authors_df[features_cbf])
author_features = scaler.transform(authors_df[features_cbf])
author_features

array([[0.66666667, 0.25      , 0.66666667, 1.        , 1.        ,
        0.4       , 0.6       , 0.125     , 0.        , 0.1       ],
       [0.66666667, 0.5       , 1.        , 0.5       , 0.        ,
        0.2       , 0.6       , 0.75      , 1.        , 0.6       ],
       [0.66666667, 0.5       , 0.66666667, 0.        , 0.5       ,
        0.8       , 0.8       , 0.25      , 0.4       , 0.6       ],
       [0.66666667, 1.        , 0.66666667, 0.5       , 0.5       ,
        1.        , 0.8       , 0.75      , 0.6       , 1.        ],
       [0.66666667, 0.75      , 0.66666667, 0.5       , 0.75      ,
        0.8       , 0.8       , 0.75      , 0.6       , 0.8       ],
       [0.66666667, 0.25      , 1.        , 0.        , 0.5       ,
        0.9       , 0.4       , 0.25      , 0.2       , 0.4       ],
       [1.        , 1.        , 1.        , 0.5       , 0.5       ,
        0.9       , 0.9       , 0.5       , 0.6       , 0.6       ],
       [0.66666667, 0.25      , 1.       

## Calculate Similarity

In [None]:
# Cosine similarity of the single user-profile row against every author row:
# the result has one row (the user) and one column per author.
similarities = cosine_similarity(X=user_profile, Y=author_features)
similarities

array([[0.63064535, 0.58445716, 0.71853921, 0.56842612, 0.59333476,
        0.88178744, 0.71834227, 0.87745114, 0.52168319, 0.57075435,
        0.11377262, 0.39579899, 0.61422781, 0.61665721]])

In [None]:
# Author row positions ordered from most to least similar: argsort the only
# row of the similarity matrix ascending, then reverse it.
recommended_items = np.argsort(similarities[0])[::-1]
recommended_items

array([ 5,  7,  2,  6,  0, 13, 12,  4,  1,  9,  3,  8, 11, 10])

In [None]:
# Print the recommended authors, best match first.
print("Recommended:")
for item in recommended_items:
    # recommended_items holds row positions (from argsort), so look names up
    # positionally with .iloc instead of treating the position as a label.
    print(f"{item}: {authors_df['Name'].iloc[item]}")

Recommended:
5: Risa Pebriyanthi
7: Ni Luh Santi Wahyuni 
2: Ni Nyoman Ayu Sintya Dewi
6: Suci Hastika Salma'aini
0: Aditya
13: Visakha Vidyadevi Wiguna
12: Putu Gede Arya Karna Sampalan
4: Patma Ari Ayu Kartini
1: Andhika Mifta Alauddin
9: Iga Narendra Pramawijaya
3: Dewa Bagus Trima Putra
8: Andre Winata
11: Abiyyu Didar Haq
10: Nyoman Satiya Nanjaya Sadha


In [None]:
# Reorder the author table by similarity rank. recommended_items holds row
# positions, so select positionally: reindex() is label-based and would
# silently insert all-NaN rows if the index were ever not 0..n-1.
new_df = authors_df.iloc[recommended_items]
new_df

Unnamed: 0,Name,Copywriting,Design,Available Days,Nasional,Internasional,Health,Finance,Technology,Gaming,Sports,Entertainment
5,Risa Pebriyanthi,4,2,"Selasa, Sabtu",5,3,3.0,4.5,2.0,1.0,1,2.0
7,Ni Luh Santi Wahyuni,4,2,Rabu,5,3,2.0,4.0,2.0,1.0,1,2.5
2,Ni Nyoman Ayu Sintya Dewi,4,3,"Senin, Selasa, Rabu, Jumat, Sabtu",4,3,3.0,4.0,4.0,1.0,2,3.0
6,Suci Hastika Salma'aini,5,5,"Rabu, Kamis",5,4,3.0,4.5,4.5,2.0,3,3.0
0,Aditya,4,2,"Senin, Selasa, Rabu, Kamis, Jumat, Sabtu, Minggu",4,5,5.0,2.0,3.0,0.5,0,0.5
13,Visakha Vidyadevi Wiguna,5,5,"Sabtu, Minggu",5,4,5.0,0.0,0.0,0.0,2,5.0
12,Putu Gede Arya Karna Sampalan,4,4,"Jumat, Sabtu",4,4,3.0,5.0,4.0,3.0,4,3.0
4,Patma Ari Ayu Kartini,4,4,"Selasa, Jumat",4,4,4.0,4.0,4.0,3.0,3,4.0
1,Andhika Mifta Alauddin,4,3,"Senin, Selasa, Rabu, Kamis, Jumat, Sabtu, Minggu",5,4,1.0,1.0,3.0,3.0,5,3.0
9,Iga Narendra Pramawijaya,4,1,"Sabtu, Minggu",3,4,2.0,4.0,5.0,3.0,3,1.0


In [None]:
# Filter writers by available days, keeping the similarity ordering.
day = "Jumat"
# regex=False matches the day name literally; na=False drops authors with no
# availability recorded instead of producing a NaN mask that fails on indexing.
filtered_df = new_df[new_df['Available Days'].str.contains(day, regex=False, na=False)]
filtered_df

Unnamed: 0,Name,Copywriting,Design,Available Days,Nasional,Internasional,Health,Finance,Technology,Gaming,Sports,Entertainment
2,Ni Nyoman Ayu Sintya Dewi,4,3,"Senin, Selasa, Rabu, Jumat, Sabtu",4,3,3.0,4.0,4.0,1.0,2,3.0
0,Aditya,4,2,"Senin, Selasa, Rabu, Kamis, Jumat, Sabtu, Minggu",4,5,5.0,2.0,3.0,0.5,0,0.5
12,Putu Gede Arya Karna Sampalan,4,4,"Jumat, Sabtu",4,4,3.0,5.0,4.0,3.0,4,3.0
4,Patma Ari Ayu Kartini,4,4,"Selasa, Jumat",4,4,4.0,4.0,4.0,3.0,3,4.0
1,Andhika Mifta Alauddin,4,3,"Senin, Selasa, Rabu, Kamis, Jumat, Sabtu, Minggu",5,4,1.0,1.0,3.0,3.0,5,3.0
3,Dewa Bagus Trima Putra,4,5,"Rabu, Jumat",4,4,3.0,5.0,4.0,3.0,3,5.0
