In [15]:
import pandas as pd
import tensorflow as tf
import numpy as np
import seaborn as sns

# DATASET NEWSROOM

In [16]:
authors_df = pd.read_csv('/content/drive/MyDrive/Dataset/Newsroom_data.csv')

In [17]:
authors_df.head()

Unnamed: 0,Name,Copywriting,Design,Available Days,Nasional,Internasional,Health,Finance,Technology,Gaming,Sports,Entertainment
0,Aditya,2.259769,3.014136,"Senin, Selasa, Rabu, Kamis, Jumat, Sabtu, Minggu",2.824938,3.155594,0.416776,4.741827,3.850363,2.227467,4.595172,3.990902
1,Andhika Mifta Alauddin,1.932117,0.996844,"Jumat, Sabtu, Minggu",1.504586,4.874138,1.941573,2.759315,0.344455,4.4692,1.134803,3.775822
2,Ni Nyoman Ayu Sintya Dewi,0.089198,4.665144,"Senin, Selasa, Rabu, Jumat, Sabtu",0.759569,3.110924,0.428591,1.70763,4.083308,4.602007,3.933427,4.390701
3,Dewa Bagus Trima Putra,1.159456,0.676639,Senin,0.170662,4.661579,4.841004,2.706039,2.772795,0.112621,1.190932,1.988213
4,Patma Ari Ayu Kartini,2.779882,2.468146,"Selasa, Jumat",0.221158,3.525208,4.889649,1.484902,3.54568,2.240629,4.445384,4.9646


# Modeling Content-Based Filtering

In [18]:
!pip install Sastrawi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
# import library
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt

import string
import nltk
import re
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
# Select features for content-based filtering
features = ['Copywriting', 'Design', 'Nasional', 'Internasional', 'Health', 'Finance', 'Technology', 'Gaming', 'Sports', 'Entertainment']
features_skills = ['Health', 'Finance', 'Technology', 'Gaming', 'Sports', 'Entertainment']

In [21]:
# Normalize features
scaler = MinMaxScaler()
authors_df[features] = scaler.fit_transform(authors_df[features])

In [22]:
authors_df

Unnamed: 0,Name,Copywriting,Design,Available Days,Nasional,Internasional,Health,Finance,Technology,Gaming,Sports,Entertainment
0,Aditya,0.481251,0.616095,"Senin, Selasa, Rabu, Kamis, Jumat, Sabtu, Minggu",0.565744,0.5827,0.0,1.0,0.779953,0.471077,0.971995,0.738104
1,Andhika Mifta Alauddin,0.408605,0.14702,"Jumat, Sabtu, Minggu",0.284318,1.0,0.340899,0.528612,0.060912,0.970418,0.142875,0.680254
2,Ni Nyoman Ayu Sintya Dewi,0.0,1.0,"Senin, Selasa, Rabu, Jumat, Sabtu",0.125522,0.571853,0.002642,0.27855,0.827728,1.0,0.813437,0.845638
3,Dewa Bagus Trima Putra,0.237294,0.072563,Senin,0.0,0.948386,0.989124,0.515945,0.55895,0.0,0.156323,0.199441
4,Patma Ari Ayu Kartini,0.596568,0.489138,"Selasa, Jumat",0.010763,0.67245,1.0,0.225591,0.717464,0.474009,0.936105,1.0
5,Risa Pebriyanthi,0.595819,0.655982,"Selasa, Sabtu",0.508522,0.759526,0.953714,0.166345,0.771024,0.566143,0.679667,0.467495
6,Suci Hastika Salma'aini,0.773386,0.906992,"Rabu, Kamis",0.79401,0.02497,0.841711,0.573861,0.388343,0.719664,0.883159,0.100955
7,Ni Luh Santi Wahyuni,0.733215,0.732716,Rabu,0.797656,0.472835,0.573032,0.551989,0.506619,0.801564,0.335033,0.299014
8,Andre Winata,0.759459,0.0,"Senin, Rabu, Minggu",0.772917,0.36287,0.50579,0.0,0.920258,0.809442,0.893347,0.957385
9,Iga Narendra Pramawijaya,0.004349,0.450927,"Selasa, Jumat",0.884148,0.790687,0.346824,0.347393,0.70088,0.077746,0.088472,0.992965


## Predict Topics

In [23]:
def clean_text(text):
  # Remove puncuation
  text = text.translate(string.punctuation)
  # Convert words to lower case and split them
  text = text.lower().split()
  # Remove stop words
  stops = set(stopwords.words("indonesian"))
  text = [w for w in text if not w in stops and len(w) >= 3]
  text = " ".join(text)
  # Clean the text
  text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
  # Stemming
  text = text.split()
  factory = StemmerFactory()
  stemmer = factory.create_stemmer()
  stemmed_words = [stemmer.stem(word) for word in text]
  text = " ".join(stemmed_words)
  return text

def fit_tokenizer(train_sentences, num_words, oov_token):
    tokenizer = Tokenizer(num_words=num_words , oov_token=oov_token)
    tokenizer.fit_on_texts(train_sentences)
    return tokenizer

def seq_and_pad(sentences, tokenizer, padding, maxlen):
    sequences = tokenizer.texts_to_sequences(sentences)
    padded_sequences = pad_sequences(sequences, maxlen=maxlen, padding=padding)
    return padded_sequences

In [24]:
NUM_WORDS = 1000
EMBEDDING_DIM = 100
MAXLEN = 200
PADDING = 'post'
OOV_TOKEN = "<OOV>"
features = ["Finansial", "Gaming", "Hiburan", "Kesehatan", "Olahraga", "Teknologi"]
model = tf.keras.models.load_model('/content/drive/MyDrive/Dataset/predict_topics.h5')

# Function to predict topic based on news title
# def predict_topic(title):
#   clean = clean_text(title)
#   tokenizer = fit_tokenizer(clean, NUM_WORDS, OOV_TOKEN)
#   padded_sequence = seq_and_pad([clean], tokenizer, PADDING, MAXLEN)
#   prediction = model.predict(padded_sequence)
#   topic_index = np.argmax(prediction)
#   topic = features[topic_index]
#   return topic

In [25]:
def get_topic_preferences(title):
    clean = clean_text(title)
    tokenizer = fit_tokenizer(clean, NUM_WORDS, OOV_TOKEN)
    padded_sequence = seq_and_pad([clean], tokenizer, PADDING, MAXLEN)
    predictions = model.predict(padded_sequence)[0]
    category_probabilities = dict(zip(features, predictions))
    return category_probabilities

In [26]:
news_text = "2 Cara Blur WhatsApp Web biar Chat Gak Diintip Orang"

topic_preferences = get_topic_preferences(news_text)



In [27]:
topic_preferences

{'Finansial': 0.1621101,
 'Gaming': 0.07280009,
 'Hiburan': 0.05702267,
 'Kesehatan': 0.14189863,
 'Olahraga': 0.24649891,
 'Teknologi': 0.31966966}

## User Preference

In [41]:
# 'Copywriting', 'Design', 'Nasional', 'Internasional', 'Health', 'Finance', 'Technology', 'Gaming', 'Sports', 'Entertainment'
user_preferences = {
    # 'Copywriting': 0.759459,
    # 'Design': 0.300000,
    # 'Available Days': 'Senin',
    # 'Nasional': 0.772917,
    # 'Internasional': 0.362870,
    'Health': topic_preferences['Kesehatan'],
    'Finance': topic_preferences['Finansial'],
    'Technology': topic_preferences['Teknologi'],
    'Gaming': topic_preferences['Gaming'],
    'Sports': topic_preferences['Olahraga'],
    'Entertainment': topic_preferences['Hiburan']
}

In [42]:
user_df = pd.DataFrame(user_preferences, index=[0])
user_df

Unnamed: 0,Health,Finance,Technology,Gaming,Sports,Entertainment
0,0.141899,0.16211,0.31967,0.0728,0.246499,0.057023


In [36]:
user_df['Available Days']

0    Senin
Name: Available Days, dtype: object

In [39]:
authors_df[features_skills]

Unnamed: 0,Health,Finance,Technology,Gaming,Sports,Entertainment
0,0.0,1.0,0.779953,0.471077,0.971995,0.738104
1,0.340899,0.528612,0.060912,0.970418,0.142875,0.680254
2,0.002642,0.27855,0.827728,1.0,0.813437,0.845638
3,0.989124,0.515945,0.55895,0.0,0.156323,0.199441
4,1.0,0.225591,0.717464,0.474009,0.936105,1.0
5,0.953714,0.166345,0.771024,0.566143,0.679667,0.467495
6,0.841711,0.573861,0.388343,0.719664,0.883159,0.100955
7,0.573032,0.551989,0.506619,0.801564,0.335033,0.299014
8,0.50579,0.0,0.920258,0.809442,0.893347,0.957385
9,0.346824,0.347393,0.70088,0.077746,0.088472,0.992965


## Calculate Similarity

In [43]:
# Calculate cosine similarity between user preferences and items
similarities = cosine_similarity(user_df, authors_df[features_skills])

In [44]:
similarities

array([[0.85522597, 0.47294798, 0.76367456, 0.76143333, 0.81677793,
        0.86709576, 0.83032179, 0.79834912, 0.80432051, 0.6733296 ,
        0.83798232, 0.80277743, 0.6095518 , 0.80830773]])

In [45]:
# Sort items based on similarity scores
recommended_items = np.argsort(similarities)[0][::-1]
recommended_items

array([ 5,  0, 10,  6,  4, 13,  8, 11,  7,  2,  3,  9, 12,  1])

In [46]:
# Print recommended items
print("Recommended Items:")
for item in recommended_items:
    print(f"Item {item}: {authors_df['Name'][item]}")

Recommended Items:
Item 5: Risa Pebriyanthi
Item 0: Aditya
Item 10: Nyoman Satiya Nanjaya Sadha
Item 6: Suci Hastika Salma'aini
Item 4: Patma Ari Ayu Kartini
Item 13: Visakha Vidyadevi Wiguna
Item 8: Andre Winata
Item 11: Abiyyu Didar Haq
Item 7: Ni Luh Santi Wahyuni
Item 2: Ni Nyoman Ayu Sintya Dewi
Item 3: Dewa Bagus Trima Putra
Item 9: Iga Narendra Pramawijaya
Item 12: Putu Gede Arya Karna Sampalan
Item 1: Andhika Mifta Alauddin


In [47]:
lst = list(recommended_items)
lst

[5, 0, 10, 6, 4, 13, 8, 11, 7, 2, 3, 9, 12, 1]

In [48]:
new_df = authors_df.reindex(lst)
new_df

Unnamed: 0,Name,Copywriting,Design,Available Days,Nasional,Internasional,Health,Finance,Technology,Gaming,Sports,Entertainment
5,Risa Pebriyanthi,0.595819,0.655982,"Selasa, Sabtu",0.508522,0.759526,0.953714,0.166345,0.771024,0.566143,0.679667,0.467495
0,Aditya,0.481251,0.616095,"Senin, Selasa, Rabu, Kamis, Jumat, Sabtu, Minggu",0.565744,0.5827,0.0,1.0,0.779953,0.471077,0.971995,0.738104
10,Nyoman Satiya Nanjaya Sadha,0.711196,0.256606,"Selasa, Sabtu",0.839811,0.96172,0.489123,0.180783,0.333294,0.479841,0.500443,0.0
6,Suci Hastika Salma'aini,0.773386,0.906992,"Rabu, Kamis",0.79401,0.02497,0.841711,0.573861,0.388343,0.719664,0.883159,0.100955
4,Patma Ari Ayu Kartini,0.596568,0.489138,"Selasa, Jumat",0.010763,0.67245,1.0,0.225591,0.717464,0.474009,0.936105,1.0
13,Visakha Vidyadevi Wiguna,1.0,0.766588,"Senin, Selasa, Rabu, Jumat, Sabtu",0.710912,0.200761,0.221143,0.303667,1.0,0.429598,0.0,0.061798
8,Andre Winata,0.759459,0.0,"Senin, Rabu, Minggu",0.772917,0.36287,0.50579,0.0,0.920258,0.809442,0.893347,0.957385
11,Abiyyu Didar Haq,0.213276,0.10292,"Rabu, Kamis",0.385916,0.596687,0.997325,0.04625,0.659994,0.298301,0.558024,0.584451
7,Ni Luh Santi Wahyuni,0.733215,0.732716,Rabu,0.797656,0.472835,0.573032,0.551989,0.506619,0.801564,0.335033,0.299014
2,Ni Nyoman Ayu Sintya Dewi,0.0,1.0,"Senin, Selasa, Rabu, Jumat, Sabtu",0.125522,0.571853,0.002642,0.27855,0.827728,1.0,0.813437,0.845638


In [67]:
# def calculate_similarity(user_preferences, writers_data):
#     # similarity_scores = []
#     # for i in range(len(writers_data['Name'])):
#     #     writer_scores = []
#     #     for key in user_preferences:
#     #         if key != 'Available Days':
#     #             writer_scores.append(user_preferences[key] * writers_data[key][i])
#     #     similarity_scores.append(sum(writer_scores))
#     similarity_scores = cosine_similarity(user_preferences, writers_data)
#     return similarity_scores

In [49]:
# Filter Writers by Available Days
day = "Senin"
filtered_df = new_df[new_df['Available Days'].str.contains(day)]
filtered_df

Unnamed: 0,Name,Copywriting,Design,Available Days,Nasional,Internasional,Health,Finance,Technology,Gaming,Sports,Entertainment
0,Aditya,0.481251,0.616095,"Senin, Selasa, Rabu, Kamis, Jumat, Sabtu, Minggu",0.565744,0.5827,0.0,1.0,0.779953,0.471077,0.971995,0.738104
13,Visakha Vidyadevi Wiguna,1.0,0.766588,"Senin, Selasa, Rabu, Jumat, Sabtu",0.710912,0.200761,0.221143,0.303667,1.0,0.429598,0.0,0.061798
8,Andre Winata,0.759459,0.0,"Senin, Rabu, Minggu",0.772917,0.36287,0.50579,0.0,0.920258,0.809442,0.893347,0.957385
2,Ni Nyoman Ayu Sintya Dewi,0.0,1.0,"Senin, Selasa, Rabu, Jumat, Sabtu",0.125522,0.571853,0.002642,0.27855,0.827728,1.0,0.813437,0.845638
3,Dewa Bagus Trima Putra,0.237294,0.072563,Senin,0.0,0.948386,0.989124,0.515945,0.55895,0.0,0.156323,0.199441
12,Putu Gede Arya Karna Sampalan,0.803508,0.350327,"Senin, Rabu, Minggu",1.0,0.0,0.001842,0.70889,0.0,0.882789,1.0,0.546789


## Recommend Writers

In [68]:
# def writers_recommendation(user_preferences, writers_data, top_n=3):
#     filtered_writers = filter_writers(user_preferences, writers_data)
#     similarity_scores = calculate_similarity(user_preferences, writers_data)

#     # Sort writers based on similarity scores
#     sorted_writers = sorted(zip(filtered_writers, similarity_scores), key=lambda x: x[1], reverse=True)

#     # Get top N recommended writers
#     top_writers = sorted_writers[:top_n]

#     return top_writers

In [69]:
# writers_recommendation(user_df, authors_df, 3)