In [None]:
# Reading dataset
import pandas as pd

df = pd.read_csv('../models/news.tsv', header=None, sep="\t")

df.columns=['id',
"category",
"subcategory",
"title",
"abstract",
"url",
"title_entities",
"abstract_entities "]

In [229]:
# Mengambil 1000 data acak
df_sampled = df.sample(n=1000, random_state=42)  # Anda dapat mengganti nilai random_state sesuai kebutuhan

# Menyimpan dataset baru
df_sampled.to_csv('sampled_dataset.csv', index=False)

In [230]:
df_sampled.head()

Unnamed: 0,id,category,subcategory,title,abstract,url,title_entities,abstract_entities
36810,N7433,news,newsus,The 3 best Greek spots in Aurora,Hoodline crunched the numbers to find the top ...,https://assets.msn.com/labs/mind/BBWHLdc.html,[],[]
24447,N43326,sports,more_sports,Officials investigating stabbing death of Alex...,,https://assets.msn.com/labs/mind/AAJD8WV.html,"[{""Label"": ""Prison"", ""Type"": ""C"", ""WikidataId""...",[]
23187,N45745,finance,finance-top-stocks,"SmileDirectClub tanks on California bill, brin...",Shares of SmileDirectClub sank to a new all-ti...,https://assets.msn.com/labs/mind/AAILU2e.html,"[{""Label"": ""SmileDirectClub"", ""Type"": ""N"", ""Wi...","[{""Label"": ""SmileDirectClub"", ""Type"": ""N"", ""Wi..."
22077,N5234,travel,traveltips,The world's 50 most reliable airlines,No one wants to kick off their holiday with de...,https://assets.msn.com/labs/mind/AAJub6N.html,[],"[{""Label"": ""Get Going"", ""Type"": ""N"", ""Wikidata..."
32260,N58530,sports,basketball_ncaa,High school football: Breaking down the Class ...,The Class 3A football playoffs begin Friday ac...,https://assets.msn.com/labs/mind/BBWAERc.html,"[{""Label"": ""South African Class 3A 4-8-2"", ""Ty...","[{""Label"": ""Heritage Hall School"", ""Type"": ""F""..."


In [231]:
df_sampled['corpus'] = df_sampled[['title', 'abstract']].apply(lambda x: ' '.join(map(str, x)), axis=1)


In [4]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [5]:
preprocessor = hub.KerasLayer(
    "../models/bert_preprocessor",)
encoder = hub.KerasLayer(
    "../models/bert_encoder",
    trainable=True)


def get_bert_embeddings(text, preprocessor, encoder):
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
    encoder_inputs = preprocessor(text_input)
    outputs = encoder(encoder_inputs)
    embedding_model = tf.keras.Model(text_input, outputs['pooled_output'])
    embedding_model.compile(optimizer='adam', loss='mse')
    sentences = tf.constant([text])
    emb = embedding_model(sentences).numpy()
    print(emb)
    return emb


: 

In [None]:
df_sampled['encodings'] = df_sampled['corpus'].apply(lambda x: get_bert_embeddings(x, preprocessor, encoder))


In [238]:
df_sampled.head()

Unnamed: 0,id,category,subcategory,title,abstract,url,title_entities,abstract_entities,corpus,encodings
36810,N7433,news,newsus,The 3 best Greek spots in Aurora,Hoodline crunched the numbers to find the top ...,https://assets.msn.com/labs/mind/BBWHLdc.html,[],[],The 3 best Greek spots in Aurora Hoodline crun...,"[[-0.88768613, 0.9936917, -0.012753431, -0.927..."
24447,N43326,sports,more_sports,Officials investigating stabbing death of Alex...,,https://assets.msn.com/labs/mind/AAJD8WV.html,"[{""Label"": ""Prison"", ""Type"": ""C"", ""WikidataId""...",[],Officials investigating stabbing death of Alex...,"[[-0.71360165, 0.9078096, 0.031543188, -0.9471..."
23187,N45745,finance,finance-top-stocks,"SmileDirectClub tanks on California bill, brin...",Shares of SmileDirectClub sank to a new all-ti...,https://assets.msn.com/labs/mind/AAILU2e.html,"[{""Label"": ""SmileDirectClub"", ""Type"": ""N"", ""Wi...","[{""Label"": ""SmileDirectClub"", ""Type"": ""N"", ""Wi...","SmileDirectClub tanks on California bill, brin...","[[-0.69691885, 0.99604875, 0.055066865, -0.982..."
22077,N5234,travel,traveltips,The world's 50 most reliable airlines,No one wants to kick off their holiday with de...,https://assets.msn.com/labs/mind/AAJub6N.html,[],"[{""Label"": ""Get Going"", ""Type"": ""N"", ""Wikidata...",The world's 50 most reliable airlines No one w...,"[[-0.5948372, 0.9595088, 0.006537994, -0.86174..."
32260,N58530,sports,basketball_ncaa,High school football: Breaking down the Class ...,The Class 3A football playoffs begin Friday ac...,https://assets.msn.com/labs/mind/BBWAERc.html,"[{""Label"": ""South African Class 3A 4-8-2"", ""Ty...","[{""Label"": ""Heritage Hall School"", ""Type"": ""F""...",High school football: Breaking down the Class ...,"[[-0.7885393, 0.91186565, 0.008350249, -0.9251..."


In [239]:
df_sampled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 36810 to 18618
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  1000 non-null   object
 1   category            1000 non-null   object
 2   subcategory         1000 non-null   object
 3   title               1000 non-null   object
 4   abstract            954 non-null    object
 5   url                 1000 non-null   object
 6   title_entities      1000 non-null   object
 7   abstract_entities   1000 non-null   object
 8   corpus              1000 non-null   object
 9   encodings           1000 non-null   object
dtypes: object(10)
memory usage: 85.9+ KB


In [244]:
import re
from sklearn import metrics

def preprocess_text():
  text = input()
  text = text.lower()
  text = re.sub('[^A-Za-z0-9]+', ' ', text)
  return text
  
query_text = preprocess_text()
query_encoding = get_bert_embeddings(query_text, preprocessor, encoder)

df_sampled['similarity_score'] = df_sampled['encodings'].apply(lambda x: metrics.pairwise.cosine_similarity(x, query_encoding)[0][0])
df_results = df_sampled.sort_values(by=['similarity_score'], ascending=False)

 foodball


[[-0.8863599   0.9589162  -0.03594382 -0.98949367 -0.9884794  -0.3459113
  -0.8367941   0.08099159  0.86258954 -0.9053912  -0.9294881   0.30097944
   0.77174187  0.98076636 -0.00978982  0.9272214  -0.9963344   0.02808348
  -0.8501529   0.91581637 -0.86578435 -0.7428405  -0.7980818   0.9396636
  -0.98776275  0.8862722  -0.17799507 -0.99847865 -0.8128746  -0.98041326
   0.9649863   0.9654179   0.99694437  0.9335473   0.5981218  -0.01160499
  -0.9704506  -0.66767526  0.469542   -0.23900217 -0.97021896 -0.8280334
   0.14835533  0.99970907 -0.75166255  0.04118969 -0.7701593  -0.76283234
   0.9723812   0.9948737  -0.72955745 -0.97511554  0.44438463  0.42426747
  -0.02824351 -0.90026426  0.22260025  0.17239353 -0.83641386  0.95690715
  -0.9987625  -0.930684   -0.10681489 -0.07957909 -0.7871317   0.9832969
   0.04694906 -0.9642784  -0.02843418 -0.8033848  -0.9945915   0.04037812
  -0.78310126 -0.03518964  0.99929035  0.23422825  0.00609834 -0.19585001
   0.5542175   0.10316495  0.7932266   0.9

In [245]:
df_results.head()

Unnamed: 0,id,category,subcategory,title,abstract,url,title_entities,abstract_entities,corpus,encodings,similarity_score
33344,N38838,sports,more_sports,"Fantasy Football: Start 'em, sit 'em for Week 11",Questionable fantasy football advice every week,https://assets.msn.com/labs/mind/BBWH0Id.html,[],[],"Fantasy Football: Start 'em, sit 'em for Week ...","[[-0.8269854, 0.92667043, -0.009696479, -0.908...",0.938967
5297,N57287,sports,football_ncaa,Pigskin Poll: How will the Cleveland Browns fi...,,https://assets.msn.com/labs/mind/AAJATJG.html,"[{""Label"": ""Cleveland Browns"", ""Type"": ""O"", ""W...",[],Pigskin Poll: How will the Cleveland Browns fi...,"[[-0.925933, 0.93152684, -0.029294569, -0.9728...",0.933846
7294,N5217,sports,football_nfl,Patriots release Josh Gordon: Should Dolphins ...,,https://assets.msn.com/labs/mind/AAJF1VI.html,"[{""Label"": ""Josh Gordon"", ""Type"": ""P"", ""Wikida...",[],Patriots release Josh Gordon: Should Dolphins ...,"[[-0.9573617, 0.9234477, -0.028576326, -0.8975...",0.932811
40462,N36757,sports,basketball_nba,"De'Aaron Fox suffers ankle sprain, per report",Ugh,https://assets.msn.com/labs/mind/BBWBDYl.html,"[{""Label"": ""De'Aaron Fox"", ""Type"": ""P"", ""Wikid...",[],"De'Aaron Fox suffers ankle sprain, per report Ugh","[[-0.8546091, 0.9091533, 0.036464766, -0.96581...",0.931382
31201,N57415,lifestyle,lifestylelovesex,"17 Winter Date Ideas That Will Melt Your Cold,...",*Calls cuff right now*,https://assets.msn.com/labs/mind/BBPSDiZ.html,[],[],"17 Winter Date Ideas That Will Melt Your Cold,...","[[-0.98657054, 0.9769207, -0.19236892, -0.9693...",0.929504


In [242]:
def hitung_mrr(df_results):
    indeks_relevan = df_results[df_results['similarity_score'] > 0]['similarity_score'].index
    reciprocal_ranks = []

    for idx in indeks_relevan:
        peringkat = df_results.index.get_loc(idx) + 1
        reciprocal_rank = 1 / peringkat
        reciprocal_ranks.append(reciprocal_rank)

    if reciprocal_ranks:
        mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)
        return mrr
    else:
        return 0.0

skor_mrr = hitung_mrr(df_results)
print(f"Mean Reciprocal Rank (MRR): {skor_mrr}")

Mean Reciprocal Rank (MRR): 0.007485470860550343


In [250]:
df_sampled.encodings

36810    [[-0.88768613, 0.9936917, -0.012753431, -0.927...
24447    [[-0.71360165, 0.9078096, 0.031543188, -0.9471...
23187    [[-0.69691885, 0.99604875, 0.055066865, -0.982...
22077    [[-0.5948372, 0.9595088, 0.006537994, -0.86174...
32260    [[-0.7885393, 0.91186565, 0.008350249, -0.9251...
                               ...                        
22605    [[-0.81792533, 0.98405755, -0.09422249, -0.929...
6234     [[-0.8103939, 0.9874413, 0.15309896, -0.889150...
21726    [[-0.9600377, 0.98605675, -0.0405233, -0.97683...
21504    [[-0.38789156, 0.9881241, -0.017219935, -0.910...
18618    [[-0.8083892, 0.9927196, 0.08153918, -0.886835...
Name: encodings, Length: 1000, dtype: object

In [1]:
!pip show tensorflow


Name: tensorflow
Version: 2.13.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: /opt/conda/lib/python3.10/site-packages
Requires: absl-py, astunparse, flatbuffers, gast, google-pasta, grpcio, h5py, keras, libclang, numpy, opt-einsum, packaging, protobuf, setuptools, six, tensorboard, tensorflow-estimator, tensorflow-io-gcs-filesystem, termcolor, typing-extensions, wrapt
Required-by: explainable-ai-sdk, tensorflow-cloud, tensorflow-decision-forests, tensorflow-serving-api, tensorflow-text, tensorflowjs, witwidget
Note: you may need to restart the kernel to use updated packages.


In [251]:
df_sampled.to_csv('output_dataset.csv', index=False)

In [252]:
df = pd.read_csv('/kaggle/working/output_dataset.csv')

In [253]:
df.encodings

0      [[-0.88768613  0.9936917  -0.01275343 -0.92733...
1      [[-7.13601649e-01  9.07809615e-01  3.15431878e...
2      [[-0.69691885  0.99604875  0.05506686 -0.98216...
3      [[-0.5948372   0.9595088   0.00653799 -0.86174...
4      [[-0.7885393   0.91186565  0.00835025 -0.92517...
                             ...                        
995    [[-0.81792533  0.98405755 -0.09422249 -0.92918...
996    [[-0.8103939   0.9874413   0.15309896 -0.88915...
997    [[-0.9600377   0.98605675 -0.0405233  -0.97683...
998    [[-0.38789156  0.9881241  -0.01721993 -0.91077...
999    [[-8.08389187e-01  9.92719591e-01  8.15391764e...
Name: encodings, Length: 1000, dtype: object

In [254]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  1000 non-null   object 
 1   category            1000 non-null   object 
 2   subcategory         1000 non-null   object 
 3   title               1000 non-null   object 
 4   abstract            954 non-null    object 
 5   url                 1000 non-null   object 
 6   title_entities      1000 non-null   object 
 7   abstract_entities   1000 non-null   object 
 8   corpus              1000 non-null   object 
 9   encodings           1000 non-null   object 
 10  similarity_score    1000 non-null   float64
dtypes: float64(1), object(10)
memory usage: 86.1+ KB


In [264]:
df = pd.read_csv('/kaggle/working/output_dataset.csv')

In [265]:
df['encodings'] = df['encodings'].apply(lambda x: np.fromstring(x.strip('[]'), sep=' '))


In [267]:
import re
from sklearn import metrics

def preprocess_text():
  text = input()
  text = text.lower()
  text = re.sub('[^A-Za-z0-9]+', ' ', text)
  return text
  
query_text = preprocess_text()
query_encoding = get_bert_embeddings(query_text, preprocessor, encoder)

df['similarity_score'] = df['encodings'].apply(lambda x: metrics.pairwise.cosine_similarity(x.reshape(1, -1), query_encoding.reshape(1, -1))[0][0])
df_results = df.sort_values(by=['similarity_score'], ascending=False)

 foodball


[[-0.8863599   0.9589162  -0.03594382 -0.98949367 -0.9884794  -0.3459113
  -0.8367941   0.08099159  0.86258954 -0.9053912  -0.9294881   0.30097944
   0.77174187  0.98076636 -0.00978982  0.9272214  -0.9963344   0.02808348
  -0.8501529   0.91581637 -0.86578435 -0.7428405  -0.7980818   0.9396636
  -0.98776275  0.8862722  -0.17799507 -0.99847865 -0.8128746  -0.98041326
   0.9649863   0.9654179   0.99694437  0.9335473   0.5981218  -0.01160499
  -0.9704506  -0.66767526  0.469542   -0.23900217 -0.97021896 -0.8280334
   0.14835533  0.99970907 -0.75166255  0.04118969 -0.7701593  -0.76283234
   0.9723812   0.9948737  -0.72955745 -0.97511554  0.44438463  0.42426747
  -0.02824351 -0.90026426  0.22260025  0.17239353 -0.83641386  0.95690715
  -0.9987625  -0.930684   -0.10681489 -0.07957909 -0.7871317   0.9832969
   0.04694906 -0.9642784  -0.02843418 -0.8033848  -0.9945915   0.04037812
  -0.78310126 -0.03518964  0.99929035  0.23422825  0.00609834 -0.19585001
   0.5542175   0.10316495  0.7932266   0.9

In [150]:
df.encodings[1]

'[[-7.13601649e-01 9.07809615e-01 3.15431878e-02 -9.47138727e-01 -9.52696681e-01 -4.15526032e-01 -7.95670629e-01 -4.93095815e-02 9.17906225e-01 -9.19885993e-01 -7.52385736e-01 6.70269251e-01 1.07117556e-01 9.88221347e-01 -5.00168577e-02 9.62122262e-01 -9.97160256e-01 2.46147096e-01 -9.15880799e-01 5.44235826e-01 -8.70210350e-01 -4.15583313e-01 -9.54319179e-01 8.82264972e-01 -9.93970454e-01 9.25391078e-01 7.78293073e-01 -9.89409089e-01 3.11202884e-01 -9.79765892e-01 8.71633410e-01 8.78490865e-01 9.92934287e-01 7.27473021e-01 -1.23550981e-01 -1.25989512e-01 -9.97903883e-01 -1.28183916e-01 8.39935541e-01 1.47923842e-01 -8.49081635e-01 -9.47495639e-01 3.97986639e-03 9.95476484e-01 -7.65434325e-01 -4.60150272e-01 -8.96885216e-01 -4.63224351e-01 9.42373991e-01 9.56772149e-01 -9.76478815e-01 -8.15842688e-01 7.63803422e-01 -4.75163817e-01 -1.19623639e-01 -6.27890944e-01 1.08177364e-01 -8.06506258e-03 -9.81450319e-01 9.72026527e-01 -9.95928884e-01 -6.97246492e-02 5.37368536e-01 -2.68909074e-02 

In [268]:
df_results.head()

Unnamed: 0,id,category,subcategory,title,abstract,url,title_entities,abstract_entities,corpus,encodings,similarity_score
555,N38838,sports,more_sports,"Fantasy Football: Start 'em, sit 'em for Week 11",Questionable fantasy football advice every week,https://assets.msn.com/labs/mind/BBWH0Id.html,[],[],"Fantasy Football: Start 'em, sit 'em for Week ...","[-0.8269854, 0.92667043, -0.00969648, -0.90889...",0.938967
650,N57287,sports,football_ncaa,Pigskin Poll: How will the Cleveland Browns fi...,,https://assets.msn.com/labs/mind/AAJATJG.html,"[{""Label"": ""Cleveland Browns"", ""Type"": ""O"", ""W...",[],Pigskin Poll: How will the Cleveland Browns fi...,"[-0.925933, 0.93152684, -0.02929457, -0.972871...",0.933846
634,N5217,sports,football_nfl,Patriots release Josh Gordon: Should Dolphins ...,,https://assets.msn.com/labs/mind/AAJF1VI.html,"[{""Label"": ""Josh Gordon"", ""Type"": ""P"", ""Wikida...",[],Patriots release Josh Gordon: Should Dolphins ...,"[-0.9573617, 0.9234477, -0.02857633, -0.897563...",0.932811
554,N36757,sports,basketball_nba,"De'Aaron Fox suffers ankle sprain, per report",Ugh,https://assets.msn.com/labs/mind/BBWBDYl.html,"[{""Label"": ""De'Aaron Fox"", ""Type"": ""P"", ""Wikid...",[],"De'Aaron Fox suffers ankle sprain, per report Ugh","[-0.8546091, 0.9091533, 0.03646477, -0.9658108...",0.931382
61,N57415,lifestyle,lifestylelovesex,"17 Winter Date Ideas That Will Melt Your Cold,...",*Calls cuff right now*,https://assets.msn.com/labs/mind/BBPSDiZ.html,[],[],"17 Winter Date Ideas That Will Melt Your Cold,...","[-0.98657054, 0.9769207, -0.19236892, -0.96933...",0.929503


In [None]:
preprocessor_path = "/kaggle/working/bert_preprocessor"
encoder_path = "/kaggle/working/bert_encoder"
tf.saved_model.save(preprocessor, preprocessor_path)
tf.saved_model.save(encoder, encoder_path)


In [None]:
df.to_csv('/kaggle/working/similarity_results.csv', index=False)

In [None]:
!kaggle kernels output -q notebook-id -p /kaggle/working


In [None]:
!zip -r output.zip /kaggle/working/


In [None]:
# Menyimpan path model preprocessor dan encoder ke dalam file
with open('/kaggle/working/model_paths.txt', 'w') as file:
    file.write(f"Preprocessor: {preprocessor_path}\n")
    file.write(f"Encoder: {encoder_path}")