In [1]:
import pandas as pd
import sqlite3 as sql
from datetime import date, datetime
import numpy as np
import os

from sentence_transformers import SentenceTransformer, util
import torch

In [2]:
def calculate_cosine(model, target_sentence, target_df):
  """Score a target text against the ``content`` column of ``target_df``.

  Args:
    model: Object exposing ``encode(texts, convert_to_tensor=True)``; the
      notebook passes a ``SentenceTransformer`` instance.
    target_sentence: Text to embed and compare against every row.
    target_df: DataFrame with an ``id`` column and a ``content`` column.

  Returns:
    Tuple ``(ids, scores)``. ``ids`` is ``target_df["id"]`` as a list and
    ``scores`` is the list of cosine similarities in the same row order.
    ``scores`` is always a list, including when ``target_df`` has one row.
  """
  target_embedding = model.encode(target_sentence, convert_to_tensor=True)
  content_list = list(target_df["content"])
  content_embeddings = model.encode(content_list, convert_to_tensor=True)
  cosine_scores = util.cos_sim(target_embedding, content_embeddings)
  # squeeze() collapses a 1x1 result to a 0-d tensor, whose tolist() is a
  # bare float; atleast_1d restores the one-element list callers concatenate.
  cosine_list = torch.atleast_1d(torch.squeeze(cosine_scores)).tolist()
  return list(target_df["id"]), cosine_list


# Loading Model
# The load is timed with wall-clock timestamps so the cost is visible in the
# cell output.
print("Loading Model......")
l1 = datetime.now()  # timestamp taken just before the model is constructed
model = SentenceTransformer('all-MiniLM-L6-v2')
l2 = datetime.now()  # timestamp taken just after the model is constructed
print(f"Model loaded, Load time {l2-l1}")

# Load the stored similarity matrix into the notebook-wide `df`.
# NOTE(review): frames displayed further down still show an "Unnamed: 0"
# column after reading this file, so it presumably holds a saved index.
df = pd.read_csv("bitesizenews/cosine.csv")

Loading Model......
Model loaded, Load time 0:01:37.639658


In [52]:
# NOTE(review): this reads "cosine.csv" relative to the working directory,
# whereas the other cells read "bitesizenews/cosine.csv" — confirm which file
# is intended before relying on this cell in a top-to-bottom run.
df = pd.read_csv("cosine.csv")

In [53]:
# Display the frame currently bound to `df`.
df

Unnamed: 0,id


In [47]:
# Take the "id" value from the last row of `df` and cast it to an integer.
# This raises IndexError when `df` has no rows.
last_id = df.iloc[-1]["id"].astype("int")
last_id

866

In [54]:
# Read from database
def get_batch(last_id, db_path='bitesizenews/db.sqlite3'):
    """Fetch every article whose id is at most ``last_id``.

    Args:
        last_id: Inclusive upper bound on the article id. It is coerced with
            ``int()`` because sqlite3 cannot bind NumPy integer scalars.
        db_path: SQLite database file to read. Defaults to the path this
            notebook has always used.

    Returns:
        Tuple ``(ids, contents)``: two parallel lists ordered by ascending id.
    """
    conn = sql.connect(db_path)
    try:
        # Bound parameter instead of string formatting; ORDER BY makes the row
        # order deterministic, since callers pair rows with columns by position.
        rows = conn.execute(
            "SELECT id, content FROM backendservice_article "
            "WHERE id <= ? ORDER BY id",
            (int(last_id),),
        ).fetchall()
    finally:
        # The connection is only needed for this one query.
        conn.close()

    ids = [row[0] for row in rows]
    contents = [row[1] for row in rows]
    return ids, contents

In [52]:
# NOTE(review): no visible cell assigns `ids` at top level (it is only a local
# inside get_batch / cosine_calculation), so this display depends on leftover
# kernel state and will fail on a fresh Restart & Run All.
ids

[862, 865, 866]

In [37]:
# Taking dummy content
# NOTE(review): `ids` and `contents` are not assigned at top level by any
# visible cell; this relies on leftover kernel state.
target_id = ids[2]
# The content is wrapped in a one-element list rather than passed as a string.
target_content = [contents[2]]

In [38]:
# Display the frame currently bound to `df`.
df

Unnamed: 0,id,862,865
0,862.0,1.0,
1,865.0,0.185027,1.0


In [12]:
# Preview only: drop() returns a new frame and the result is not assigned,
# so `df` itself keeps its "Unnamed: 0" column after this cell.
df.drop(columns=["Unnamed: 0"])

Unnamed: 0,id,862,865
0,862.0,1.0,


In [86]:
# Reload the saved matrix, discard the written-out index column, and make the
# "id" column integer-typed, then display the result.
df = (
    pd.read_csv("bitesizenews/cosine.csv")
    .drop(columns=["Unnamed: 0"])
    .astype({"id": "int"})
)
df

Unnamed: 0,id,862,865,866
0,862,1.0,,
1,865,0.185027,1.0,
2,866,0.184423,0.085743,1.0


In [81]:
def cosine_calculation(target_id, target_content):
    """Append one article's similarity row to the global ``df`` and save it.

    Uses the notebook-level ``df`` and ``model`` together with ``get_batch``
    and ``calculate_cosine``. If ``target_id`` already has a column, a message
    is printed and nothing is changed. Otherwise a new column and a new row
    are added to ``df`` in place and the frame is written to
    ``bitesizenews/cosine.csv``.

    Args:
        target_id: Id of the article being added.
        target_content: Text of that article.

    Raises:
        ValueError: If the ids fetched from the database do not match the
            score columns already present in ``df``, because the scores are
            placed into the row by position.
    """
    # Check if cosine is in database.
    # Column labels read back from CSV are strings while database ids are
    # ints, so both sides are compared in string form. A direct `in` test
    # would never match and the article would be appended again.
    known_columns = {str(column) for column in df.columns}
    if str(target_id) in known_columns:
        print("Cosine calculation already exists")
        return

    if len(df) > 0:
        # Plain int so the value is safe to hand to any SQL layer.
        last_id = int(df["id"].iloc[-1])
        ids, contents = get_batch(last_id)
    else:
        # Nothing stored yet: the new article is only compared with itself.
        ids, contents = [], []

    # Scores are written positionally, so the fetched ids must line up with
    # the existing score columns one-to-one.
    score_columns = [str(column) for column in df.columns if str(column) != "id"]
    fetched_ids = [str(article_id) for article_id in ids]
    if fetched_ids != score_columns:
        raise ValueError(
            "Stored score columns do not match the article ids read from the "
            f"database: columns={score_columns}, ids={fetched_ids}"
        )

    ids.append(target_id)
    contents.append(target_content)

    print(ids)

    target_df = pd.DataFrame({"id": ids, "content": contents})

    _, cosine = calculate_cosine(model, target_content, target_df)
    if not isinstance(cosine, list):
        # A single comparison may come back as a bare float.
        cosine = [cosine]

    new_row = [target_id] + cosine

    print(new_row)

    # Add the article's own column first so the new row has a slot for its
    # self-similarity; earlier rows keep NaN in that column.
    df[target_id] = np.nan
    df.loc[len(df)] = new_row

    # The index is written as well; the cell that reloads this file drops the
    # resulting "Unnamed: 0" column.
    df.to_csv("bitesizenews/cosine.csv")

In [76]:
# Display the frame currently bound to `df`.
df

Unnamed: 0,id,862,865,866
0,862,1.0,,
1,865,0.185027,1.0,
2,866,0.184423,0.085743,1.0


In [87]:
# Score every article that is newer than the last one stored in `df`.
conn = sql.connect('bitesizenews/db.sqlite3')
try:
    # Plain int: sqlite3 cannot bind NumPy integer scalars as parameters.
    last_id = int(df["id"].iloc[-1])
    # ORDER BY keeps rows in ascending id order, which matters because each
    # article is appended to `df` in the order it is fetched.
    cursor = conn.execute(
        "SELECT id, content FROM backendservice_article WHERE id > ? ORDER BY id",
        (last_id,),
    )

    # Get all contents
    results = cursor.fetchall()
finally:
    # All rows are in memory, so the connection can be released.
    conn.close()

# `article_id` rather than `id`, so the builtin is not shadowed for the rest
# of the kernel session.
for article_id, content in results:
    cosine_calculation(article_id, content)

[862, 865, 866, 867]
[867, 0.3049933910369873, 0.09434103965759277, 0.24876222014427185, 1.0000004768371582]
[862, 865, 866, 867, 868]
[868, 0.13372179865837097, 0.11143714934587479, 0.1498621106147766, 0.051367782056331635, 1.0]
[862, 865, 866, 867, 868, 869]
[869, 0.28122442960739136, 0.10843876004219055, 0.057251688092947006, 0.1883423775434494, 0.20070967078208923, 1.0000003576278687]
[862, 865, 866, 867, 868, 869, 870]
[870, 0.1281472146511078, 0.1374461054801941, 0.34352371096611023, 0.0998057946562767, 0.08480081707239151, 0.04480593651533127, 0.9999997615814209]
[862, 865, 866, 867, 868, 869, 870, 871]
[871, 0.2453959584236145, 0.2018672376871109, 0.183623805642128, 0.21320562064647675, 0.20677629113197327, 0.27872636914253235, 0.1899610459804535, 1.000000238418579]
[862, 865, 866, 867, 868, 869, 870, 871, 872]
[872, 0.02693689987063408, 0.13240642845630646, 0.04227787256240845, 0.12169063091278076, -0.011008333414793015, 0.13152197003364563, 0.07900343090295792, 0.131907254457

[889, 0.1510159820318222, 0.23392227292060852, 0.28156742453575134, 0.2855285704135895, 0.03895879164338112, 0.08249262720346451, 0.24414944648742676, 0.13945993781089783, 0.13404132425785065, -0.04608665779232979, 0.16324234008789062, 0.004081699997186661, 0.0417117103934288, 0.22240179777145386, 0.26850587129592896, 0.26586753129959106, 0.1946299970149994, 0.2810690402984619, 0.11183285713195801, 0.09266482293605804, 0.16170662641525269, 0.22007319331169128, 0.08517077565193176, 0.12674088776111603, 0.279771089553833, 1.0000001192092896]
[862, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890]
[890, 0.2049514353275299, 0.2229272425174713, 0.14363603293895721, 0.19593632221221924, 0.1029953807592392, 0.21024003624916077, 0.08565044403076172, 0.1934959590435028, 0.1635391116142273, 0.06449270248413086, 0.133743017911911, 0.1236434131860733, 0.023356370627880096, 0.20386065542697906, 0.09217868745326996, 0.07

[900, 0.21578317880630493, 0.12754374742507935, 0.6396634578704834, 0.2518681287765503, 0.19910505414009094, 0.11344760656356812, 0.1692315936088562, 0.12553781270980835, -0.01234225183725357, -0.016068046912550926, 0.0771205797791481, -0.035780563950538635, 0.019152743741869926, 0.10098735988140106, 0.5394852161407471, 0.34314435720443726, 0.46295586228370667, 0.2611604928970337, 0.19511854648590088, 0.02380785346031189, 0.3664003312587738, 0.20842993259429932, 0.17077289521694183, 0.03724586218595505, 0.21933013200759888, 0.20476551353931427, 0.1127348318696022, 0.17213720083236694, 0.22195133566856384, 0.046580128371715546, 0.1271606683731079, 0.1572594940662384, 0.1210709735751152, 0.1693722903728485, 0.2275230437517166, -0.01234225183725357, 1.0000004768371582]
[862, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901]
[901, 0.3017449378967285, 0.1077

[908, 0.11123421788215637, 0.12064078450202942, 0.04260334372520447, 0.07423773407936096, -0.09035709500312805, -0.09352141618728638, -0.0006467103958129883, 0.07988840341567993, 0.022044677287340164, -0.01898670755326748, -0.060186464339494705, -0.04563910886645317, 0.03060241788625717, 0.04900302365422249, 0.1376911997795105, 0.09621711075305939, 0.1501753032207489, 0.15995757281780243, -0.0040960125625133514, -0.0061440011486411095, -0.022349651902914047, 0.11275296658277512, 0.03194866329431534, 0.20010903477668762, 0.22967937588691711, 0.10352681577205658, 0.010148227214813232, -0.02420242689549923, 0.1291106939315796, 0.17640338838100433, 0.03706338629126549, 0.12550047039985657, 0.13310852646827698, 0.10660279542207718, 0.044940948486328125, 0.022044677287340164, 0.04191381484270096, 0.02948743849992752, -0.01897665113210678, 0.022044677287340164, 0.03577645123004913, -0.023244645446538925, 0.2588001787662506, 0.07442063093185425, 1.0000001192092896]
[862, 865, 866, 867, 868, 86

[915, 0.129940003156662, 0.03847396373748779, 0.0661640614271164, -0.004511252045631409, 0.14069299399852753, 0.2899283766746521, 0.046213313937187195, -0.029949575662612915, 0.03601914271712303, -0.09070812910795212, 0.829289436340332, 0.059167370200157166, 0.003303634002804756, 0.13928678631782532, 0.035601235926151276, -0.01829194650053978, 0.008523840457201004, -0.04716749116778374, 0.027475157752633095, 0.013188900426030159, -0.015513002872467041, 0.08574672788381577, -0.001637912355363369, 0.15228645503520966, 0.0642080307006836, 0.1420196145772934, 0.12688681483268738, 0.1349705159664154, -0.07403934001922607, -0.0019555799663066864, -0.004593957215547562, -0.009233072400093079, 0.06596997380256653, -0.05421220511198044, 0.0654139369726181, 0.03601914271712303, 0.012490712106227875, 0.0660218670964241, 0.08975400775671005, 0.03601914271712303, 0.0339273139834404, 0.15917423367500305, 0.08059585839509964, 0.07574818283319473, -0.03678988292813301, 0.03838516026735306, 0.004232348

[921, -0.004821963608264923, 0.0015366319566965103, 0.09017038345336914, -0.05536548048257828, 0.025772154331207275, 0.039790403097867966, -0.03749832883477211, 0.10917481780052185, -0.02990611083805561, 0.10235162824392319, -0.04939406365156174, -0.02482972852885723, 0.10851170867681503, -0.032824721187353134, 0.12649157643318176, 0.05160653591156006, 0.1997402310371399, -0.06302361190319061, -0.01963764801621437, -0.056560926139354706, 0.21901309490203857, 0.09767788648605347, 0.22410336136817932, -0.020054567605257034, 0.01121751219034195, -0.04439833387732506, -0.026300929486751556, 0.020496122539043427, 0.0866822898387909, -0.06115318089723587, 0.004697860684245825, -0.0248822383582592, -0.06424402445554733, 0.0037648603320121765, -0.1366572082042694, -0.02990611083805561, 0.19511999189853668, 0.14318349957466125, 0.1448804885149002, -0.02990611083805561, 0.0909506231546402, 0.005711941048502922, 0.03184216469526291, 0.12742801010608673, 0.0279405415058136, 0.09806191921234131, 0.

[927, 0.27883854508399963, 0.1795160472393036, 0.22792388498783112, 0.25446397066116333, 0.20176514983177185, 0.0555449016392231, 0.22849859297275543, 0.2481461763381958, 0.04569263011217117, -0.04826759919524193, 0.1397085338830948, 0.07428056001663208, 0.0524883046746254, 0.1778249740600586, 0.3255797326564789, 0.2928623557090759, 0.2796410322189331, 0.3361567258834839, 0.16485634446144104, 0.1705639660358429, 0.25081178545951843, 0.35548561811447144, 0.05933336913585663, 0.1538916528224945, 0.3351694941520691, 0.19171860814094543, 0.24421720206737518, 0.11413522809743881, 0.30014699697494507, 0.20624136924743652, 0.22514423727989197, 0.25043028593063354, 0.24805888533592224, 0.35340821743011475, 0.3293226361274719, 0.04569263011217117, 0.22091355919837952, 0.10572998225688934, 0.13256709277629852, 0.04569263011217117, 0.07171164453029633, -0.03969341516494751, -0.03390984982252121, 0.19501450657844543, 0.1399935781955719, 0.07368151843547821, 0.18710961937904358, -0.0635440498590469

[932, 0.11878494918346405, 0.07586277276277542, 0.17654365301132202, -0.006778668612241745, 0.017446454614400864, 0.006400919985026121, 0.086273193359375, 0.08880419284105301, 0.05181959643959999, 0.20493006706237793, -0.003446991555392742, -0.008772198110818863, 0.06994984298944473, 0.029692813754081726, 0.1743095964193344, 0.03654997795820236, 0.2225160002708435, 0.00839313305914402, 0.07028043270111084, -0.017379779368638992, 0.05128144472837448, 0.07935508340597153, 0.15295809507369995, 0.09452798217535019, 0.024716027081012726, 0.08499184995889664, -0.01513623632490635, 0.06983785331249237, 0.11513213068246841, -0.03007315658032894, -0.053612541407346725, 0.06646303832530975, -0.04895336180925369, 0.1766682267189026, 0.06276704370975494, 0.05181959643959999, 0.14358624815940857, 0.291368305683136, 0.10781759023666382, 0.05181959643959999, 0.11446529626846313, 0.03304091840982437, 0.03442223370075226, 0.1425997018814087, -0.06295517086982727, 0.0465877391397953, -0.0197773464024066

[937, 0.1976119875907898, 0.022529426962137222, 0.04430708289146423, 0.19332724809646606, -0.02834322303533554, -0.005133012309670448, 0.030329326167702675, -0.012332126498222351, 0.017605578526854515, -0.06329944729804993, 0.08890320360660553, -0.033678486943244934, 0.06052510812878609, 0.1497100591659546, 0.1294393241405487, 0.12826691567897797, 0.050340794026851654, 0.02313455566763878, 0.06443628668785095, 0.19613119959831238, 0.03576710447669029, 0.06767813861370087, 0.08425925672054291, 0.23839621245861053, 0.22604236006736755, 0.06086277961730957, 0.027027739211916924, 0.018314065411686897, 0.05997662991285324, 0.10201789438724518, 0.06907863169908524, -0.01507941260933876, 0.1353175938129425, 0.03785799816250801, 0.02999519370496273, 0.017605578526854515, 0.05380682647228241, -0.021579604595899582, -0.02155718207359314, 0.017605578526854515, 0.012110607698559761, -0.07059241831302643, 0.16748881340026855, 0.1192474514245987, 0.5014763474464417, 0.26833391189575195, 0.1522731781

In [84]:
# Display the frame currently bound to `df`.
# NOTE(review): the recorded output of this cell shows repeated score columns
# (862.1, 865.1, 866.1) and a second row for id 862, i.e. already-stored
# articles were appended again — presumably from an earlier run; verify the
# saved CSV before reusing it.
df

Unnamed: 0,id,862,865,866,862.1,865.1,866.1,867,868,869
0,862.0,1.0,,,,,,,,
1,865.0,0.185027,1.0,,,,,,,
2,866.0,0.184423,0.085743,1.0,,,,,,
3,862.0,1.0,0.185027,0.184423,1.0,,,,,
