In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import requests

# Render all matplotlib text with the 'Malgun Gothic' font family.
# NOTE(review): presumably chosen so Korean labels display correctly; the font
# must be installed on the machine running the notebook — confirm for non-Windows setups.
plt.rcParams['font.family'] = 'Malgun Gothic'

In [3]:
# Toy feature matrix: four rows (items), three numeric columns (features).
data = np.array(
    [
        [1, 7, 2],
        [1, 2, 4],
        [0, 8, 3],
        [2, 0, 3],
    ]
)
# Wrap the array in a DataFrame; row and column labels default to 0..n-1.
data_df = pd.DataFrame(data=data)

In [4]:
# Display the 4x3 toy DataFrame to confirm its contents.
data_df

Unnamed: 0,0,1,2
0,1,7,2
1,1,2,4
2,0,8,3
3,2,0,3


# 코사인 유사도

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of rows in data_df.
# Entry [i, j] compares row i with row j, so the matrix is symmetric.
data_sim = cosine_similarity(X=data_df, Y=data_df)
print(data_sim)

[[1.         0.68300095 0.98749153 0.30194054]
 [0.68300095 1.         0.71513322 0.84731855]
 [0.98749153 0.71513322 1.         0.29215236]
 [0.30194054 0.84731855 0.29215236 1.        ]]


In [7]:
# Rank each row's columns from most to least similar: argsort sorts ascending,
# so the column order is reversed to put the highest similarity first.
print(np.argsort(data_sim, axis=1)[:, ::-1])

[[0 2 1 3]
 [1 3 2 0]
 [2 0 1 3]
 [3 1 0 2]]


In [8]:
# Manually lower the similarity between rows 0 and 1 to see how the ranking reacts.
# Both mirrored entries are written so the matrix stays symmetric.
# Note: this mutates data_sim in place, so re-running the earlier ranking cell
# now shows the modified values.
data_sim[0, 1] = data_sim[1, 0] = 0.22
data_sim

array([[1.        , 0.22      , 0.98749153, 0.30194054],
       [0.22      , 1.        , 0.71513322, 0.84731855],
       [0.98749153, 0.71513322, 1.        , 0.29215236],
       [0.30194054, 0.84731855, 0.29215236, 1.        ]])

In [9]:
# Re-rank after the manual override: per row, column indices ordered from
# highest to lowest similarity (ascending argsort, reversed).
print(np.argsort(data_sim, axis=1)[:, ::-1])

[[0 2 3 1]
 [1 3 2 0]
 [2 0 1 3]
 [3 1 0 2]]


In [20]:
# Query vector for a single user, using the same three features as data_df.
user = [1, 7, 2]
# Nest the list to get one row of shape (1, 3); despite the name, this is an
# ndarray rather than a DataFrame.
user_df = np.array([user])
# Cosine similarity of the query row against every row of data_df -> shape (1, 4).
user_sim = cosine_similarity(X=pd.DataFrame(user_df), Y=data_df)
# Row indices of data_df ordered from most to least similar to the query.
print(np.argsort(user_sim, axis=1)[:, ::-1])

[[0 2 1 3]]


# 유클리디안 거리

In [22]:
from sklearn.metrics.pairwise import euclidean_distances

# Pairwise Euclidean distance between the rows of data_df; entry [i, j] is the
# distance between row i and row j.
data_sim_d = euclidean_distances(X=data_df, Y=data_df)
print(data_sim_d)

[[0.         5.38516481 1.73205081 7.14142843]
 [5.38516481 0.         6.164414   2.44948974]
 [1.73205081 6.164414   0.         8.24621125]
 [7.14142843 2.44948974 8.24621125 0.        ]]


In [23]:
# For distances, smaller means closer, so the ascending argsort already lists
# each row's nearest rows first (no reversal needed, unlike cosine similarity).
print(np.argsort(data_sim_d, axis=1))

[[0 2 1 3]
 [1 3 0 2]
 [2 0 1 3]
 [3 1 0 2]]


# DB에서 불러오기

In [26]:
import requests
import pandas as pd

# Fetch the similar-corporation records for corporate id 1 from the local API
# and load the JSON payload into a DataFrame.
# - timeout: without it, requests waits indefinitely if the server stops responding.
# - raise_for_status(): surfaces 4xx/5xx responses as an HTTPError instead of
#   silently building a DataFrame from an error body.
# The API server must be running on 127.0.0.1:8000; a refused connection still
# raises requests.exceptions.ConnectionError.
response = requests.get('http://127.0.0.1:8000/api/corporates/1/similarcorp/', timeout=10)
response.raise_for_status()
res = pd.DataFrame(response.json())
# Keep only the three rating columns used as features for the distance computation.
# NOTE(review): presumably numeric E/S/G rating scores — confirm against the API schema.
res = res[['E_rating', 'S_rating', 'G_rating']]
res

ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=8000): Max retries exceeded with url: /api/corporates/1/similarcorp/ (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001E2F0733400>: Failed to establish a new connection: [WinError 10061] 대상 컴퓨터에서 연결을 거부했으므로 연결하지 못했습니다'))

In [11]:
from sklearn.metrics.pairwise import euclidean_distances

# Pairwise Euclidean distances between all rows of `res`.
# Note: this rebinds `data_sim`, which held cosine similarities in the earlier
# cells; from here on it holds distances, so smaller values mean more similar.
data_sim = euclidean_distances(X=res, Y=res)
data_sim

array([[ 0.        ,  8.73762045, 17.4462324 , ..., 81.73538714,
        82.80422978, 78.41659518],
       [ 8.73762045,  0.        , 19.88571689, ..., 78.31332291,
        81.07222474, 76.15560744],
       [17.4462324 , 19.88571689,  0.        , ..., 90.8909915 ,
        92.04180813, 89.56288978],
       ...,
       [81.73538714, 78.31332291, 90.8909915 , ...,  0.        ,
        16.58304363, 15.09230856],
       [82.80422978, 81.07222474, 92.04180813, ..., 16.58304363,
         0.        , 13.71546582],
       [78.41659518, 76.15560744, 89.56288978, ..., 15.09230856,
        13.71546582,  0.        ]])

In [24]:
# Per row, the indices of all rows ordered by increasing distance.
np.argsort(data_sim, axis=1)

array([[  0,  26,   1, ..., 179, 161, 145],
       [  1,  69,  42, ..., 179, 145, 161],
       [  2,  22, 125, ..., 179, 161, 145],
       ...,
       [197, 178, 133, ...,  32,  30, 157],
       [198, 183, 185, ...,  19,  32,  99],
       [199, 142, 162, ..., 157,  99, 119]], dtype=int64)

In [21]:
# Keep positions 1-3 of each row's ascending distance ordering, i.e. the three
# nearest rows. Position 0 is skipped because it is normally the row itself
# (self-distance is 0).
np.argsort(data_sim, axis=1)[:, 1:4]

array([[ 26,   1,  17],
       [ 69,  42,  26],
       [ 22, 125,  27],
       [163, 106,  82],
       [170, 187,  51],
       [ 58,  42,   1],
       [188,  57,  78],
       [ 12, 121, 117],
       [117, 121, 190],
       [193,  16, 149],
       [ 53, 146,  86],
       [ 47, 175, 173],
       [  7, 121,  59],
       [105, 111, 109],
       [ 63,  77, 167],
       [126, 143, 152],
       [149, 122,   9],
       [ 26,  69,   1],
       [ 29,  44, 120],
       [ 29, 120,  99],
       [ 56,  41,  73],
       [ 96,  86,  39],
       [ 27,   2, 125],
       [167, 195,  36],
       [  9, 193, 122],
       [192, 119, 148],
       [ 17,   0,   1],
       [ 22, 117,   2],
       [113, 106, 153],
       [ 18,  19, 120],
       [  3,   0,  72],
       [ 43, 138, 168],
       [ 99,  41,  42],
       [ 36, 198, 183],
       [ 71, 123,  80],
       [ 37, 125,  72],
       [ 33, 198, 183],
       [ 72,  35, 125],
       [137, 146, 115],
       [ 50, 146,  92],
       [ 80,  98,  55],
       [ 37,  20

In [16]:
# Sanity check: confirm the distance matrix is a plain NumPy array
# (so it has to be wrapped explicitly to get a DataFrame).
type(data_sim)

numpy.ndarray

In [31]:
# Build a table of the three nearest rows for every row, one column per rank.
#
# The diagonal (self-distance) is masked with +inf on a float copy, so a row can
# never be selected as its own neighbour. Slicing [:, 1:4] of the raw argsort
# only works when the row itself sorts into column 0, which is not guaranteed
# when another row lies at distance 0 (e.g. identical ratings): the row could
# then appear in its own neighbour list and a genuine neighbour would be dropped.
# The original data_sim is left untouched.
dist_no_self = data_sim.astype(float)
np.fill_diagonal(dist_no_self, np.inf)
sim = pd.DataFrame(dist_no_self.argsort()[:, :3], columns=['first', 'second', 'third'])
# Shift the 0-based row positions to 1-based values for display.
# NOTE(review): presumably meant to match database ids that start at 1 and are
# contiguous — confirm, otherwise map positions to ids explicitly.
sim + 1

Unnamed: 0,first,second,third
0,27,2,18
1,70,43,27
2,23,126,28
3,164,107,83
4,171,188,52
...,...,...,...
195,24,168,161
196,58,83,167
197,179,134,170
198,184,186,34


In [30]:
# Fetch the similarity payload from the local API and keep the parsed JSON.
# - timeout: prevents the cell from hanging if the server is unresponsive.
# - raise_for_status(): raises on 4xx/5xx instead of parsing an error body.
# Note: this rebinds `res`, which earlier cells used for the ratings DataFrame.
response = requests.get('http://127.0.0.1:8000/api/corporates/similarity/', timeout=10)
response.raise_for_status()
res = response.json()
res

{'id': [1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  145,
  146,
  147,
  148,
  149,
  150,
  151,
  152,
  153,
  154,
  155,
  156,
  157,
  1

In [5]:
# Fetch the similarity payload from the local API and load it into a DataFrame.
# - timeout: prevents the cell from hanging if the server is unresponsive.
# - raise_for_status(): raises on 4xx/5xx instead of building a frame from an
#   error body.
response = requests.get('http://127.0.0.1:8000/api/corporates/similarity/', timeout=10)
response.raise_for_status()
res = pd.DataFrame(response.json())
res

Unnamed: 0,first,second,third
0,27,2,18
1,70,43,27
2,23,126,28
3,164,107,83
4,171,188,52
...,...,...,...
195,24,168,161
196,58,83,167
197,179,134,170
198,184,186,34
