In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the Melon song table and preview the first row to check the column layout.
df = pd.read_csv('data/melon_song.csv')
df.head(1)

Unnamed: 0,songId,title,artist,genre,album,lyricist,composer,date,img,comment,like,lyric,plylstSeq
0,418168,희재,성시경,발라드 국내영화,국화꽃 향기 OST,양재선,MGR,20030201.0,https://cdnimg.melon.co.kr/cm/album/images/000...,332,138267,햇살은 우릴 위해 내리고 \n바람도 서롤 감싸게 했죠 \n우리 웃음속에 계절은 오고...,445029956 411111859


In [3]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6247 entries, 0 to 6246
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   songId     6247 non-null   int64  
 1   title      6247 non-null   object 
 2   artist     6247 non-null   object 
 3   genre      6245 non-null   object 
 4   album      6247 non-null   object 
 5   lyricist   5849 non-null   object 
 6   composer   5845 non-null   object 
 7   date       6202 non-null   float64
 8   img        6247 non-null   object 
 9   comment    6247 non-null   int64  
 10  like       6247 non-null   int64  
 11  lyric      6247 non-null   object 
 12  plylstSeq  6126 non-null   object 
dtypes: float64(1), int64(3), object(9)
memory usage: 634.6+ KB


In [4]:
# Replace missing values with '' only in the text (object) columns so they can be
# concatenated as strings later. Numeric columns are left alone: writing '' into
# the float `date` column would turn it into a mixed str/float column.
text_cols = df.select_dtypes(include='object').columns
df[text_cols] = df[text_cols].fillna('')

In [5]:
# Combined engagement count per song: number of comments plus number of likes.
df['comment_like_total'] = df['comment'].add(df['like'])

In [6]:
# Store song ids as strings so they can be looked up by string keys.
df['songId'] = df['songId'].astype(str)

### 형태소 분석과 불용어 처리

In [7]:
# Load the stop-word list; only the first tab-separated field of each line is kept.
# NOTE(review): no explicit encoding is passed, so the platform default is used --
# confirm the file's actual encoding and pin it with `encoding=...`.
with open('data/한글불용어.txt') as st:
    lines = st.readlines()
# strip() removes the trailing newline that a line without a tab would otherwise
# keep; empty results (blank lines) are skipped.
stop_words = [
    word
    for word in (line.split('\t')[0].strip() for line in lines)
    if word
]
# Append extra hand-picked tokens to the list.
stop_words.extend('은 는 를 도 을 며 의 에 게 니 거 로 요 과 래 랑 파 여 에게'.split())

In [8]:
from konlpy.tag import Okt
# Instantiate the Okt tagger once; `okt` is not referenced by any later cell visible here.
okt = Okt()

In [6]:
# Build one text document per song: lyric and title once, then artist, composer,
# lyricist and genre repeated twice each, all separated by single spaces.
song_text = df['lyric'] + (' ' + df['title'])
for doubled_col in ('artist', 'composer', 'lyricist', 'genre'):
    song_text = song_text + (' ' + df[doubled_col]) * 2
df['total'] = song_text

In [7]:
# Round-trip songId through the index: afterwards songId is the leading column and
# the frame has a fresh default integer index. Written as one chained expression
# (no inplace mutation) so re-running the cell is safe.
df = df.set_index('songId').reset_index()

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorise the combined per-song text with the default TF-IDF settings.
tvect = TfidfVectorizer()
total_tv = tvect.fit_transform(df['total'])
# Lookup table from songId to the corresponding row label of df.
indices = pd.Series(data=df.index, index=df['songId'])

In [19]:
# Inspect the songId -> row label lookup.
indices

songId
418168         0
35609035       1
1046278        2
35609034       3
35728845       4
            ... 
33013736    6242
3080890     6243
2981597     6244
34183979    6245
31191637    6246
Length: 6247, dtype: int64

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows of the TF-IDF matrix.
cosine_sim = cosine_similarity(total_tv)

In [10]:
def get_recommendation(songId, cos_sim, top_n=30, include_self=True):
    """Return the songIds most similar to ``songId``, best match first.

    Relies on the notebook-level ``indices`` (songId -> row position) and ``df``.

    Parameters
    ----------
    songId : str
        Id of the query song; must be a key of ``indices``.
        NOTE(review): assumes songId values are unique in ``indices`` -- confirm.
    cos_sim : 2-D array-like
        Similarity matrix whose row ``i`` holds the scores of row ``i`` of ``df``
        against every row.
    top_n : int, default 30
        Maximum number of songs to return.
    include_self : bool, default True
        When False, the query song is removed before ranking. The default keeps
        it, so the query itself normally appears as the first result.

    Returns
    -------
    pandas.Series
        ``songId`` values of the selected rows, ordered by descending similarity.

    Raises
    ------
    KeyError
        If ``songId`` is not present in ``indices``.
    """
    if songId not in indices.index:
        raise KeyError(f"unknown songId: {songId!r}")
    index = indices[songId]
    sim_scores = pd.Series(cos_sim[index])
    if not include_self:
        # Remove the query row by label instead of assuming it ranks first.
        sim_scores = sim_scores.drop(index)
    song_indices = sim_scores.sort_values(ascending=False).head(top_n).index
    return df.songId.iloc[song_indices]

In [11]:
# Ascending array of the combined comment + like counts.
sorted_numbers = np.sort(df['comment_like_total'].to_numpy())
sorted_numbers

array([     1,      2,      2, ..., 457930, 471132, 556783], dtype=int64)

In [12]:
# 25th percentile (first quartile) of the combined counts.
q1 = np.quantile(sorted_numbers, 0.25)
q1

1410.5

In [13]:
# 50th percentile (median) of the combined counts.
q2 = np.quantile(sorted_numbers, 0.5)
q2

11876.0

In [None]:
# Songs whose combined count lies in [q1, q2): at or above the 25th percentile
# and strictly below the 50th percentile.
popularity = df['comment_like_total']
filtered_data = df[popularity.ge(q1) & popularity.lt(q2)]
filtered_data

In [30]:
# Most similar songs for songId '35609035' using the cosine-similarity matrix.
get_recommendation('35609035', cosine_sim)

1       35609035
3       35609034
3673    35307770
3201     4381724
1593     1925230
5691    32323332
350     35667692
3684    35466599
3620    35690912
5879    32906337
5510    32906335
5341     8120284
3582    36065180
3712    35466609
5566    32457756
1078     4185941
2340       58506
1640       54319
2116     1053842
1588     1139812
5822     4260411
1131    33461694
5809    36132849
2039      607397
956     33507532
3356    30849733
874     33435717
5837     3592201
2858     1099824
3739    30225056
Name: songId, dtype: object

In [17]:
# Compute the recommendations in this cell: previously `a` was only assigned in a
# later cell, so a fresh top-to-bottom run raised NameError here.
a = get_recommendation('35609035', cosine_sim)
# Keep only the recommended songIds that also fall in the popularity band.
[i for i in a.values if i in filtered_data.songId.values]

['4185941',
 '58506',
 '1053842',
 '4260411',
 '33461694',
 '33507532',
 '33435717',
 '3592201',
 '1099824']

In [15]:
# Recommendations for the query song, best match first.
a = get_recommendation('35609035', cosine_sim)
# Recommended songs that also fall inside the popularity band, still in rank order.
in_band = a[a.isin(filtered_data.songId)]
b = in_band.tolist()
# Select the top five by their row labels so the result keeps the similarity
# ranking; filtering with isin() would return them in row-index order instead.
filtered = df.loc[in_band.index[:5]].to_dict()

In [18]:
# Show the column -> {row label: value} dict built for the selected songs.
filtered

{'songId': {1078: '4185941',
  1131: '33461694',
  2116: '1053842',
  2340: '58506',
  5822: '4260411'},
 'title': {1078: 'I Was Born To Love You + Only Love (Reprise) (너 하나만 + 사랑이야 Reprise)',
  1131: '태양이 지면 널 만나러 갈게',
  2116: '기분 좋은 날',
  2340: '널 보낸 후에',
  5822: '토요일 밤에'},
 'artist': {1078: '임태경', 1131: '백호', 2116: '김완선', 2340: '최재훈', 5822: '김혜연'},
 'genre': {1078: '국내뮤지컬',
  1131: '국내뮤지컬',
  2116: '댄스',
  2340: '발라드',
  5822: '성인가요/트로트'},
 'album': {1078: 'The Last Kiss (뮤지컬 황태자 루돌프 OST 중 하이라이트)',
  1131: '태양의 노래 Part.3',
  2116: 'Kim Wan Sun Vol 4',
  2340: '외면 (Ignore)',
  5822: '최고다 김혜연 Best Of The Best'},
 'lyricist': {1078: '', 1131: '', 2116: '이남우', 2340: '김혜선', 5822: '서판석'},
 'composer': {1078: '', 1131: '', 2116: '박청귀', 2340: '김형준', 5822: '민병훈'},
 'date': {1078: 20130726.0,
  1131: 20210510.0,
  2116: 19890610.0,
  2340: 19940701.0,
  5822: 20130901.0},
 'img': {1078: 'https://cdnimg.melon.co.kr/cm/album/images/021/94/425/2194425_500.jpg/melon/resize/282/quality/80/optimize

In [None]:
# Recommended songIds that are also present in the popularity-filtered frame,
# kept in recommendation order.
band_song_ids = filtered_data.songId.values
b = [song_id for song_id in a.values if song_id in band_song_ids]
        

In [22]:
# Cosine similarity, computed as the plain dot product of the TF-IDF rows.
# NOTE(review): this equals cosine similarity only if the TF-IDF rows are
# L2-normalised (the scikit-learn default for TfidfVectorizer) -- in that case it
# duplicates `cosine_sim`; confirm and keep just one matrix.
from sklearn.metrics.pairwise import linear_kernel
cosine_sim_total = linear_kernel(total_tv, total_tv)

In [23]:
# Similarity scores of song '35609035' against every row.
cosine_sim_total[indices['35609035']]

array([0.01298834, 1.        , 0.02440919, ..., 0.00454732, 0.00621814,
       0.        ])

In [24]:
# Similarity of song '35609035' to every song, as a Series labelled by row position.
query_row = cosine_sim_total[indices['35609035']]
sim_scores = pd.Series(query_row)
sim_scores

0       0.012988
1       1.000000
2       0.024409
3       0.280331
4       0.006050
          ...   
6242    0.012624
6243    0.000000
6244    0.004547
6245    0.006218
6246    0.000000
Length: 6247, dtype: float64

In [29]:
# Top 10 most similar songs, excluding the query song itself.
# The query row is dropped by label rather than by assuming it sorts first: a song
# with identical text also scores 1.0 and could otherwise take the first slot,
# leaving the query song inside the "top 10".
sim_scores.drop(indices['35609035']).sort_values(ascending=False).head(10)

3       0.280331
3673    0.229067
3201    0.103719
1593    0.101392
5691    0.094305
350     0.087265
3684    0.085459
3620    0.081659
5879    0.075554
5510    0.072076
dtype: float64