In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

# [3월 24일]
---

In [3]:
movies = pd.read_csv('../data/ml-latest-small/movies.csv')
ratings = pd.read_csv('../data/ml-latest-small/ratings.csv')

display(movies.head())
display(ratings.head())

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## # 데이터 전처리
---

In [4]:
rt = pd.pivot(ratings, index = 'userId', columns = 'movieId', values = 'rating')
rt

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [8]:
ratings_movies = pd.merge(ratings, movies, on = ['movieId']).drop('timestamp', axis = 1)
ratings_matrix = ratings_movies.pivot_table('rating', index = 'userId', columns = 'title').fillna(0)
ratings_matrix_T = ratings_matrix.fillna(0).T

ratings_matrix_T.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## # 영화 간 유사도 산출
---
- cosine_similarity 함수는 행을 기준으로 서로 다른 행을 비교해 유사도를 산출한다.
- 영화를 기준으로 cosine_similarity를 적용하면 현재의 ratings_matrix가 행 기준이 되고 열 기준이 사용자가 된다.

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

item_sim = cosine_similarity(ratings_matrix_T, ratings_matrix_T)
item_sim.shape

(9719, 9719)

In [11]:
item_sim_df = pd.DataFrame(item_sim, index = ratings_matrix.columns, columns = ratings_matrix.columns)
item_sim_df.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141653,0.0,...,0.0,0.342055,0.543305,0.707107,0.0,0.0,0.139431,0.327327,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.707107,1.0,0.0,0.0,0.0,0.176777,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,1.0,0.857493,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.857493,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# 다른 영화와 유사도 확인

item_sim_df['Godfather, The (1972)'].sort_values(ascending = False)[:15]

title
Godfather, The (1972)                                    1.000000
Godfather: Part II, The (1974)                           0.821773
Goodfellas (1990)                                        0.664841
One Flew Over the Cuckoo's Nest (1975)                   0.620536
Star Wars: Episode IV - A New Hope (1977)                0.595317
Fargo (1996)                                             0.588614
Star Wars: Episode V - The Empire Strikes Back (1980)    0.586030
Fight Club (1999)                                        0.581279
Reservoir Dogs (1992)                                    0.579059
Pulp Fiction (1994)                                      0.575270
American Beauty (1999)                                   0.575012
Usual Suspects, The (1995)                               0.574569
Matrix, The (1999)                                       0.570998
Star Wars: Episode VI - Return of the Jedi (1983)        0.567904
Saving Private Ryan (1998)                               0.560505
Name

## # 개인화된 영화 추천
---
- 개인화된 예측 평점 : $\hat{R}_{u, i} = \Sigma^N(S_{i,N} * R_{u,N})\ /\ \Sigma^N(|S_{i,N}|)$
- $\hat{R}_{u, i}$ : 사용자 u, 아이템 i의 개인화된 예측 평점 값
- $S_{i,N}$ : 아이템 i와 가장 유사도가 높은 Top_N개 아이템의 유사도 벡터
- $R_{u,N}$ : 사용자 u의 아이템 i와 가장 유사도가 높은 Top_N개 아이템에 대한 실제 평점 벡터

In [38]:
# 개인화 예측 평점 사용자 함수

def predict_rating(ratings_arr, item_sim_arr):
    ratings_pred = np.dot(ratings_arr, item_sim_arr) / np.array([np.abs(item_sim_arr).sum(axis = 0)])
    return ratings_pred

ratings_pred = predict_rating(ratings_matrix.values, item_sim_df.values)
ratings_pred_matrix = pd.DataFrame(ratings_pred, index = ratings_matrix.index, columns = ratings_matrix.columns)
ratings_pred_matrix

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.070345,0.577855,0.321696,0.227055,0.206958,0.194615,0.249883,0.102542,0.157084,0.178197,...,0.113608,0.181738,0.133962,0.128574,0.006179,0.212070,0.192921,0.136024,0.292955,0.720347
2,0.018260,0.042744,0.018861,0.000000,0.000000,0.035995,0.013413,0.002314,0.032213,0.014863,...,0.015640,0.020855,0.020119,0.015745,0.049983,0.014876,0.021616,0.024528,0.017563,0.000000
3,0.011884,0.030279,0.064437,0.003762,0.003749,0.002722,0.014625,0.002085,0.005666,0.006272,...,0.006923,0.011665,0.011800,0.012225,0.000000,0.008194,0.007017,0.009229,0.010420,0.084501
4,0.049145,0.277628,0.160448,0.206892,0.309632,0.042337,0.130048,0.116442,0.099785,0.097432,...,0.051269,0.076051,0.055563,0.054137,0.008343,0.159242,0.100941,0.062253,0.146054,0.231187
5,0.007278,0.066951,0.041879,0.013880,0.024842,0.018240,0.026405,0.018673,0.021591,0.018841,...,0.009689,0.022246,0.013360,0.012378,0.000000,0.025839,0.023712,0.018012,0.028133,0.052315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.246832,1.293006,0.752661,0.935924,1.032354,0.407038,0.516819,0.594590,0.489913,0.408661,...,0.227092,0.405114,0.291452,0.276143,0.055006,0.636525,0.510522,0.346652,0.550174,0.893777
607,0.052248,0.305255,0.180669,0.218834,0.179443,0.115288,0.165817,0.075548,0.104890,0.109500,...,0.059516,0.135763,0.093843,0.086643,0.003707,0.144222,0.134705,0.107674,0.156614,0.576894
608,0.324435,1.022541,0.598467,0.425468,0.349562,0.494081,0.529903,0.227746,0.480980,0.442384,...,0.276586,0.594918,0.457094,0.444436,0.038681,0.616733,0.717768,0.538586,0.527639,0.698871
609,0.004835,0.053593,0.026251,0.000000,0.002827,0.015528,0.017849,0.007791,0.013172,0.014981,...,0.006575,0.014368,0.010334,0.007742,0.000000,0.018070,0.015600,0.013108,0.018328,0.033377


In [37]:
from sklearn.metrics import mean_squared_error

# 사용자가 평점을 부여한 영화에 대해서만 예측 성능 평가

def get_mse(pred, actual):
    
    # 평점이 있는 영화만 추출
    pred = pred[actual.nonzero()]
    actual = actual[actual.nonzero()]
    return mean_squared_error(pred, actual)

print('MSE : ', get_mse(ratings_pred, ratings_matrix.values))

MSE :  9.895354759094706


In [265]:
# 영화 유사도 Top_N 안에서 영화를 선택하여 개인화 예측 평점을 만드는 사용자 함수

def predict_rating_topsim(ratings_arr, item_sim_arr, n = 20):
    
    # 사용자-아이템 평점 행렬 크기만큼 0으로 채운 예측 행렬 초기화
    pred = np.zeros(ratings_arr.shape)
    
    # 사용자-아이템 평점 행렬의 열 크기만큼 루프 수행
    for col in range(ratings_arr.shape[1]):
        
        # 유사도 행렬에서 유사도가 큰 순으로 n개 데이터 행렬의 인덱스 반환
        top_n_items = [np.argsort(item_sim_arr[:, col])[:-n-1:-1]]
        
        # 개인화된 예측 평점을 계산
        for row in range(ratings_arr.shape[0]):
            pred[row, col] = item_sim_arr[col, :][top_n_tems].dot(ratings_arr[row, :][top_n_items].T)
            pred[row, col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))
        
    return pred


ratings_pred = predict_rating_topsim(ratings_matrix.values, item_sim_df.values, n = 20)
print('Top_20 이웃 MSE : ', get_mse(ratings_pred, ratings_matrix.values))

  pred[row, col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row, :][top_n_items].T)
  pred[row, col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))


Top_20 이웃 MSE :  3.694957479362603


In [278]:
ratings_pred_matrix_20 = pd.DataFrame(ratings_pred, index = ratings_matrix.index, columns = ratings_matrix.columns)
ratings_pred_matrix_20

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.220798,0.000000,0.000000,1.677291,0.284372
2,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.220798,0.000000,0.000000,0.194828,0.000000
5,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.149633,0.0,0.418273,0.16678,0.0,0.130033,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.344930,0.268465,0.000000,0.694944,0.189602
607,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.194948,0.000000,0.000000,0.000000,0.000000
608,0.0,0.000000,0.0,0.159451,0.00000,0.0,0.243703,0.0,0.000000,0.0,...,0.0,0.129289,0.000000,0.112856,0.0,1.587302,2.988072,0.175489,0.702430,0.000000
609,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000


In [290]:
# 9번 사용자가 좋아하는 영화 Top_10

user_rating_id = ratings_matrix.loc[9, :]
user_rating_id[user_rating_id > 0].sort_values(ascending = False)

title
Adaptation (2002)                                                                 5.0
Citizen Kane (1941)                                                               5.0
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    5.0
Producers, The (1968)                                                             5.0
Lord of the Rings: The Two Towers, The (2002)                                     5.0
Lord of the Rings: The Fellowship of the Ring, The (2001)                         5.0
Back to the Future (1985)                                                         5.0
Austin Powers in Goldmember (2002)                                                5.0
Minority Report (2002)                                                            4.0
Witness (1985)                                                                    4.0
Pumpkin (2002)                                                                    4.0
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)   

In [294]:
# 사용자가 이미 평점을 준 영화를 리스트로 반환하는 사용자 함수

def get_unseen_movies(ratings_matrix, userId):
    
    # userId로 입력받은 사용자의 모든 영화 정보를 추출해 Series로 반환함
    # 반환된 user_rating은 영화명(title)을 인덱스로 가지는 Series 객체임
    user_rating = ratings_matrix.loc[userId, :]
    
    # 기존 관람한 영화 인덱스를 추출해 list 만들기
    already_seen = user_rating[user_rating > 0].index.tolist()
    
    # 모든 영화명을 list 객체로 만듦
    movies_list = ratings_matrix.columns.tolist()
    
    # list comprehension으로 already_seen에 해당하는 영화는 movies_list에서 제외함.
    unseen_list = [movie for movie in movies_list if movie not in already_seen]
    
    return unseen_list

In [301]:
# 사용자에게 영화를 추천하는 사용자 함수

def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n = 10):
    
    # 예측 평점 DataFrame에서 사용자id 인덱스와 unseen_list로 들어온 영화명 칼럼을 추출해
    # 가장 예측 평점이 높은 순으로 정렬함
    recomm_movies = pred_df.loc[userId, unseen_list].sort_values(ascending = False)[:top_n]
    return recomm_movies

# 사용자가 관람하지 않는 영화명 추출
unseen_list = get_unseen_movies(ratings_matrix, 9)

# 아이템 기반의 최근접 이웃 협업 필터링으로 영화 추천
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix_20, 9, unseen_list, 10)

# 평점 데이터를 DataFrame으로 생성
recomm_movies = pd.DataFrame(data = recomm_movies.values, index = recomm_movies.index, columns = ['pred_score'])

recomm_movies

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Shrek (2001),0.866202
Spider-Man (2002),0.857854
"Last Samurai, The (2003)",0.817473
Indiana Jones and the Temple of Doom (1984),0.816626
"Matrix Reloaded, The (2003)",0.80099
Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001),0.765159
Gladiator (2000),0.740956
"Matrix, The (1999)",0.732693
Pirates of the Caribbean: The Curse of the Black Pearl (2003),0.689591
"Lord of the Rings: The Return of the King, The (2003)",0.676711


## # 실험
---

In [191]:
ratings_pred[ratings_matrix.values.nonzero()]

array([0.2855597 , 1.08359021, 0.35404974, ..., 2.57350896, 1.08329872,
       1.81609065])

In [189]:
ratings_matrix.values.nonzero()

(array([  0,   0,   0, ..., 609, 609, 609], dtype=int64),
 array([  48,   66,  202, ..., 9712, 9715, 9716], dtype=int64))

In [200]:
a = ((1, 2, 3), (2, 2, 3))
ratings_pred[a]

array([0.01886104, 0.06443729, 0.20689206])

In [276]:
a = np.array(np.arange(1, 10)).reshape(3, -1)
b = np.array(np.arange(1, 10)).reshape(3, -1)

b / a.sum(axis = 1)

array([[0.16666667, 0.13333333, 0.125     ],
       [0.66666667, 0.33333333, 0.25      ],
       [1.16666667, 0.53333333, 0.375     ]])

In [277]:
b

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [217]:
np.zeros(a.shape)

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [156]:
a = np.array(np.arange(4)).reshape(2, -1)
a.nonzero()

(array([0, 1, 1], dtype=int64), array([1, 0, 1], dtype=int64))

In [256]:
a = np.array(np.arange(9)).reshape(3, -1)
np.argsort(a), a

(array([[0, 1, 2],
        [0, 1, 2],
        [0, 1, 2]], dtype=int64),
 array([[0, 1, 2],
        [3, 4, 5],
        [6, 7, 8]]))

In [82]:
a = np.dot(item_sim_df, ratings_matrix.T) / np.array([np.abs(item_sim_df).sum(axis = 0)]).reshape(-1, 1)
pd.DataFrame(a, index = ratings_matrix.columns, columns = ratings_matrix.index)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.070345,0.018260,0.011884,0.049145,0.007278,0.022967,0.062503,0.009577,0.016342,0.044189,...,0.067609,0.020523,0.200311,0.014261,0.049955,0.246832,0.052248,0.324435,0.004835,3.628303
'Hellboy': The Seeds of Creation (2004),0.577855,0.042744,0.030279,0.277628,0.066951,0.122637,0.372868,0.084303,0.081805,0.155954,...,0.322554,0.140291,1.091501,0.078915,0.248621,1.293006,0.305255,1.022541,0.053593,1.517918
'Round Midnight (1986),0.321696,0.018861,0.064437,0.160448,0.041879,0.071967,0.198837,0.047613,0.043044,0.075501,...,0.168266,0.076853,0.658284,0.056234,0.184790,0.752661,0.180669,0.598467,0.026251,0.833668
'Salem's Lot (2004),0.227055,0.000000,0.003762,0.206892,0.013880,0.188898,0.034989,0.027602,0.039426,0.106136,...,0.041925,0.032581,1.106965,0.023261,0.176035,0.935924,0.218834,0.425468,0.000000,0.399443
'Til There Was You (1997),0.206958,0.000000,0.003749,0.309632,0.024842,0.222312,0.046235,0.043137,0.026811,0.066934,...,0.047591,0.076592,1.045288,0.044582,0.126892,1.032354,0.179443,0.349562,0.002827,0.310888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),0.212070,0.014876,0.008194,0.159242,0.025839,0.118870,0.102175,0.027063,0.029529,0.041142,...,0.072462,0.072583,0.812802,0.045953,0.091977,0.636525,0.144222,0.616733,0.018070,0.628323
xXx (2002),0.192921,0.021616,0.007017,0.100941,0.023712,0.120876,0.136675,0.030581,0.032399,0.083420,...,0.100815,0.068403,0.359353,0.046173,0.132415,0.510522,0.134705,0.717768,0.015600,1.083299
xXx: State of the Union (2005),0.136024,0.024528,0.009229,0.062253,0.018012,0.080545,0.127286,0.024177,0.021871,0.081787,...,0.091581,0.043328,0.212603,0.036240,0.130186,0.346652,0.107674,0.538586,0.013108,1.816091
¡Three Amigos! (1986),0.292955,0.017563,0.010420,0.146054,0.028133,0.152925,0.098902,0.033878,0.027114,0.055053,...,0.078548,0.077949,0.492395,0.047133,0.111025,0.550174,0.156614,0.527639,0.018328,0.565075


In [81]:
np.array([np.abs(item_sim_df).sum(axis = 1)]).reshape(-1, 1)

array([[508.07109734],
       [ 45.47134351],
       [ 72.86766512],
       ...,
       [522.73875081],
       [914.30133794],
       [ 41.71929155]])

In [80]:
np.array([np.abs(item_sim_df["'71 (2014)"]).sum(axis = 0)]).reshape(-1, 1)

array([[508.07109734]])