* Dataset: MovieLens 100K — https://grouplens.org/datasets/movielens/100k/

In [8]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD

In [3]:
# Read the ratings file as tab-separated text; the column names are not taken
# from the file but supplied through `names`.
columns = [
    'user_id',
    'item_id',
    'rating',
    'timestamp',
]
df = pd.read_csv(
    './dataset/u.data',
    sep='\t',
    names=columns,
)
print(df.shape)
df.head()

(100000, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
# Column names applied to u.item: five descriptive fields followed by the
# genre-named columns. Note that this rebinds `columns` from the ratings cell.
columns = [
    'item_id',
    'movie title',
    'release date',
    'video release date',
    'IMDb URL',
] + [
    'unknown', 'Action', 'Adventure', 'Animation',
    'Childrens', 'Comedy', 'Crime', 'Documentary',
    'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi',
    'Thriller', 'War', 'Western',
]

# Pipe-separated file, decoded as latin-1.
movies = pd.read_csv(
    './dataset/u.item',
    sep='|',
    names=columns,
    encoding='latin-1',
)

# Only the id/title pair is carried forward; joining on item_id attaches a
# title to every rating row of `df`.
movie_names = movies.loc[:, ['item_id', 'movie title']]
c_movies_data = df.merge(movie_names, on='item_id')
print(c_movies_data.shape)
c_movies_data.head()

(100000, 5)


Unnamed: 0,user_id,item_id,rating,timestamp,movie title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [6]:
# One row per user, one column per movie title, cell = rating (combined with
# pivot_table's default aggregation). User/title pairs with no rating become 0.
rating_crosstab = pd.pivot_table(
    c_movies_data,
    index='user_id',
    columns='movie title',
    values='rating',
    fill_value=0,
)
rating_crosstab.head()

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,2,5,0,0,3,4,0,0,...,0,0,0,5,3,0,0,0,4,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,2,0,0,0,0,4,0,0,...,0,0,0,4,0,0,0,0,4,0


In [7]:
# Swap the axes so that each row is a movie title and each column a user.
X = rating_crosstab.transpose()
print(X.shape)

(1664, 943)


In [9]:
# Reduce every row of X to 12 SVD components; random_state is pinned so that
# re-running the cell uses the same seed.
svd = TruncatedSVD(
    n_components=12,
    random_state=5,
)
resultant_matrix = svd.fit_transform(X)
resultant_matrix.shape

(1664, 12)

In [10]:
# Correlation coefficients between every pair of rows of resultant_matrix
# (rows are treated as the variables), giving a square matrix.
corr_mat = np.corrcoef(resultant_matrix, rowvar=True)
print(corr_mat.shape)
corr_mat

(1664, 1664)


array([[ 1.        , -0.11573577,  0.51362284, ...,  0.38310045,
         0.20193733,  0.5065142 ],
       [-0.11573577,  1.        ,  0.05820808, ...,  0.15805829,
         0.51795357,  0.27104818],
       [ 0.51362284,  0.05820808,  1.        , ...,  0.76575655,
         0.43824619,  0.19507139],
       ...,
       [ 0.38310045,  0.15805829,  0.76575655, ...,  1.        ,
         0.18043708,  0.12115972],
       [ 0.20193733,  0.51795357,  0.43824619, ...,  0.18043708,
         1.        ,  0.20126072],
       [ 0.5065142 ,  0.27104818,  0.19507139, ...,  0.12115972,
         0.20126072,  1.        ]])

In [11]:
# Position of the query title among the pivot table's columns; the next cell
# uses the same lookup to pick a row of corr_mat.
rating_crosstab.columns.get_loc('Star Wars (1977)')

1398

In [12]:
# Locate the query title among the pivot columns, then take the matching row
# of the correlation matrix: one coefficient per movie title.
col_idx = rating_crosstab.columns.get_loc('Star Wars (1977)')
corr_specific = corr_mat[col_idx, :]
print(corr_specific.shape)

(1664,)


In [13]:
# Pair each correlation coefficient with the title it refers to; both
# sequences follow the column order of rating_crosstab.
result = pd.DataFrame(
    {
        'corr_specific': corr_specific,
        'Movies': rating_crosstab.columns,
    }
)
print(result.shape)
result.head()

(1664, 2)


Unnamed: 0,corr_specific,Movies
0,0.357238,'Til There Was You (1997)
1,0.421507,1-900 (1994)
2,0.593815,101 Dalmatians (1996)
3,0.722361,12 Angry Men (1957)
4,0.325221,187 (1997)


In [14]:
# Strongest correlations first, first 10 rows only. The query title itself is
# part of `result`, so it shows up at the top with a coefficient of 1.0.
(
    result
    .sort_values(by='corr_specific', ascending=False)
    .iloc[:10]
)

Unnamed: 0,corr_specific,Movies
1398,1.0,Star Wars (1977)
1234,0.988052,Return of the Jedi (1983)
1460,0.942655,Terminator 2: Judgment Day (1991)
1523,0.933978,Toy Story (1995)
1461,0.931701,"Terminator, The (1984)"
1205,0.925185,Raiders of the Lost Ark (1981)
456,0.923562,"Empire Strikes Back, The (1980)"
570,0.915965,"Fugitive, The (1993)"
414,0.914299,Die Hard (1988)
44,0.892894,Aliens (1986)


In [15]:
# Position of the query title among the pivot table's columns; the next cell
# uses the same lookup to pick a row of corr_mat.
rating_crosstab.columns.get_loc('Young Guns II (1990)')

1659

In [16]:
# Same lookup as for the previous query, now for a different title: find its
# column position and take that row of the correlation matrix.
col_idx = rating_crosstab.columns.get_loc('Young Guns II (1990)')
corr_specific = corr_mat[col_idx, :]
print(corr_specific.shape)

(1664,)


In [17]:
# Rebuild the coefficient/title table for the current corr_specific; this
# overwrites the `result` frame left by the previous query.
result = pd.DataFrame(
    {
        'corr_specific': corr_specific,
        'Movies': rating_crosstab.columns,
    }
)
print(result.shape)
result.head()

(1664, 2)


Unnamed: 0,corr_specific,Movies
0,0.44775,'Til There Was You (1997)
1,0.227687,1-900 (1994)
2,0.563967,101 Dalmatians (1996)
3,0.411437,12 Angry Men (1957)
4,0.364059,187 (1997)


In [18]:
# Strongest correlations first; keep the first 21 rows (the query title itself
# is included in the ranking).
(
    result
    .sort_values(by='corr_specific', ascending=False)
    .iloc[:21]
)

Unnamed: 0,corr_specific,Movies
1659,1.0,Young Guns II (1990)
1658,0.969005,Young Guns (1988)
355,0.958561,"Crow, The (1994)"
406,0.947148,Desperado (1995)
17,0.945939,Ace Ventura: Pet Detective (1994)
402,0.93443,Demolition Man (1993)
994,0.932663,Money Train (1995)
1586,0.930606,Virtuosity (1995)
1344,0.930541,Sliver (1993)
593,0.929902,"Getaway, The (1994)"


In [19]:
# Position of the query title among the pivot table's columns; the next cell
# uses the same lookup to pick a row of corr_mat.
rating_crosstab.columns.get_loc('Yankee Zulu (1994)')

1654

In [20]:
# Third query: find the title's column position and take the corresponding
# row of the correlation matrix.
col_idx = rating_crosstab.columns.get_loc('Yankee Zulu (1994)')
corr_specific = corr_mat[col_idx, :]
print(corr_specific.shape)

(1664,)


In [21]:
# Rebuild the coefficient/title table for the current corr_specific; this
# overwrites the `result` frame left by the previous query.
result = pd.DataFrame(
    {
        'corr_specific': corr_specific,
        'Movies': rating_crosstab.columns,
    }
)
print(result.shape)
result.head()

(1664, 2)


Unnamed: 0,corr_specific,Movies
0,0.366853,'Til There Was You (1997)
1,0.231278,1-900 (1994)
2,0.236764,101 Dalmatians (1996)
3,0.373489,12 Angry Men (1957)
4,0.273117,187 (1997)


In [22]:
# Strongest correlations first; keep the first 31 rows.
# NOTE(review): in the saved output every displayed row is tied at exactly 1.0
# and the query title is not among them, so this ranking does not
# discriminate between titles -- interpret it with caution.
(
    result
    .sort_values(by='corr_specific', ascending=False)
    .iloc[:31]
)

Unnamed: 0,corr_specific,Movies
756,1.0,"Invitation, The (Zaproszenie) (1986)"
727,1.0,"I, Worst of All (Yo, la peor de todas) (1990)"
861,1.0,Liebelei (1933)
265,1.0,Careful (1992)
1503,1.0,To Cross the Rubicon (1991)
1144,1.0,Pharaoh's Army (1995)
1468,1.0,The Courtyard (1995)
1186,1.0,"Promise, The (Versprechen, Das) (1994)"
714,1.0,"Hungarian Fairy Tale, A (1987)"
1195,1.0,Quartier Mozart (1992)
