In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reading ratings file
# Ignore the timestamp column
ratings = pd.read_csv('ratings.csv', sep='\t', encoding='latin-1', usecols=['user_id', 'movie_id', 'rating'])

# Reading users file
users = pd.read_csv('users.csv', sep='\t', encoding='latin-1', usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'])

# Reading movies file
movies = pd.read_csv('movies.csv', sep='\t', encoding='latin-1', usecols=['movie_id', 'title', 'genres'])

In [5]:
# Break up the big genre string into a string array
movies['genres'] = movies['genres'].str.split('|')
# Convert genres to string value
movies['genres'] = movies['genres'].fillna("").astype('str')

In [6]:
movies.head(3)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),"['Animation', ""Children's"", 'Comedy']"
1,2,Jumanji (1995),"['Adventure', ""Children's"", 'Fantasy']"
2,3,Grumpier Old Men (1995),"['Comedy', 'Romance']"


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])
tfidf_matrix.shape

(3883, 127)

In [34]:
print(tf.get_feature_names()[:3])
len(tf.get_feature_names())

['action', 'action adventure', 'action animation']


127

In [33]:
matrix = pd.DataFrame(tfidf_matrix.todense(),columns=tf.get_feature_names())
matrix.head(3)

Unnamed: 0,action,action adventure,action animation,action children,action comedy,action crime,action drama,action horror,action mystery,action romance,...,romance war,romance western,sci,sci fi,thriller,thriller war,thriller western,war,war western,western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
[i for i in range(len(matrix.head(1).values)) if matrix.head(1).values[i] > 0]

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [61]:
matrix.head(1).columns

Index(['action', 'action adventure', 'action animation', 'action children',
       'action comedy', 'action crime', 'action drama', 'action horror',
       'action mystery', 'action romance',
       ...
       'romance war', 'romance western', 'sci', 'sci fi', 'thriller',
       'thriller war', 'thriller western', 'war', 'war western', 'western'],
      dtype='object', length=127)

In [59]:
[col for col in matrix.head(1).columns if matrix.head(1).]

In [60]:
df2

Unnamed: 0,action,action adventure,action animation,action children,action comedy,action crime,action drama,action horror,action mystery,action romance,...,romance war,romance western,sci,sci fi,thriller,thriller war,thriller western,war,war western,western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
type(matrix.head(1))

pandas.core.frame.DataFrame

In [25]:

corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())
print(X.shape)

pd.DataFrame(X.todense(),columns=vectorizer.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
(4, 9)


Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085
1,0.0,0.687624,0.0,0.281089,0.0,0.538648,0.281089,0.0,0.281089
2,0.511849,0.0,0.0,0.267104,0.511849,0.0,0.267104,0.511849,0.267104
3,0.0,0.469791,0.580286,0.384085,0.0,0.0,0.384085,0.0,0.384085


In [None]:
pd.DataFrame()

In [22]:
type(X)

scipy.sparse.csr.csr_matrix

In [28]:
vectorizer1 = TfidfVectorizer(norm=None, smooth_idf=False)
X1 = vectorizer1.fit_transform(corpus)
print(vectorizer1.get_feature_names())
print(X1.shape)
pd.DataFrame(X1.todense(),columns=vectorizer1.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
(4, 9)


Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0.0,1.287682,1.693147,1.0,0.0,0.0,1.0,0.0,1.0
1,0.0,2.575364,0.0,1.0,0.0,2.386294,1.0,0.0,1.0
2,2.386294,0.0,0.0,1.0,2.386294,0.0,1.0,2.386294,1.0
3,0.0,1.287682,1.693147,1.0,0.0,0.0,1.0,0.0,1.0


In [29]:
vectorizer1.idf_

array([2.38629436, 1.28768207, 1.69314718, 1.        , 2.38629436,
       2.38629436, 1.        , 2.38629436, 1.        ])

In [66]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[:4, :4]

array([[1.        , 0.14193614, 0.09010857, 0.1056164 ],
       [0.14193614, 1.        , 0.        , 0.        ],
       [0.09010857, 0.        , 1.        , 0.1719888 ],
       [0.1056164 , 0.        , 0.1719888 , 1.        ]])

In [67]:
cosine_sim.shape

(3883, 3883)