In [1]:
# Import Pandas
import pandas as pd
# Import Numpy
import numpy as np
# Import get user profile
from userProfile import get_user_profile
# Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

"""
Since you have used the TF-IDF vectorizer, calculating the dot product will
directly give you the cosine similarity score. Therefore, you will use
sklearn's linear_kernel() instead of cosine_similarities() since it is faster.
"""

import mysql.connector

np.set_printoptions(threshold=np.inf)

mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  password="cKeecl00",
  database="sys"
)

# Load Movies Metadata as a DataFrame
poems = pd.read_sql("SELECT * FROM poems", mydb)
poems = poems[['poem_id', 'title', 'author',
               'lines', 'linecount', 'wordcount']]

userProfile = get_user_profile()
print(poems.shape)
poems = poems.append(userProfile, ignore_index=True)
print(poems.shape)

(93, 6)
(94, 6)


In [2]:
userProfile

{'author': ['Percy Bysshe Shelley',
  'George Gordon, Lord Byron',
  'Percy Bysshe Shelley',
  'Percy Bysshe Shelley',
  'Percy Bysshe Shelley',
  'Percy Bysshe Shelley',
  'William Blake',
  'George Gordon, Lord Byron',
  'George Gordon, Lord Byron',
  'George Gordon, Lord Byron',
  'Percy Bysshe Shelley',
  'Percy Bysshe Shelley',
  'Percy Bysshe Shelley',
  'George Gordon, Lord Byron',
  'Percy Bysshe Shelley'],
 'linecount': 3.7333333333333334,
 'lines': ['["His face was like a snake\'s--wrinkled and loose", "And withered--"]',
  '["God maddens him whom\'t is his will to lose,", "And gives the choice of death or phrenzy--choose."]',
  '["Rome has fallen, ye see it lying", "Heaped in undistinguished ruin:", "Nature is alone undying."]',
  '["I went into the deserts of dim sleep--", "That world which, like an unknown wilderness,", "Bounds this with its recesses wide and deep--"]',
  '["Hark! the owlet flaps his wings", "In the pathless dell beneath;", "Hark! \'tis the night-raven sin

In [5]:
poems['lines'] = poems['lines'].map(lambda x: ''.join(x))

# Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')
# flatten poem lines into a list of strings

# Replace NaN with an empty string
poems['lines'] = poems['lines'].fillna('')
# Construct the required TF-IDF matrix by fitting and transforming the data

In [7]:
tfidf_matrix = tfidf.fit_transform(poems['lines'])

In [8]:
# Compute the cosine similarity matrix
# Linear kernel used as it's faster
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [20]:
# Construct a reverse map of indices and movie titles
indices = pd.Series(poems.index, index=poems['poem_id']).drop_duplicates()


# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(poem_id, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[poem_id]
    # Get the pairwsie similarity scores of all movies with that moviee
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    # Return the top 10 most similar movies
    return poems['title'].iloc[movie_indices]


def to_similarity_matrix(feature):
    'input is an array of numbers, returns a numpy 2d matrix'
    simMatrix = []
    for x in feature:
        row = []
        for y in feature:
            # this math is quite important
            # find difference between two numbers
            # add 1 to prevent / 0 error
            # divide 1 by new number to get scaled similarity
            row.append(1/((max(x, y)-min(x, y))+1))
        simMatrix.append(row)
    return np.array(simMatrix)


# Function to convert all strings to lower case and strip names of spaces
# Hyphens separate authors, and so are replaced with spaces for many authors
def clean_author(x):
    if isinstance(x, list):
        return " ".join(x)
    else:
        return str.lower(x.replace(" ", ""))


poems['author'] = poems['author'].apply(clean_author)

In [22]:
poems['author'][93]

['percybyssheshelley',
 'georgegordon,lordbyron',
 'percybyssheshelley',
 'percybyssheshelley',
 'percybyssheshelley',
 'percybyssheshelley',
 'williamblake',
 'georgegordon,lordbyron',
 'georgegordon,lordbyron',
 'georgegordon,lordbyron',
 'percybyssheshelley',
 'percybyssheshelley',
 'percybyssheshelley',
 'georgegordon,lordbyron',
 'percybyssheshelley']

In [17]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(poems['author'])

AttributeError: 'list' object has no attribute 'lower'

In [12]:
author_sim = cosine_similarity(count_matrix, count_matrix)
# Downweight author significance in vectorizer
author_sim = np.multiply(author_sim, 0.1)

poems = poems.reset_index()
indices = pd.Series(poems.index, index=poems['poem_id']).drop_duplicates()

# Create sim matrix for line count
lines_sim = to_similarity_matrix(poems['linecount'])
# Create sim matrix for word count
wordcount_sim = to_similarity_matrix(poems['wordcount'])
wordcount_sim = np.multiply(wordcount_sim, 0.2)
# Average the two cosine similarities
final_features = [cosine_sim, author_sim, lines_sim, wordcount_sim]
final_sim = np.mean(np.array(final_features), axis=0)

# print(cosine_sim[0])
# print(author_sim[0])
# print(lines_sim[0])
# print(wordcount_sim[0])
# print(final_sim[0])

# print(get_recommendations(102, cosine_sim))
# print(get_recommendations(102, author_sim))
# print(get_recommendations(107, lines_sim))
# print(get_recommendations(107, wordcount_sim))
# print(get_recommendations(102, final_sim))
print(get_recommendations(1, final_sim))

AttributeError: 'list' object has no attribute 'lower'