Final Code

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

'''
This cell reads in the data needed for the model. The two files needed are the combined data files with the 
customer ratings and the movie titles files in order for the model to print out recommended movies.

Then once the data is read in, we put it in a pandas dataframe for it to be easier to work with.
'''

mov_titles = pd.read_csv('movie_titles.csv', header = None, encoding = "ISO-8859-1")

mov_titles = mov_titles.drop(columns=[1,3,4,5], axis = 1)
mov_titles = mov_titles.rename(columns = {0:'movie_id',2:'movie_title'})

                
list_1 = []
with open('combined_data_1.txt', 'r') as f:
        for line in f:
            splitLine = line.split('/t')
            for item in splitLine:
                list_1.append(splitLine)
                
# Then we needed a way to loop through and associate movie id with each record. So we append it back to the list_1
for x in list_1:
    for i in x:
        if ':' in i:
            a = len(i)
            y2 = i[0:a]
            y = y2.replace(":", "")
    x.append(y)


# In this section we want to take everything in list_1 and split out the customer id, rating, and date better.
keys = ['customer_id','customer_rating','date','movie_id']
newList=[]

for x in list_1:
    movie_id = x[1]
    y = x[0]
    d = y.split(',')
    d.append(movie_id)
    newList.append(d)

# Now that we have the structure by customer, how they rated the movie and all that jazz. 
# We need to get rid of the values in the list that are just the movie numbers.
values = []
for x in newList:
    if len(x)==4:
        values.append(x)
        
# Finally we can put it into a dataframe and start looking at our data.  
df = pd.DataFrame(values, columns=keys)
df = df.replace('\n','', regex=True)
df['date'] = df['date'].astype('datetime64[ns]')
df['customer_rating'] = df['customer_rating'].astype('float')

In [2]:
'''
In this cell, we do a left join of the ratings file and the movie titles file to replace movie id with the title of the movie.
we will use the df3 dataframe later in the model to output movie titles.
'''

df_3 = df.join(mov_titles, lsuffix='movie_id', rsuffix='movie_id')
df_3 = df_3.drop(columns=['movie_idmovie_id'], axis = 1)

In [21]:
'''
This section of code is to create functions to run our code. The PreProcess function takes a given customer id. Then it 
filters our dataset for the movies that customer rated. Then we get a list of just those movies and apply it back to 
the overall dataset. This way when we run a our model, the nearest neighbors aren't the ones with many 0's for ratings.
From the PreProcessing function we receive a matrix to use with filtered values necessary for modeling.

The matrix_prep function takes the processed matrix and groups it so that we get a nxm matrix where n are the customers
and m are the movies they rated. If there is a movie a customer has not rated it gets a 0. The output is a sparse matrix 
with these results.

Finally, the Recommendation function takes the sparse matrix from the matrix_prep function, the customer id, 
and how many neighbors you want your model to have. The model is a nearestneighbor model that caluclates the 
cosine similarity between the provided customer and the other customers that rated the at least one of the
movies that the customer rated. 

Then we loop through the customers pulling out the similar customers and put this in a list. We then use this 
list to go back and filter for these customers movies that they rated a 4 or 5. Then we grab this list of movies
and this is the list returned.
'''

def PreProcess(customer_id):
    
    query_index = str(customer_id) #np.random.choice(ddf_3.shape[0])
    
    customer = df[df['customer_id'] == query_index]

    customer_movies = customer.loc[:, (customer != 0).any(axis = 0)]

    movies_to_include = customer_movies['movie_id'].tolist()

    mask = df['movie_id'].isin(movies_to_include)
    movies_matrix_for_sim = df.loc[~mask]
    
    movies_matrix_for_sim = movies_matrix_for_sim.append(customer_movies, ignore_index=True)
    
    return movies_matrix_for_sim

def matrix_prep(movies_matrix_for_sim):
    
    ddf_2 = movies_matrix_for_sim.groupby(['customer_id', 'movie_id']).customer_rating.mean().unstack(fill_value=0)
    
    
    mat_features = csr_matrix(ddf_2.values)
    
    return mat_features

    
def Recommendation(mat_features, customer_id, n_neighbors):
    
    query_index = str(customer_id)

    model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
    model_knn = model_knn.fit(mat_features)

    distances, indices = model_knn.kneighbors(ddf_2.loc[[query_index]], n_neighbors = n_neighbors)

    sim_customers_key = []
    sim_customers_vals = []
    for i in range(0, len(distances.flatten())):
        if i == 0: 
            #key = ddf_2.index[customer_id]
            #sim_customers_key.append(key)
            pass
        else: 
            val = ddf_2.index[indices.flatten()[i]]
            sim_customers_vals.append(val)
            
    mask = df_3['customer_id'].isin(sim_customers_vals)
    sim_customers = df_3.loc[~mask]
    
    #need orig customer to have filtered df_3 table
    orig_customer = df_3[df_3['customer_id'] == query_index]
    #mask = df_3['customer_id'].isin(sim_customers_key)
    #orig_customer = df_3.loc[~mask]
    
    mask = sim_customers['customer_rating'].isin([4,5])
    sim_customers = sim_customers.loc[~mask]
    
    orig_movies = orig_customer['movie_title'].values
    sim_movies = sim_customers['movie_title'].values
    
    rec_list = [i for i in sim_movies if i not in orig_movies]
    
    return rec_list
    

In [4]:
'''
This is implementing the PreProcess function for customer 1488844.
'''

matrix_1 = PreProcess(1488844)


In [5]:
'''
Due to memory issues I could not run matrix_prep with the two function in it. Thus I ran them separately.
This is the first part of the matrix_prep function.
'''

ddf_2 = matrix_1.groupby(['customer_id', 'movie_id']).customer_rating.mean().unstack(fill_value=0)

In [6]:
'''
Due to memory issues I could not run matrix_prep with the two function in it. Thus I ran them separately.
This is the second part of the matrix_prep function.
'''

mat_features = csr_matrix(ddf_2.values)

In [23]:
'''
This is the final function running the model and saving the results for customer 1488844 with 3 neighbors.
'''

recommended_for_1488844 = Recommendation(mat_features,1488844, 3)

In [26]:
'''
This is the firt 10 recommended movies for customer 1488844.
'''

recommended_for_1488844[0:10]

['The Rise and Fall of ECW',
 'Sick',
 'What the #$*! Do We Know!?',
 'Fighter',
 'Lord of the Rings: The Return of the King: Extended Edition: Bonus Material',
 'Nature: Antarctica',
 'Immortal Beloved',
 'Strange Relations',
 'Chump Change',
 'Inspector Morse 31: Death Is Now My Neighbour']