## BUILDING A RECOMMENDER SYSTEM ON USER-USER COLLABORATIVE FILTERING (MOVIELENS DATASET)

We will load the data sets firsts.

In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import math

#column headers for the dataset
data_cols = ['user id','movie id','rating','timestamp']
item_cols = ['movie id','movie title','release date','video release date','IMDb URL','unknown','Action',
'Adventure','Animation','Childrens','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror',
'Musical','Mystery','Romance ','Sci-Fi','Thriller','War' ,'Western']
user_cols = ['user id','age','gender','occupation','zip code']

#importing the data files onto dataframes
data_df = pd.read_csv('ml-100k/u.data', sep='\t', names=data_cols, encoding='latin-1')
item_df = pd.read_csv('ml-100k/u.item', sep='|', names=item_cols, encoding='latin-1')
user_df = pd.read_csv('ml-100k/u.user', sep='|', names=user_cols, encoding='latin-1')

#dropping unecessary columns
#Voting Timestamp - Removed
data_df.drop(data_df.columns[[3]], axis = 1, inplace = True)
#Movie Title, Video Release Date and IMDB URL - Removed
item_df.drop(item_df.columns[[1,3,4]], axis = 1, inplace = True)
#Occupation and Zip Code - Removed
user_df.drop(user_df.columns[[3,4]], axis = 1, inplace = True)

In [3]:
print(data_df.head())

   user id  movie id  rating
0      196       242       3
1      186       302       3
2       22       377       1
3      244        51       2
4      166       346       1


In [4]:
print(item_df.head())

   movie id release date  unknown  Action  Adventure  Animation  Childrens  \
0         1  01-Jan-1995        0       0          0          1          1   
1         2  01-Jan-1995        0       1          1          0          0   
2         3  01-Jan-1995        0       0          0          0          0   
3         4  01-Jan-1995        0       1          0          0          0   
4         5  01-Jan-1995        0       0          0          0          0   

   Comedy  Crime  Documentary   ...     Fantasy  Film-Noir  Horror  Musical  \
0       1      0            0   ...           0          0       0        0   
1       0      0            0   ...           0          0       0        0   
2       0      0            0   ...           0          0       0        0   
3       1      0            0   ...           0          0       0        0   
4       0      1            0   ...           0          0       0        0   

   Mystery  Romance   Sci-Fi  Thriller  War  Western  
0

In [5]:
#Ajust release date to get only the year
item_df['release date'] = pd.to_datetime(item_df['release date'], errors='coerce').dt.year

In [6]:
print(item_df.head())

   movie id  release date  unknown  Action  Adventure  Animation  Childrens  \
0         1        1995.0        0       0          0          1          1   
1         2        1995.0        0       1          1          0          0   
2         3        1995.0        0       0          0          0          0   
3         4        1995.0        0       1          0          0          0   
4         5        1995.0        0       0          0          0          0   

   Comedy  Crime  Documentary   ...     Fantasy  Film-Noir  Horror  Musical  \
0       1      0            0   ...           0          0       0        0   
1       0      0            0   ...           0          0       0        0   
2       0      0            0   ...           0          0       0        0   
3       1      0            0   ...           0          0       0        0   
4       0      1            0   ...           0          0       0        0   

   Mystery  Romance   Sci-Fi  Thriller  War  Weste

In [7]:
print(user_df.head())

   user id  age gender
0        1   24      M
1        2   53      F
2        3   23      M
3        4   24      M
4        5   33      F


In [8]:
#Convert Gender column to numeric
user_df['gender'].replace('F', 1,inplace=True)
user_df['gender'].replace('M', 2,inplace=True)

In [9]:
#Adjust columns replacing NaN with the mean
meanYear = int(round(item_df['release date'].mean()))
print(meanYear)

1989


In [10]:
item_df['release date'] = item_df['release date'].fillna(meanYear)

In [11]:
print(item_df['release date'].hasnans)

False


In [12]:
#merge it all
data_item = pd.merge(data_df, item_df, left_on = "movie id", right_on = "movie id")
data_item_user = pd.merge(data_item, user_df, left_on = "user id", right_on = "user id")
dataset = data_item_user

In [13]:
print(dataset.head())

   user id  movie id  rating  release date  unknown  Action  Adventure  \
0      196       242       3        1997.0        0       0          0   
1      196       257       2        1997.0        0       1          1   
2      196       111       4        1996.0        0       0          0   
3      196        25       4        1996.0        0       0          0   
4      196       382       4        1994.0        0       0          0   

   Animation  Childrens  Comedy   ...    Horror  Musical  Mystery  Romance   \
0          0          0       1   ...         0        0        0         0   
1          0          0       1   ...         0        0        0         0   
2          0          0       1   ...         0        0        0         1   
3          0          0       1   ...         0        0        0         0   
4          0          0       1   ...         0        0        0         0   

   Sci-Fi  Thriller  War  Western  age  gender  
0       0         0    0       

In [14]:
# Data distribution
display(dataset.describe())

Unnamed: 0,user id,movie id,rating,release date,unknown,Action,Adventure,Animation,Childrens,Comedy,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,age,gender
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,1987.95631,0.0001,0.25589,0.13753,0.03605,0.07182,0.29832,...,0.05317,0.04954,0.05245,0.19461,0.1273,0.21872,0.09398,0.01854,32.96985,1.7426
std,266.61442,330.798356,1.125674,14.154889,0.01,0.436362,0.344408,0.186416,0.258191,0.457523,...,0.224373,0.216994,0.222934,0.395902,0.33331,0.41338,0.291802,0.134894,11.562623,0.437204
min,1.0,1.0,1.0,1922.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,1.0
25%,254.0,175.0,3.0,1986.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,1.0
50%,447.0,322.0,4.0,1994.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,2.0
75%,682.0,631.0,4.0,1996.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,2.0
max,943.0,1682.0,5.0,1998.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,73.0,2.0


In [15]:
data_matrix = dataset.pivot(index = 'user id').fillna(0)
data_matrix.head()

ValueError: cannot label index with a null key