In [1]:
import sys

import numpy as np
import pandas as pd

from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestRegressor

In [2]:
SHOW_DATA = True

## Load data

In [3]:
# Users file
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=users_cols, encoding='latin-1')

# Ratings file
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=ratings_cols, encoding='latin-1')

# Train + Test ratings
ratings_train = pd.read_csv('ml-100k/ua.base', sep='\t', names=ratings_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=ratings_cols, encoding='latin-1')

# Items file
items_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
              'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
              'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item', sep='|', names=items_cols, encoding='latin-1')

In [4]:
if SHOW_DATA:
    print "Users:", users.shape, "\n", users.head(), "\n\n"
    print "Ratings:", ratings.shape, "\n", ratings.head(), "\n\n"
    print "Items:", items.shape, "\n", items.head()

Users: (943, 5) 
   user_id  age sex  occupation zip_code
0        1   24   M  technician    85711
1        2   53   F       other    94043
2        3   23   M      writer    32067
3        4   24   M  technician    43537
4        5   33   F       other    15213 


Ratings: (100000, 4) 
   user_id  movie_id  rating  unix_timestamp
0      196       242       3       881250949
1      186       302       3       891717742
2       22       377       1       878887116
3      244        51       2       880606923
4      166       346       1       886397596 


Items: (1682, 24) 
   movie id        movie title release date  video release date  \
0         1   Toy Story (1995)  01-Jan-1995                 NaN   
1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
4         5     Copycat (1995)  01-Jan-1995                 NaN   

                 

## Item-based NN recommendations

In [5]:
# Fit a k=20 KDTree with the movies genres as features

movie_features = pd.DataFrame(data=items, columns=["Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", 
                                                   "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
                                                   "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"])

knn = NearestNeighbors(n_neighbors=20).fit(movie_features)

In [6]:
# Use kNN to, given a certain movie genre, recommend similar movies

# Example: Animation + Children's + Comedy
distances, indices = knn.kneighbors(np.array([[0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))

print "Recommendations for Animation + Children's + Comedy:"
for index in indices[0]:
    print "\t", items[items["movie id"] == (index + 1)]["movie title"].values[0]  # index is offset by 1!
    
# Example: Sci-Fi + Adventure
distances, indices = knn.kneighbors(np.array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]]))

print "\n\nRecommendations for Sci-Fi + Adventure:"
for index in indices[0]:
    print "\t", items[items["movie id"] == (index + 1)]["movie title"].values[0]  # index is offset by 1!

Recommendations for Animation + Children's + Comedy:
	Aladdin and the King of Thieves (1996)
	Toy Story (1995)
	Air Bud (1997)
	George of the Jungle (1997)
	Little Rascals, The (1994)
	D3: The Mighty Ducks (1996)
	Home Alone (1990)
	Jungle2Jungle (1997)
	Heavyweights (1994)
	Beavis and Butt-head Do America (1996)
	Flintstones, The (1994)
	Santa Clause, The (1994)
	Mouse Hunt (1997)
	Wrong Trousers, The (1993)
	Grand Day Out, A (1992)
	Pinocchio (1940)
	Matilda (1996)
	Love Bug, The (1969)
	Aladdin (1992)
	Aristocats, The (1970)


Recommendations for Sci-Fi + Adventure:
	City of Lost Children, The (1995)
	Barbarella (1968)
	Star Trek V: The Final Frontier (1989)
	Star Trek: The Motion Picture (1979)
	Species (1995)
	Mark of Zorro, The (1940)
	Jurassic Park (1993)
	Star Trek VI: The Undiscovered Country (1991)
	Screamers (1995)
	Forbidden Planet (1956)
	Tetsuo II: Body Hammer (1992)
	Sphere (1998)
	Man Who Would Be King, The (1975)
	Alphaville (1965)
	Treasure of the Sierra Madre, The (1

## User-based NN recommendations

In [7]:
MAX_RATING = 5

def prepare_data(data):
    users_features = {}
    for idx, i in data.iterrows():
        movie = items[items["movie id"] == i["movie_id"]]
        movie_features = pd.DataFrame(data=movie, columns=["Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
                                                           "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
                                                           "Mystery", "Romance", "Sci-Fi", "Thriller", "War",
                                                           "Western"]).as_matrix()[0]

        rating = i["rating"]
        user_id = i["user_id"]
        age = users[users["user_id"] == user_id]["age"].values[0]
        sex = users[users["user_id"] == user_id]["sex"].values[0]
        occupation = users[users["user_id"] == user_id]["occupation"].values[0]
        zip_code = users[users["user_id"] == user_id]["zip_code"].values[0]
        
        try:
            float(users[users["user_id"] == user_id]["zip_code"].values[0])
        except ValueError, _:
            # Some zip codes are not a string of digits. Due to way in which we will later adjust the zip codes feature, we will
            # ignore the users with such zip codes.
            continue

        weighted_movie_features = rating * movie_features / float(MAX_RATING)

        if user_id not in users_features:
            users_features[user_id] = {
                "data": {
                    "age": age,
                    "sex": sex,
                    "occupation": occupation,
                    "zip_code": zip_code,
                },
                "weighted_movie_features": weighted_movie_features,
                "movie_count": 1,
                "average_rating": rating
            }
        else:
            curr_weighted_movie_features = users_features[user_id]["weighted_movie_features"]
            curr_movie_count = users_features[user_id]["movie_count"]
            curr_average_rating = users_features[user_id]["average_rating"]
            curr_data = users_features[user_id]["data"]

            updated_weighted_movie_features = curr_weighted_movie_features + weighted_movie_features
            updated_movie_count = curr_movie_count + 1
            updated_average_rating = curr_average_rating + rating

            users_features[user_id] = {
                "data": curr_data,
                "weighted_movie_features": updated_weighted_movie_features,
                "movie_count": updated_movie_count,
                "average_rating": updated_average_rating
            }

        sys.stdout.write("\r%0.2f%%" % (float(idx) / data.shape[0] * 100))
        sys.stdout.flush()


    print "\nJust one more thing..."
    idx = 0
    for _, user in users_features.iteritems():
        user["weighted_movie_features"] = np.round(user["weighted_movie_features"] / float(user["movie_count"]), 2)
        user["average_rating"] = np.round(user["average_rating"] / float(user["movie_count"]), 2)

        sys.stdout.write("\r%0.2f%%" % (float(idx) / len(users_features) * 100))
        sys.stdout.flush()

        idx += 1
        
    return users_features

print "Preparing data for training..."
train_data = prepare_data(ratings_train)

print "\n\nPreparing data for testing..."
test_data = prepare_data(ratings_test)

print "\n\nDone!"

Preparing data for training...
100.00%
Just one more thing...
99.89%

Preparing data for testing...
99.99%
Just one more thing...
99.89%

Done!


In [8]:
# Recommendations for current users

# Example for user 1
distances, indices = knn.kneighbors(np.array([train_data[1]["weighted_movie_features"]]))

print "Recommendations for User 1:"
for index in indices[0]:
    print "\t", items[items["movie id"] == (index + 1)]["movie title"].values[0]  # index is offset by 1!

Recommendations for User 1:
	Good Morning (1971)
	unknown
	War at Home, The (1996)
	Eighth Day, The (1996)
	Next Step, The (1995)
	All Things Fair (1996)
	Bitter Sugar (Azucar Amargo) (1996)
	Silence of the Palace, The (Saimt el Qusur) (1994)
	Sunchaser, The (1996)
	Mamma Roma (1962)
	Sweet Nothing (1995)
	Mat' i syn (1997)
	Etz Hadomim Tafus (Under the Domin Tree) (1994)
	Desert Winds (1995)
	Cérémonie, La (1995)
	Lamerica (1994)
	Brothers in Trouble (1995)
	Further Gesture, A (1996)
	Kika (1993)
	Two Friends (1986) 


### Addressing the cold start issue in User-based NN recommendations

In recommendation systems, the cold start issue happens when a new user with no previous item viewings/ratings is
added to the system.

https://en.wikipedia.org/wiki/Cold_start

#### Adjust the data

In [9]:
# Occupations
# Simply create a map from occupations to integers

def fix_occupations(data):
    occupations = set()
    for _, i in data.iteritems():
        occupations.add(i["data"]["occupation"])

    occupation_map = {occupation: idx for idx, occupation in enumerate(occupations)}

    for _, v in data.iteritems():
        v["data"]["occupation"] = occupation_map[v["data"]["occupation"]]

fix_occupations(train_data)
fix_occupations(test_data)

In [10]:
# Zip Code
#
# The US Zip Codes can be decoded in the following way (http://about.usps.com/news/state-releases/fl/2009/fl_2009_1028a.htm):
#     - the first digit represents a general region of the country (0 is in the East and 9 is in the West)
#     - the second and third digits represent regional areas
#     - the last two represent specific post offices
#
# Since the zip code representation is very small in this dataset, we will only consider the first value as geographical
# information, which is still useful data and should be much more represented.

def fix_zip_code(data):
    for _, v in data.iteritems():
        v["data"]["zip_code"] = int(v["data"]["zip_code"][0])

fix_zip_code(train_data)
fix_zip_code(test_data)

In [11]:
# Sex
# 1 is Female, 2 is Male

def fix_sex(data):
    for _, v in data.iteritems():
        if v["data"]["sex"] == "M":
            v["data"]["sex"] = 2
        else:
            v["data"]["sex"] = 1

fix_sex(train_data)
fix_sex(test_data)

In [12]:
# Age
# Create a set of age groups

def fix_age(data):
    groups = [
        {"min": 0, "max": 15, "code": 0},
        {"min": 15, "max": 25, "code": 1},
        {"min": 25, "max": 35, "code": 2},
        {"min": 35, "max": 30, "code": 3},
        {"min": 50, "max": 150, "code": 4}
    ]

    for _, v in data.iteritems():
        age = v["data"]["age"]
        for group in groups:
            if age >= group["min"] and age < group["max"]:
                v["data"]["age"] = group["code"]

fix_age(train_data)
fix_age(test_data)

#### Use a Random Forest Regressor to fit the data

In [13]:
rfc = RandomForestRegressor()

X = [user["data"].values() for _, user in train_data.iteritems()]
y = [user["weighted_movie_features"] for _, user in train_data.iteritems()]

rfc.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [None]:
X_test = [user["data"].values() for _, user in test_data.iteritems()]
y_test = [user["weighted_movie_features"] for _, user in test_data.iteritems()]

resulpply(X_test)

In [None]:
for x in X:
    for i in x:
        if isinstance(i, basestring):
            print i

In [None]:
float("asd")