# Airbnb Housing Reviews - User-based Recommender

#### Group members: Kuangyi Zhang, Lanyixuan Xu, Jie Bao

## 1. Data Preprocessing

In [242]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string

In [243]:
# Load the raw review table; the file is comma-separated, so read_csv is the direct reader.
housing_reviews = pd.read_csv("reviews.csv", sep=",", header='infer', na_values=['NaN'])

In [244]:
housing_reviews.shape

(132353, 7)

In [245]:
housing_reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,rating,comments
0,1301530,13937403,6/8/14,55,Deborah,93,Great location downtown near everything! Host ...
1,192652,708419,11/10/11,99,Laura,97,"Stephanie was awesome! Highly recommended, gre..."
2,14081954,146698448,4/23/17,464,Jamison,100,The place was so cool. The whole family loved ...
3,9452418,75922280,5/23/16,706,Eamon,100,Anne is a very thoughtful host who left all th...
4,3385966,108126286,10/14/16,1143,Katie,97,"Chad's place is awesome! It's a quiet, clean, ..."


In [246]:
housing_reviews.describe(include="all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
listing_id,132353,,,,7321670.0,5086610.0,2384.0,2730610.0,6884330.0,11973100.0,18598100.0
id,132353,,,,82627500.0,41766600.0,849.0,48553200.0,86485500.0,114583000.0,151007000.0
date,132353,2267.0,3/19/17,780.0,,,,,,,
reviewer_id,132353,,,,39642600.0,32804100.0,55.0,11907900.0,31225200.0,61442300.0,129263000.0
reviewer_name,132353,22527.0,Michael,1333.0,,,,,,,
rating,132353,,,,94.834,4.5955,0.0,93.0,96.0,98.0,100.0
comments,132206,130167.0,The host canceled this reservation the day bef...,211.0,,,,,,,


### Drop zero ratings and missing values

In [247]:
# Index labels of the reviews whose rating is exactly 0; the following cell removes
# these rows. (The previous bare `.tolist()` call discarded its result and was dropped.)
drop_rating_index = housing_reviews.index[housing_reviews['rating'] == 0]
len(drop_rating_index)

43

In [248]:
# Remove the zero-rated reviews with a boolean mask instead of
# `index[drop_rating_index]`, which re-interpreted index *labels* as *positions*
# and was only correct on an untouched 0..n-1 index. The mask is also idempotent:
# re-running the cell cannot drop unrelated rows. NaN ratings compare != 0 and are
# kept here, exactly as before.
housing_reviews = housing_reviews[housing_reviews['rating'] != 0]
housing_reviews.shape

(132310, 7)

In [249]:
# Keep only reviews that have both a comment and a rating.
housing_reviews = housing_reviews.dropna(subset=['comments', 'rating'])
housing_reviews.shape

(132163, 7)

In [250]:
# Project the review table onto the three columns the recommender needs.
recommend = housing_reviews.loc[:, ['listing_id', 'reviewer_id', 'rating']]
recommend

Unnamed: 0,listing_id,reviewer_id,rating
0,1301530,55,93
1,192652,99,97
2,14081954,464,100
3,9452418,706,100
4,3385966,1143,97
5,12153109,1169,92
6,12891430,1744,91
7,6150893,1956,98
8,1127661,2038,98
9,4667333,3240,90


In [251]:
# Reviewers with more than 3 reviews (i.e. at least 4) are kept for the recommender.
review_counts = recommend['reviewer_id'].value_counts()
upper_2 = review_counts > 3
users = review_counts[upper_2].index.tolist()
print(len(users))

582


In [252]:
# Restrict the ratings to the reviewers selected above.
keep_rows = recommend['reviewer_id'].isin(users)
recommend = recommend.loc[keep_rows]
recommend.shape

(2999, 3)

In [253]:
recommend.groupby('reviewer_id')['reviewer_id'].count()

reviewer_id
50034         6
76903         4
104815        6
140995        5
183420        5
196524        5
210853        7
218987        4
228969        9
266837        4
272175        4
272671        4
325060        4
339072        5
344321        4
361347        4
376244        4
387036        4
422798        4
469552        4
485037        7
500760        5
546558        5
555266        7
593406        5
646754        6
700681        4
704642        9
757757        4
794216        4
             ..
97091849      4
97705358      4
98742409     35
100121030     4
100205425     5
101135232     4
101280146     5
101754816     4
103256617     4
103898963    11
105577955     4
105758734     4
106419727     4
106845239     6
106896964     4
108177511     4
109058956     5
109249300     7
109484542     7
109576079     4
110353664     5
111269521     4
111963179     6
112170044     4
113353272     4
114269563     4
115765194     4
116588584     5
118859868     4
122903399     8
Name: review

In [254]:
recommend.describe(include="all").T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
listing_id,2999.0,7436787.0,5354434.0,2384.0,2503128.0,6700654.0,12590517.0,18139782.0
reviewer_id,2999.0,30455110.0,30173120.0,50034.0,6511426.0,21404852.0,44387734.0,122903399.0
rating,2999.0,95.32177,4.084109,58.0,94.0,96.0,98.0,100.0


In [255]:
np.unique(recommend["listing_id"]).shape

(1311,)

In [256]:
np.unique(recommend["reviewer_id"]).shape

(582,)

In [257]:
recommend.shape

(2999, 3)

In [258]:
# Distinct listings / reviewers in order of first appearance; their positions become
# the column / row indices of the rating matrix built below.
list_id = recommend['listing_id'].unique()
review_id = recommend['reviewer_id'].unique()

In [259]:
print len(list_id), len(review_id)

1311 582


In [260]:
# Nested lookup {reviewer_id: {listing_id: rating}}.
# NOTE: the name `dict` shadows the builtin; it is kept because later cells
# index this mapping under that name.
dict = {}
for user in review_id:
    dict[user] = {}
# Walk the three columns once instead of doing three `.iloc[i]` row lookups per
# iteration. A repeated (reviewer, listing) pair overwrites the earlier rating,
# exactly as the original loop did.
for reviewer, listing, rating in zip(recommend['reviewer_id'].values,
                                     recommend['listing_id'].values,
                                     recommend['rating'].values):
    dict[reviewer][listing] = rating

In [261]:
dict[50034]

{3984383: 99, 8824905: 95, 13857415: 94, 13858506: 98, 16146505: 99}

In [262]:
recommend[recommend['reviewer_id'] == 50034]

Unnamed: 0,listing_id,reviewer_id,rating
307,3984383,50034,99
308,3984383,50034,99
309,8824905,50034,95
310,13857415,50034,94
311,13858506,50034,98
312,16146505,50034,99


In [263]:
# Dense reviewer x listing rating table: one row per entry of `review_id`, one column
# per entry of `list_id`, with 0 standing for "no rating".
# `.get(listing, 0)` replaces the `in dict[user].keys()` membership test (which builds
# a list on every lookup under Python 2), and the loop variable no longer shadows the
# builtin `list`.
# NOTE: the name `recommend_list` is rebound to a function in a later cell.
recommend_list = [
    [dict[user].get(listing, 0) for listing in list_id]
    for user in review_id
]

In [264]:
dataMat =np.array(recommend_list)
dataMat

array([[ 99,  95,  94, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ..., 100,  98,  95]])

## 2. Mean Absolute Error (MAE)

In [265]:
from numpy import *
from numpy import linalg as la
import numpy as np

In [266]:
# Singular value decomposition of the rating matrix; the squared singular values
# are summed to give the total "energy" of the matrix.
U, Sigma, VT = np.linalg.svd(np.asmatrix(dataMat))
Sig2 = np.square(Sigma)
np.sum(Sig2)

20834152.0

In [268]:
sum(Sig2[: 400])

19058481.782935306

In [269]:
def ecludSim(inA, inB):
    """Euclidean-distance similarity: 1 for identical vectors, tending to 0 as they diverge."""
    distance = la.norm(inA - inB)
    return 1.0 / (distance + 1.0)

def pearsSim(inA, inB):
    """Pearson-correlation similarity rescaled from [-1, 1] to [0, 1].

    Returns 1.0 when fewer than three observations are supplied (unchanged from the
    original). When the correlation is undefined because one of the inputs has zero
    variance, returns the neutral value 0.5 instead of NaN, so one constant vector
    cannot poison downstream sums or rankings.
    """
    if len(inA) < 3:
        return 1.0
    # A constant vector makes corrcoef divide by zero; silence the warning and
    # handle the resulting NaN explicitly below.
    with np.errstate(divide='ignore', invalid='ignore'):
        corr = np.corrcoef(inA, inB, rowvar=0)[0][1]
    if np.isnan(corr):
        return 0.5
    return 0.5 + 0.5 * corr

def cosSim(inA, inB):
    """Cosine similarity rescaled from [-1, 1] to [0, 1].

    Accepts 1-D arrays as well as matrix row/column vectors: both inputs are
    flattened first. (The original `float(inA.T * inB)` only worked for matrix
    column vectors.) If either vector has zero norm the cosine is undefined and
    the neutral value 0.5 is returned instead of dividing by zero.
    """
    a = np.asarray(inA, dtype=float).ravel()
    b = np.asarray(inB, dtype=float).ravel()
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.5
    num = float(np.dot(a, b))
    return 0.5 + 0.5 * (num / denom)

def standEst(dataMat, user, simMeas, item):
    """Estimate `user`'s rating of `item` by item-based collaborative filtering.

    For every other item j that `user` has rated (non-zero entry), the similarity
    between column `item` and column j is measured over the rows where both columns
    are non-zero. The estimate is the similarity-weighted mean of the user's ratings.

    Parameters
    ----------
    dataMat : 2-D array indexed as [row, column]; 0 means "not rated".
    user    : row index of the target user.
    simMeas : callable(vecA, vecB) -> similarity score.
    item    : column index of the item to estimate.

    Returns
    -------
    The weighted-average rating, or 0 when the total similarity is 0.
    """
    n = np.shape(dataMat)[1]
    simTotal = 0.0
    ratSimTotal = 0.0
    # Loop-invariant: which rows rated the target item.
    ratedTarget = dataMat[:, item] > 0
    for j in range(n):
        userRating = dataMat[user, j]
        # Skip unrated items and the target item itself (as svdEst does), so an
        # existing rating of `item` cannot vote for its own estimate.
        if userRating == 0 or j == item:
            continue
        overLap = np.nonzero(np.logical_and(ratedTarget, dataMat[:, j] > 0))[0]
        if len(overLap) == 0:
            similarity = 0
        else:
            similarity = simMeas(dataMat[overLap, item], dataMat[overLap, j])
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    return ratSimTotal / simTotal
    
def svdEst(dataMat, user, simMeas, item, k=400):
    """Estimate `user`'s rating of `item` using similarities in a reduced SVD space.

    The items (columns) are projected onto the leading `k` singular directions and
    `simMeas` is applied to the projected item vectors. The estimate is the
    similarity-weighted mean of the user's ratings of the other items.

    Parameters
    ----------
    dataMat : 2-D array or matrix indexed as [row, column]; 0 means "not rated".
    user    : row index of the target user.
    simMeas : callable(colVecA, colVecB) -> similarity score.
    item    : column index of the item to estimate.
    k       : number of singular directions to keep (default 400, the value that was
              previously hard-coded). It is clamped to the numerical rank of the
              matrix, so small or rank-deficient inputs no longer fail.

    Returns
    -------
    The weighted-average rating, or 0 when the total similarity is 0.

    Note: the decomposition is recomputed on every call.
    """
    data = np.asmatrix(dataMat)
    n = data.shape[1]
    simTotal = 0.0
    ratSimTotal = 0.0
    # The thin SVD is enough: only the leading columns of U are used.
    U, Sigma, _ = np.linalg.svd(data, full_matrices=False)
    # Keep strictly positive singular values only, so dividing by them is safe.
    if Sigma.size:
        tol = Sigma[0] * max(data.shape) * np.finfo(float).eps
        rank = int(np.sum(Sigma > tol))
    else:
        rank = 0
    k = max(0, min(int(k), rank))
    # Equivalent to data.T * U[:, :k] * inv(diag(Sigma[:k])) without forming
    # and inverting the diagonal matrix.
    xformedItems = (data.T * np.asmatrix(U)[:, :k]) / Sigma[:k]
    for j in range(n):
        userRating = data[user, j]
        if userRating == 0 or j == item:
            continue
        similarity = simMeas(xformedItems[item, :].T, xformedItems[j, :].T)
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0:
        return 0
    return ratSimTotal / simTotal

def cross_validate_user(dataMat, user, test_ratio, estMethod=standEst, simMeas=ecludSim):
    """Hold out a share of one user's ratings and measure the estimation error.

    A fraction `test_ratio` of the items rated by `user` is temporarily zeroed in
    `dataMat`, each withheld rating is re-estimated with `estMethod`, and the
    original ratings are then written back.

    Parameters
    ----------
    dataMat    : 2-D rating array, modified in place and restored before returning.
    user       : row index of the user to evaluate.
    test_ratio : fraction of the user's rated items to withhold.
    estMethod  : callable(dataMat, user, simMeas, item) -> estimated rating.
    simMeas    : similarity function forwarded to `estMethod`.

    Returns
    -------
    (error_u, count_u): the sum of absolute errors and the number of test items.
    Accumulate these over all users to compute the MAE.
    """
    rated_items_by_user = np.flatnonzero(np.asarray(dataMat[user]).ravel() > 0)

    test_size = min(int(test_ratio * len(rated_items_by_user)), len(rated_items_by_user))
    if test_size <= 0:
        # Nothing to withhold for this user.
        return 0.0, 0

    # Sample WITHOUT replacement: randint could pick the same item more than once,
    # which double-counted its error in the returned totals.
    withheld_items = np.random.choice(rated_items_by_user, size=test_size, replace=False)
    original_user_profile = np.array(dataMat[user]).ravel()  # independent copy

    error_u = 0.0
    count_u = len(withheld_items)

    # Hide the withheld ratings so they cannot be used by the estimator.
    dataMat[user, withheld_items] = 0
    try:
        for item in withheld_items:
            estimatedScore = estMethod(dataMat, user, simMeas, item)
            error_u = error_u + abs(estimatedScore - original_user_profile[item])
    finally:
        # Always restore the profile, even if the estimator raises.
        for item in withheld_items:
            dataMat[user, item] = original_user_profile[item]

    return error_u, count_u

In [270]:
def testMAE(dataMat, test_ratio, Method):
    total_error = 0
    total_count = 0
    for i in range(np.shape(dataMat)[0]):
        if Method == "standEst":
            error_u, count_u = cross_validate_user(dataMat, i, test_ratio, estMethod=standEst)
        elif Method == "svdEst":
            error_u, count_u = cross_validate_user(dataMat, i, test_ratio, estMethod=svdEst)
        total_error += error_u
        total_count += count_u
        #print error_u, count_u
    #print total_error, total_count
    print 'Mean Absoloute Error for', Method, ':', total_error/total_count

In [271]:
testMAE(dataMat, 0.2, "svdEst")

Mean Absoloute Error for svdEst : 3.6594461785678183


## 3. User-based Collaborative Filtering

In [357]:
import operator
def most_similar_users_test(dataMat, userid, queryUser, k, metric=pearsSim):
    """Rank all other users by similarity to `queryUser` and print the top `k`.

    Parameters
    ----------
    dataMat   : 2-D rating array, one row per user.
    userid    : sequence mapping row index -> user id (used for printing).
    queryUser : row index of the user to compare against.
    k         : number of neighbours to print.
    metric    : callable(rowA, rowB) -> similarity score.

    Returns
    -------
    List of (row_index, similarity) for every other user, most similar first.

    The query user is excluded by row index. The previous `similarity == 1` test
    relied on exact float equality, so it could keep the user's own row and it
    dropped genuine neighbours with identical profiles. NaN similarities are
    skipped because they cannot be ordered.
    """
    user = dataMat[queryUser]
    query_index = range(len(dataMat))[queryUser]  # normalises a negative index
    sim = {}
    for index, profile in enumerate(dataMat):
        if index == query_index:
            continue
        similarity = metric(profile, user)
        if similarity != similarity:  # NaN
            continue
        sim[index] = similarity
    sorted_keys = sorted(sim.items(), key=operator.itemgetter(1), reverse=True)
    # Single-argument prints: same output under Python 2 and Python 3.
    print('Selected User:')
    print('%s \n' % (userid[queryUser],))
    print('Top %s Most similar users are :' % (k,))
    for shown, (key, values) in enumerate(sorted_keys):
        if shown == k:
            break
        print('%s \n' % (userid[key],))
        print('----------------------')
    return sorted_keys

In [307]:
print dataMat[0]

[99 95 94 ...  0  0  0]


In [358]:
similar_users = most_similar_users_test(dataMat, review_id, 4, 3)

Selected User:
183420 

Top 3 Most similar users are :
1424369 

----------------------
272671 

----------------------
76903 

----------------------


In [335]:
similar_users

[(45, 0.7275205321051251),
 (11, 0.6375155600616536),
 (1, 0.6212422194931204),
 (366, 0.6198533563922091),
 (119, 0.6099526718110662),
 (524, 0.6070805793454697),
 (62, 0.6025077435365275),
 (3, 0.5951111412947333),
 (37, 0.5848065052864915),
 (294, 0.5744633182021677),
 (66, 0.49914734810260414),
 (455, 0.49914734810260414),
 (48, 0.4991473481026041),
 (50, 0.4991473481026041),
 (52, 0.4991473481026041),
 (53, 0.4991473481026041),
 (88, 0.4991473481026041),
 (94, 0.4991473481026041),
 (107, 0.4991473481026041),
 (121, 0.4991473481026041),
 (122, 0.4991473481026041),
 (128, 0.4991473481026041),
 (180, 0.4991473481026041),
 (188, 0.4991473481026041),
 (197, 0.4991473481026041),
 (229, 0.4991473481026041),
 (252, 0.4991473481026041),
 (332, 0.4991473481026041),
 (349, 0.4991473481026041),
 (409, 0.4991473481026041),
 (410, 0.4991473481026041),
 (434, 0.4991473481026041),
 (469, 0.4991473481026041),
 (471, 0.4991473481026041),
 (487, 0.4991473481026041),
 (528, 0.4991473481026041),
 (560

In [336]:
# Column indices of the listings that user 0 has not rated.
nopreference_list = np.flatnonzero(dataMat[0] == 0)
nopreference_list

array([   5,    6,    7, ..., 1308, 1309, 1310])

In [348]:
def predict_test(dataMat, user_id, user_index, list_index, similar_users, K):
    """Verbose prediction of one user's rating for one listing.

    Walks `similar_users` (pairs of (row_index, similarity), best first), takes the
    first `K` neighbours that have a non-zero rating for `list_index`, and returns
    the similarity-weighted mean of their ratings, printing each neighbour used.

    Parameters
    ----------
    dataMat       : 2-D rating array; 0 means "not rated".
    user_id       : sequence mapping row index -> user id (used for printing).
    user_index    : row index of the target user (used for printing only).
    list_index    : column index of the listing to predict.
    similar_users : iterable of (row_index, similarity).
    K             : maximum number of neighbours to use.

    Returns
    -------
    The predicted rating, or 0.0 when `similar_users` is empty or when no usable
    neighbour rated the listing (previously a ZeroDivisionError).
    """
    if len(similar_users) == 0:
        return 0.0
    # Single-argument prints: same output under Python 2 and Python 3.
    print('Target User: %s' % (user_id[user_index],))
    numerator = 0.0
    denominator = 0.0
    used = 0
    print('most_similar_users:')
    for key, values in similar_users:
        if used == K:
            break
        rating = dataMat[key][list_index]
        if rating != 0:
            neighbor_id = user_id[key]
            neighbor_similarity = values
            print('%s %s %s' % (neighbor_id, neighbor_similarity, rating))
            numerator += neighbor_similarity * rating
            denominator += neighbor_similarity
            used += 1
    if denominator == 0:
        result = 0.0
    else:
        result = numerator / denominator
    print('Predict Score: %s' % (result,))
    return result

In [349]:
#test
predict_test(dataMat, review_id, 4, 5, similar_users, 3)

Target User: 183420
most_similar_users:
76903 0.6212422194931204 86
21500802 0.5744633182021677 86
Predict Score: 86.0


86.0

In [363]:
import operator
def most_similar_users(dataMat, userid, queryUser, k, metric=pearsSim):
    """Rank all other users by similarity to `queryUser`.

    Parameters
    ----------
    dataMat   : 2-D rating array, one row per user.
    userid    : unused; kept for signature compatibility.
    queryUser : row index of the user to compare against.
    k         : unused; the full ranking is returned and callers truncate it.
    metric    : callable(rowA, rowB) -> similarity score.

    Returns
    -------
    List of (row_index, similarity) for every other user, most similar first.

    The query user is excluded by row index. The previous `similarity == 1` test
    relied on exact float equality, so it could keep the user's own row and it
    dropped genuine neighbours with identical profiles. NaN similarities are
    skipped because they cannot be ordered. The trailing counting loop had no
    effect and was removed.
    """
    user = dataMat[queryUser]
    query_index = range(len(dataMat))[queryUser]  # normalises a negative index
    sim = {}
    for index, profile in enumerate(dataMat):
        if index == query_index:
            continue
        similarity = metric(profile, user)
        if similarity != similarity:  # NaN
            continue
        sim[index] = similarity
    sorted_keys = sorted(sim.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_keys

In [364]:
def predict(dataMat, user_id, user_index, list_index, similar_users, K):
    """Predict one user's rating for one listing from their most similar users.

    Walks `similar_users` (pairs of (row_index, similarity), best first), takes the
    first `K` neighbours that have a non-zero rating for `list_index`, and returns
    the similarity-weighted mean of their ratings.

    Parameters
    ----------
    dataMat       : 2-D rating array; 0 means "not rated".
    user_id       : unused; kept for signature compatibility.
    user_index    : unused; kept for signature compatibility.
    list_index    : column index of the listing to predict.
    similar_users : iterable of (row_index, similarity).
    K             : maximum number of neighbours to use.

    Returns
    -------
    The predicted rating, or 0.0 when `similar_users` is empty or when no usable
    neighbour rated the listing (previously a ZeroDivisionError).
    """
    if len(similar_users) == 0:
        return 0.0
    numerator = 0.0
    denominator = 0.0
    used = 0
    for key, values in similar_users:
        if used == K:
            break
        rating = dataMat[key][list_index]
        if rating != 0:
            numerator += values * rating
            denominator += values
            used += 1
    if denominator == 0:
        return 0.0
    return numerator / denominator

In [370]:
def recommend_list(dataMat, user_id, list_id, queryUser, K):
    nopreference_list = np.where(dataMat[queryUser,:]==0)[0]
    predict_rating = {}
    sorted_most_similar_users = most_similar_users(dataMat, user_id, queryUser, K, metric=pearsSim)
    for item in nopreference_list:
        result = predict(dataMat, user_id, queryUser, item, sorted_most_similar_users, K)
        predict_rating[item] = result
    sorted_list = sorted(predict_rating.items(), key=operator.itemgetter(1), reverse=True)
    print 'Selected User:'
    print user_id[queryUser], '\n'
    print 'The listing we recommend to this user: ','\n'
    index = 0
    for key, values in sorted_list:
        if(index == K):
            break
        print list_id[key], '\n'
        print '----------------------'
        index += 1

In [371]:
recommend_list(dataMat, review_id, list_id, 4, 3)

Selected User:
183420 

The listing we recommend to this user:  

4785815 

----------------------
1725110 

----------------------
14167586 

----------------------


(array([4]),)