# Popularity Based Recommenders

In [3]:
import numpy as np
import pandas as pd

In [4]:
frame = pd.read_csv('rating_final.csv')
cuisine = pd.read_csv('chefmozcuisine.csv')

In [5]:
frame.head()
cuisine.head()

Unnamed: 0,placeID,Rcuisine
0,135110,Spanish
1,135109,Italian
2,135107,Latin_American
3,135106,Mexican
4,135105,Fast_Food


# Recommending based on counts

In [6]:
rating_count = pd.DataFrame(frame.groupby('placeID')['rating'].count())
#sort in descending order of the rating
rating_count.sort_values('rating',ascending=False).head()

Unnamed: 0_level_0,rating
placeID,Unnamed: 1_level_1
135085,36
132825,32
135032,28
135052,25
132834,25


In [7]:
most_rated_places = pd.DataFrame([135085,132825,135032,135052,132834],index=np.arange(5),columns=['placeID'])
summary = pd.merge(most_rated_places,cuisine,on='placeID')
summary

Unnamed: 0,placeID,Rcuisine
0,135085,Fast_Food
1,132825,Mexican
2,135032,Cafeteria
3,135032,Contemporary
4,135052,Bar
5,135052,Bar_Pub_Brewery
6,132834,Mexican


In [8]:
cuisine['Rcuisine'].describe()

count         916
unique         59
top       Mexican
freq          239
Name: Rcuisine, dtype: object

# Making Recomendation based on Correlation 

In [9]:
geodata = pd.read_csv('geoplaces2.csv')
geodata.head()

Unnamed: 0,placeID,latitude,longitude,the_geom_meter,name,address,city,state,country,fax,...,alcohol,smoking_area,dress_code,accessibility,price,url,Rambience,franchise,area,other_services
0,134999,18.915421,-99.184871,0101000020957F000088568DE356715AC138C0A525FC46...,Kiku Cuernavaca,Revolucion,Cuernavaca,Morelos,Mexico,?,...,No_Alcohol_Served,none,informal,no_accessibility,medium,kikucuernavaca.com.mx,familiar,f,closed,none
1,132825,22.147392,-100.983092,0101000020957F00001AD016568C4858C1243261274BA5...,puesto de tacos,esquina santos degollado y leon guzman,s.l.p.,s.l.p.,mexico,?,...,No_Alcohol_Served,none,informal,completely,low,?,familiar,f,open,none
2,135106,22.149709,-100.976093,0101000020957F0000649D6F21634858C119AE9BF528A3...,El Rinc�n de San Francisco,Universidad 169,San Luis Potosi,San Luis Potosi,Mexico,?,...,Wine-Beer,only at bar,informal,partially,medium,?,familiar,f,open,none
3,132667,23.752697,-99.163359,0101000020957F00005D67BCDDED8157C1222A2DC8D84D...,little pizza Emilio Portes Gil,calle emilio portes gil,victoria,tamaulipas,?,?,...,No_Alcohol_Served,none,informal,completely,low,?,familiar,t,closed,none
4,132613,23.752903,-99.165076,0101000020957F00008EBA2D06DC8157C194E03B7B504E...,carnitas_mata,lic. Emilio portes gil,victoria,Tamaulipas,Mexico,?,...,No_Alcohol_Served,permitted,informal,completely,medium,?,familiar,t,closed,none


In [10]:
places = geodata[['placeID','name']]
places.head()

Unnamed: 0,placeID,name
0,134999,Kiku Cuernavaca
1,132825,puesto de tacos
2,135106,El Rinc�n de San Francisco
3,132667,little pizza Emilio Portes Gil
4,132613,carnitas_mata


# Grouping and Ranking Data

In [11]:
rating_mean = pd.DataFrame(frame.groupby('placeID')['rating'].mean())
rating_mean.head()

Unnamed: 0_level_0,rating
placeID,Unnamed: 1_level_1
132560,0.5
132561,0.75
132564,1.25
132572,1.0
132583,1.0


In [12]:
rating_mean['rating_count'] = rating_count['rating']
rating_mean.head()

Unnamed: 0_level_0,rating,rating_count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
132560,0.5,4
132561,0.75,4
132564,1.25,4
132572,1.0,15
132583,1.0,4


In [13]:
rating_mean.describe()

Unnamed: 0,rating,rating_count
count,130.0,130.0
mean,1.179622,8.930769
std,0.349354,6.124279
min,0.25,3.0
25%,1.0,5.0
50%,1.181818,7.0
75%,1.4,11.0
max,2.0,36.0


In [15]:
rating_mean.sort_values('rating_count',ascending=False).head()

Unnamed: 0_level_0,rating,rating_count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
135085,1.333333,36
132825,1.28125,32
135032,1.178571,28
135052,1.28,25
132834,1.0,25


In [16]:
places[places['placeID']==135085]

Unnamed: 0,placeID,name
121,135085,Tortas Locas Hipocampo


In [17]:
cuisine[cuisine['placeID']==135085]

Unnamed: 0,placeID,Rcuisine
44,135085,Fast_Food


# Preparing data Analysis

In [18]:
places_crosstab = pd.pivot_table(data=frame,values='rating',index='userID',columns='placeID')

In [19]:
places_crosstab.head()

placeID,132560,132561,132564,132572,132583,132584,132594,132608,132609,132613,...,135080,135081,135082,135085,135086,135088,135104,135106,135108,135109
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U1001,,,,,,,,,,,...,,,,0.0,,,,,,
U1002,,,,,,,,,,,...,,,,1.0,,,,1.0,,
U1003,,,,,,,,,,,...,2.0,,,,,,,,,
U1004,,,,,,,,,,,...,,,,,,,,2.0,,
U1005,,,,,,,,,,,...,,,,,,,,,,


In [20]:
tortas_rating = places_crosstab[135085]
tortas_rating[tortas_rating>=0]

userID
U1001    0.0
U1002    1.0
U1007    1.0
U1013    1.0
U1016    2.0
U1027    1.0
U1029    1.0
U1032    1.0
U1033    2.0
U1036    2.0
U1045    2.0
U1046    1.0
U1049    0.0
U1056    2.0
U1059    2.0
U1062    0.0
U1077    2.0
U1081    1.0
U1084    2.0
U1086    2.0
U1089    1.0
U1090    2.0
U1092    0.0
U1098    1.0
U1104    2.0
U1106    2.0
U1108    1.0
U1109    2.0
U1113    1.0
U1116    2.0
U1120    0.0
U1122    2.0
U1132    2.0
U1134    2.0
U1135    0.0
U1137    2.0
Name: 135085, dtype: float64

# Evaluating Similarity based on Correlation

In [24]:
similar_to_Tortas = places_crosstab.corrwith(tortas_rating)
corr_Tortas = pd.DataFrame(similar_to_Tortas,columns=['PearsonR'])
corr_Tortas.dropna(inplace=True)
corr_Tortas.head()

Unnamed: 0_level_0,PearsonR
placeID,Unnamed: 1_level_1
132572,-0.428571
132723,0.301511
132754,0.930261
132825,0.700745
132834,0.814823


In [30]:
corr_Tortas_summary = corr_Tortas.join(rating_mean['rating_count'])
corr_Tortas_summary.head()

Unnamed: 0_level_0,PearsonR,rating_count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
132572,-0.428571,15
132723,0.301511,12
132754,0.930261,13
132825,0.700745,32
132834,0.814823,25


In [32]:
#filtering to show places with a minimun of 10 review count
corr_Tortas_summary[corr_Tortas_summary['rating_count']>=10].sort_values('PearsonR',ascending=False).head(10)

Unnamed: 0_level_0,PearsonR,rating_count
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1
135076,1.0,13
135085,1.0,36
135066,1.0,12
132754,0.930261,13
135045,0.912871,13
135062,0.898933,21
135028,0.892218,15
135042,0.881409,20
135046,0.867722,11
132872,0.840168,12


In [37]:
#places need to have than 1 reviewer in common
places_corr_Tortas = pd.DataFrame([135085,132754,135045,135062,135028,135042,135046],index=np.arange(7),columns=['placeID'])
summary = pd.merge(places_corr_Tortas,cuisine,on='placeID')
summary

Unnamed: 0,placeID,Rcuisine
0,135085,Fast_Food
1,132754,Mexican
2,135028,Mexican
3,135042,Chinese
4,135046,Fast_Food


In [38]:
places[places['placeID']==135046]

Unnamed: 0,placeID,name
42,135046,Restaurante El Reyecito


# Classification Based Collaborative Filtering Systems (User based)

## Logistic Regression as a classifier

In [16]:
from pandas import Series,DataFrame
from sklearn.linear_model import LogisticRegression


In [17]:
bank_full = pd.read_csv('bank_full_w_dummy_vars.csv')
bank_full.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,job_unknown,job_retired,job_services,job_self_employed,job_unemployed,job_maid,job_student,married,single,divorced
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,...,0,0,0,0,0,0,0,1,0,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,...,0,0,0,0,0,0,0,0,1,1
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,...,0,0,0,0,0,0,0,1,0,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,...,0,0,0,0,0,0,0,1,0,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,...,1,0,0,0,0,0,0,0,1,1


In [18]:
bank_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 37 columns):
age                             45211 non-null int64
job                             45211 non-null object
marital                         45211 non-null object
education                       45211 non-null object
default                         45211 non-null object
balance                         45211 non-null int64
housing                         45211 non-null object
loan                            45211 non-null object
contact                         45211 non-null object
day                             45211 non-null int64
month                           45211 non-null object
duration                        45211 non-null int64
campaign                        45211 non-null int64
pdays                           45211 non-null int64
previous                        45211 non-null int64
poutcome                        45211 non-null object
y                               45

In [23]:
X = bank_full.ix[:,(18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36)].values
y = bank_full.ix[:,17].values

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app


In [24]:
LogReg = LogisticRegression()
LogReg.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
new_user = np.array([0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1]).reshape(1,-1)
y_pred = LogReg.predict(new_user)
y_pred

array([0])

# Model Based Collaborative Filtering

## SVD Matrix Decomposition

In [4]:
import sklearn
from sklearn.decomposition import TruncatedSVD 

In [7]:
columns = ['user_id','item_id','rating','timestamp']
frame = pd.read_csv("ml-100k/u.data",sep='\t',names=columns)
frame.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [15]:
columns = ['item_id','movie title','release date','','video release date','IMDb URL','unknown','Action','Adventure','Animation','Childrens','Comedy','Crime','Documentary'
          'Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
movies = pd.read_csv("ml-100k/u.item",sep='|',names=columns,encoding='latin-1')
movies.head()

Unnamed: 0,item_id,movie title,release date,Unnamed: 4,video release date,IMDb URL,unknown,Action,Adventure,Animation,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [29]:
movie_names = pd.DataFrame(movies['movie title'],movies['item_id'])
movie_names.head()

Unnamed: 0_level_0,movie title
item_id,Unnamed: 1_level_1
1,GoldenEye (1995)
2,Four Rooms (1995)
3,Get Shorty (1995)
4,Copycat (1995)
5,Shanghai Triad (Yao a yao yao dao waipo qiao) ...


In [30]:
combined_movies_data = pd.merge(frame,movie_names,on='item_id')
combined_movies_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie title
0,196,242,3,881250949,Jungle2Jungle (1997)
1,63,242,3,875747190,Jungle2Jungle (1997)
2,226,242,5,883888671,Jungle2Jungle (1997)
3,154,242,3,879138235,Jungle2Jungle (1997)
4,306,242,5,876503793,Jungle2Jungle (1997)


In [32]:
combined_movies_data.groupby('item_id')['rating'].count().sort_values(ascending=False).head()

item_id
50     583
258    509
100    508
181    507
294    485
Name: rating, dtype: int64

In [35]:
Filter = combined_movies_data['item_id']==50-1
combined_movies_data[Filter]['movie title'].unique()

array([u'Star Wars (1977)'], dtype=object)

## Building utility matrix

In [38]:
rating_crosstab = combined_movies_data.pivot_table(values='rating',index='user_id',columns='movie title',fill_value=0)
rating_crosstab.head()

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,5,5,0,0,1,4,0,0,...,0,0,0,5,1,0,0,0,1,0
2,0,0,0,0,0,0,0,0,5,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0


## Transposing the matrix

In [40]:
rating_crosstab.shape

(943, 1663)

In [41]:
X = rating_crosstab.values.T
X.shape

(1663, 943)

## Decomposing the matrix

In [42]:
SVD = TruncatedSVD(n_components=12,random_state=17)
resultant_matrix = SVD.fit_transform(X)
resultant_matrix.shape

(1663, 12)

## Generating a Correlation matrix

In [44]:
corr_matrix = np.corrcoef(resultant_matrix)
corr_matrix.shape

(1663, 1663)

## Isolating Star Wars from the Corr_Matrix

In [45]:
movie_names = rating_crosstab.columns
movie_list = list(movie_names)

starwars = movie_list.index('Star Wars (1977)')
print(starwars)

1398


In [46]:
corr_starwars = corr_matrix[starwars]
print(corr_starwars)

[0.17919664 0.17510535 0.37620326 ... 0.43621337 0.22772043 0.31941683]


## Recommending highly correlated movie

In [51]:
list(movie_names[(corr_starwars < 1.0) & (corr_starwars > 0.94)])

[u'Ace Ventura: Pet Detective (1994)',
 u'Beyond Bedlam (1993)',
 u'Blade Runner (1982)',
 u'Boys on the Side (1995)',
 u'Charade (1963)',
 u'Dave (1993)',
 u'Jade (1995)',
 u'Jane Eyre (1996)',
 u'Jaws 2 (1978)',
 u'Just Cause (1995)',
 u'Little Odessa (1994)',
 u'Maverick (1994)',
 u'Nell (1994)',
 u'Radioland Murders (1994)',
 u'Richie Rich (1994)',
 u'Timecop (1994)']

In [53]:
list(movie_names[(corr_starwars < 1.0) & (corr_starwars > 0.98)])

[u'Little Odessa (1994)']

# Content Based Recommender

## Nearest Neighbour Algorithm

In [8]:
from sklearn.neighbors import NearestNeighbors

In [4]:
cars = pd.read_csv('mtcars.csv')
cars.head()

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [5]:
cars.columns = ['car_names','mpg','cyl','disp','hp','dart','wt','qsec','vs','am','gear','carb']
cars.head()

Unnamed: 0,car_names,mpg,cyl,disp,hp,dart,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [6]:
# test set
t = [15,300,160,3.2]
X = cars.ix[:,(1,3,4,6)].values
X[0:5]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  app.launch_new_instance()


array([[ 21.   , 160.   , 110.   ,   2.62 ],
       [ 21.   , 160.   , 110.   ,   2.875],
       [ 22.8  , 108.   ,  93.   ,   2.32 ],
       [ 21.4  , 258.   , 110.   ,   3.215],
       [ 18.7  , 360.   , 175.   ,   3.44 ]])

In [9]:
nbrs = NearestNeighbors(n_neighbors=1).fit(X)

In [10]:
print(nbrs.kneighbors([t]))

(array([[10.77474942]]), array([[22]]))


In [14]:
cars[22:]

Unnamed: 0,car_names,mpg,cyl,disp,hp,dart,wt,qsec,vs,am,gear,carb
22,AMC Javelin,15.2,8,304.0,150,3.15,3.435,17.3,0,0,3,2
23,Camaro Z28,13.3,8,350.0,245,3.73,3.84,15.41,0,0,3,4
24,Pontiac Firebird,19.2,8,400.0,175,3.08,3.845,17.05,0,0,3,2
25,Fiat X1-9,27.3,4,79.0,66,4.08,1.935,18.9,1,1,4,1
26,Porsche 914-2,26.0,4,120.3,91,4.43,2.14,16.7,0,1,5,2
27,Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2
28,Ford Pantera L,15.8,8,351.0,264,4.22,3.17,14.5,0,1,5,4
29,Ferrari Dino,19.7,6,145.0,175,3.62,2.77,15.5,0,1,5,6
30,Maserati Bora,15.0,8,301.0,335,3.54,3.57,14.6,0,1,5,8
31,Volvo 142E,21.4,4,121.0,109,4.11,2.78,18.6,1,1,4,2


# Evaluating Recommender Systems

In [15]:
from sklearn.metrics import classification_report

In [26]:
X = bank_full.ix[:,(18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36)].values
y = bank_full.ix[:,17].values

LogReg.fit(X,y)
y_pred = LogReg.predict(X)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app


In [27]:
print(classification_report(y,y_pred))

             precision    recall  f1-score   support

          0       0.90      0.99      0.94     39922
          1       0.67      0.17      0.27      5289

avg / total       0.87      0.89      0.86     45211



In [28]:
# of the entire dataset, 87% of the products recommender were products that the user actually liked.
# 89% of the user's preferred products were recommended