# Collaborative Filtering

#### For this model there are to ways to approach it either by
    - by attributes
    - by user
    
    combined the two will lead to a better travel recommendation.

With the data provided I believe it would be best to combine the two.

### Here are the steps I am taking to creating the model
    - cleaning the data
    - finding similar users
    - creating a relationship Matrix
    - recommending location

In [119]:
import pandas as pd
import numpy as np

#### Reading the original dataset

In [120]:
original_file = "../assets/dataset/original_google_review_ratings.csv"

In [121]:
original_file = pd.read_csv(original_file)

### Converting the file to a dataframe for manipulation and cleaning

In [122]:
original_df = pd.DataFrame(original_file)

In [123]:
original_df.columns

Index(['User', 'Category 1', 'Category 2', 'Category 3', 'Category 4',
       'Category 5', 'Category 6', 'Category 7', 'Category 8', 'Category 9',
       'Category 10', 'Category 11', 'Category 12', 'Category 13',
       'Category 14', 'Category 15', 'Category 16', 'Category 17',
       'Category 18', 'Category 19', 'Category 20', 'Category 21',
       'Category 22', 'Category 23', 'Category 24', 'Unnamed: 25'],
      dtype='object')

### Cleaning the data file.

#### We need to add the column names to the data file
#### the columns was provided seperately so we need to merge the two and re-index the whole data.

In [125]:
COLUMN_NAMES = ['user_id', 'churches', 'resorts', 'beaches',
                'parks', 'theatres', 'museums', 'malls', 'zoo',
                'restaurants', 'pubs_bars', 'local_services',
                'burger_pizza_shops', 'hotels_other_lodgings',
                'juice_bars', 'art_galleries', 'dance_clubs',
                 'swimming_pools', 'gyms', 'bakeries', 'beauty_spas',
                'cafes', 'view_points', 'monuments', 'gardens', 'Unnamed: 25']

In [126]:
original_df.columns = COLUMN_NAMES

In [127]:
original_df.columns

Index(['user_id', 'churches', 'resorts', 'beaches', 'parks', 'theatres',
       'museums', 'malls', 'zoo', 'restaurants', 'pubs_bars', 'local_services',
       'burger_pizza_shops', 'hotels_other_lodgings', 'juice_bars',
       'art_galleries', 'dance_clubs', 'swimming_pools', 'gyms', 'bakeries',
       'beauty_spas', 'cafes', 'view_points', 'monuments', 'gardens',
       'Unnamed: 25'],
      dtype='object')

### We neeed now to write it to a new data file

In [128]:
original_df.to_csv('../assets/dataset/Google_reviews_ratings.csv')

##### Calling the data set from data-set folder

In [129]:
data_file = '../assets/dataset/Google_reviews_ratings.csv'

In [130]:
file = pd.read_csv(data_file)
file.head()

Unnamed: 0.1,Unnamed: 0,user_id,churches,resorts,beaches,parks,theatres,museums,malls,zoo,...,dance_clubs,swimming_pools,gyms,bakeries,beauty_spas,cafes,view_points,monuments,gardens,Unnamed: 25
0,0,User 1,0.0,0.0,3.63,3.65,5.0,2.92,5.0,2.35,...,0.59,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0,
1,1,User 2,0.0,0.0,3.63,3.65,5.0,2.92,5.0,2.64,...,0.59,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0,
2,2,User 3,0.0,0.0,3.63,3.63,5.0,2.92,5.0,2.64,...,0.59,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0,
3,3,User 4,0.0,0.5,3.63,3.63,5.0,2.92,5.0,2.35,...,0.59,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0,
4,4,User 5,0.0,0.0,3.63,3.63,5.0,2.92,5.0,2.64,...,0.59,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0,


In [131]:
# dataframe
df = pd.DataFrame(file)
df

Unnamed: 0.1,Unnamed: 0,user_id,churches,resorts,beaches,parks,theatres,museums,malls,zoo,...,dance_clubs,swimming_pools,gyms,bakeries,beauty_spas,cafes,view_points,monuments,gardens,Unnamed: 25
0,0,User 1,0.00,0.00,3.63,3.65,5.00,2.92,5.00,2.35,...,0.59,0.50,0.00,0.50,0.00,0.00,0.0,0.0,0.00,
1,1,User 2,0.00,0.00,3.63,3.65,5.00,2.92,5.00,2.64,...,0.59,0.50,0.00,0.50,0.00,0.00,0.0,0.0,0.00,
2,2,User 3,0.00,0.00,3.63,3.63,5.00,2.92,5.00,2.64,...,0.59,0.50,0.00,0.50,0.00,0.00,0.0,0.0,0.00,
3,3,User 4,0.00,0.50,3.63,3.63,5.00,2.92,5.00,2.35,...,0.59,0.50,0.00,0.50,0.00,0.00,0.0,0.0,0.00,
4,4,User 5,0.00,0.00,3.63,3.63,5.00,2.92,5.00,2.64,...,0.59,0.50,0.00,0.50,0.00,0.00,0.0,0.0,0.00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5451,5451,User 5452,0.91,5.00,4.00,2.79,2.77,2.57,2.43,1.09,...,0.66,0.65,0.66,0.69,5.00,1.05,5.0,5.0,1.56,
5452,5452,User 5453,0.93,5.00,4.02,2.79,2.78,2.57,1.77,1.07,...,0.65,0.64,0.65,1.59,1.62,1.06,5.0,5.0,1.09,
5453,5453,User 5454,0.94,5.00,4.03,2.80,2.78,2.57,1.75,1.05,...,0.65,0.63,0.64,0.74,5.00,1.07,5.0,5.0,1.11,
5454,5454,User 5455,0.95,4.05,4.05,2.81,2.79,2.44,1.76,1.03,...,0.64,0.63,0.64,0.75,5.00,1.08,5.0,5.0,1.12,


### Droping the first columns and cleaning any null values

In [132]:
df.columns

Index(['Unnamed: 0', 'user_id', 'churches', 'resorts', 'beaches', 'parks',
       'theatres', 'museums', 'malls', 'zoo', 'restaurants', 'pubs_bars',
       'local_services', 'burger_pizza_shops', 'hotels_other_lodgings',
       'juice_bars', 'art_galleries', 'dance_clubs', 'swimming_pools', 'gyms',
       'bakeries', 'beauty_spas', 'cafes', 'view_points', 'monuments',
       'gardens', 'Unnamed: 25'],
      dtype='object')

In [133]:
# with jupyter notebook caching you can run only this
df = df.drop(['Unnamed: 0'], axis=1)
df = df.drop(['Unnamed: 25'], axis=1)

In [134]:
df.head()

Unnamed: 0,user_id,churches,resorts,beaches,parks,theatres,museums,malls,zoo,restaurants,...,art_galleries,dance_clubs,swimming_pools,gyms,bakeries,beauty_spas,cafes,view_points,monuments,gardens
0,User 1,0.0,0.0,3.63,3.65,5.0,2.92,5.0,2.35,2.33,...,1.74,0.59,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0
1,User 2,0.0,0.0,3.63,3.65,5.0,2.92,5.0,2.64,2.33,...,1.74,0.59,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0
2,User 3,0.0,0.0,3.63,3.63,5.0,2.92,5.0,2.64,2.33,...,1.74,0.59,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0
3,User 4,0.0,0.5,3.63,3.63,5.0,2.92,5.0,2.35,2.33,...,1.74,0.59,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0
4,User 5,0.0,0.0,3.63,3.63,5.0,2.92,5.0,2.64,2.33,...,1.74,0.59,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0


In [135]:
df.dropna()

Unnamed: 0,user_id,churches,resorts,beaches,parks,theatres,museums,malls,zoo,restaurants,...,art_galleries,dance_clubs,swimming_pools,gyms,bakeries,beauty_spas,cafes,view_points,monuments,gardens
0,User 1,0.00,0.00,3.63,3.65,5.00,2.92,5.00,2.35,2.33,...,1.74,0.59,0.50,0.00,0.50,0.00,0.00,0.0,0.0,0.00
1,User 2,0.00,0.00,3.63,3.65,5.00,2.92,5.00,2.64,2.33,...,1.74,0.59,0.50,0.00,0.50,0.00,0.00,0.0,0.0,0.00
2,User 3,0.00,0.00,3.63,3.63,5.00,2.92,5.00,2.64,2.33,...,1.74,0.59,0.50,0.00,0.50,0.00,0.00,0.0,0.0,0.00
3,User 4,0.00,0.50,3.63,3.63,5.00,2.92,5.00,2.35,2.33,...,1.74,0.59,0.50,0.00,0.50,0.00,0.00,0.0,0.0,0.00
4,User 5,0.00,0.00,3.63,3.63,5.00,2.92,5.00,2.64,2.33,...,1.74,0.59,0.50,0.00,0.50,0.00,0.00,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5451,User 5452,0.91,5.00,4.00,2.79,2.77,2.57,2.43,1.09,1.77,...,5.00,0.66,0.65,0.66,0.69,5.00,1.05,5.0,5.0,1.56
5452,User 5453,0.93,5.00,4.02,2.79,2.78,2.57,1.77,1.07,1.76,...,0.89,0.65,0.64,0.65,1.59,1.62,1.06,5.0,5.0,1.09
5453,User 5454,0.94,5.00,4.03,2.80,2.78,2.57,1.75,1.05,1.75,...,0.87,0.65,0.63,0.64,0.74,5.00,1.07,5.0,5.0,1.11
5454,User 5455,0.95,4.05,4.05,2.81,2.79,2.44,1.76,1.03,1.74,...,5.00,0.64,0.63,0.64,0.75,5.00,1.08,5.0,5.0,1.12


#### After cleaning with no-null values its best pracice to visulize the data

In [136]:
df.corr()

  df.corr()


Unnamed: 0,churches,resorts,beaches,parks,theatres,museums,malls,zoo,restaurants,pubs_bars,...,art_galleries,dance_clubs,swimming_pools,gyms,bakeries,beauty_spas,cafes,view_points,monuments,gardens
churches,1.0,0.248991,0.148805,0.070692,0.035451,-0.093143,-0.264632,-0.181267,-0.290214,-0.274683,...,-0.134914,0.067285,0.130099,0.16036,0.180271,0.199397,0.309238,0.364375,0.407033,0.48926
resorts,0.248991,1.0,0.325011,0.167338,0.15352,0.053916,-0.050158,-0.0045,-0.05055,-0.092526,...,-0.06667,-0.033119,-0.077146,-0.025446,0.032529,0.096974,0.08899,0.018366,0.077616,0.125172
beaches,0.148805,0.325011,1.0,0.396786,0.329925,0.161242,-0.072713,-0.186308,-0.219842,-0.17909,...,-0.134203,-0.022249,-0.084643,-0.11981,-0.075402,-0.009202,0.001105,0.134571,0.117458,0.082669
parks,0.070692,0.167338,0.396786,1.0,0.626868,0.315354,-0.067914,-0.128064,-0.169442,-0.115515,...,-0.27213,0.018274,-0.132468,-0.183849,-0.192665,-0.092453,-0.056226,0.276727,0.17355,0.090533
theatres,0.035451,0.15352,0.329925,0.626868,1.0,0.489937,0.077807,-0.002595,-0.17043,-0.100191,...,-0.323902,-0.056347,-0.184345,-0.243357,-0.256987,-0.191997,-0.133433,0.124431,0.128055,0.098853
museums,-0.093143,0.053916,0.161242,0.315354,0.489937,1.0,0.382774,0.200528,0.114719,-0.019007,...,-0.187512,-0.149295,-0.229604,-0.271354,-0.266608,-0.227362,-0.197454,-0.090481,-0.081109,-0.066423
malls,-0.264632,-0.050158,-0.072713,-0.067914,0.077807,0.382774,1.0,0.406959,0.432054,0.256563,...,0.092527,-0.142861,-0.207275,-0.226475,-0.272979,-0.232408,-0.257728,-0.360287,-0.22308,-0.247569
zoo,-0.181267,-0.0045,-0.186308,-0.128064,-0.002595,0.200528,0.406959,1.0,0.53687,0.551409,...,-0.064692,-0.124417,-0.20295,-0.236106,-0.281018,-0.254393,-0.274733,-0.25552,-0.167927,-0.137292
restaurants,-0.290214,-0.05055,-0.219842,-0.169442,-0.17043,0.114719,0.432054,0.53687,1.0,0.564086,...,0.12672,-0.119844,-0.229585,-0.2667,-0.271881,-0.164033,-0.185958,-0.268053,-0.268314,-0.326574
pubs_bars,-0.274683,-0.092526,-0.17909,-0.115515,-0.100191,-0.019007,0.256563,0.551409,0.564086,1.0,...,0.038084,-0.031927,-0.212346,-0.273837,-0.324115,-0.247502,-0.230389,-0.18366,-0.21303,-0.263505


In [137]:
overlap_users = {}

In [138]:
len(df)

5456

In [139]:
cols = df.columns.to_list()

In [140]:
user_intrests = [
    'dance_clubs',
    'gyms',
    'gardens',
    'monuments',
    'cafes'
]

In [141]:
cols = [val for val in df.columns if val != "user_id"]
cols

['churches',
 'resorts',
 'beaches',
 'parks',
 'theatres',
 'museums',
 'malls',
 'zoo',
 'restaurants',
 'pubs_bars',
 'local_services',
 'burger_pizza_shops',
 'hotels_other_lodgings',
 'juice_bars',
 'art_galleries',
 'dance_clubs',
 'swimming_pools',
 'gyms',
 'bakeries',
 'beauty_spas',
 'cafes',
 'view_points',
 'monuments',
 'gardens']

In [142]:
df["user_id"] = pd.Categorical(df['user_id']).codes

In [143]:
df.set_index(df['user_id'])
df.drop('user_id', axis='columns', inplace=True)

In [144]:
df[cols[2]].info()

<class 'pandas.core.series.Series'>
RangeIndex: 5456 entries, 0 to 5455
Series name: beaches
Non-Null Count  Dtype  
--------------  -----  
5456 non-null   float64
dtypes: float64(1)
memory usage: 42.8 KB


In [145]:
val = df.gardens > 2
val

0       False
1       False
2       False
3       False
4       False
        ...  
5451    False
5452    False
5453    False
5454    False
5455    False
Name: gardens, Length: 5456, dtype: bool

In [146]:
df["churches"].max().count().astype(int)

AttributeError: 'numpy.float64' object has no attribute 'count'

In [147]:
overlap_users = {}

In [179]:
def get_user_id(column):
    return df.index[df[column]>2].to_list()

In [180]:
print(f"-- For example : {len(set(df.index[df['gyms']>2].to_list()))} people go to the Gym -- ")

-- For example : 329 people go to the Gym -- 


In [189]:
for col in cols:
    
    if col in user_intrests:
        # uid = get_user_id(col)
        if col not in overlap_users:
            overlap_users[col] = get_user_id(col)
        else:
            continue

In [None]:
overlap_users

In [103]:
for intr in user_intrests:
    print(df[intr].astype(int))

5456
5456
5455
5456
5456
