In [1]:
import pandas as pd

In [166]:
# Since some of the data in the latitude and longitude column is corrupted (e.g. showing '\\N' instead of a string in the form of a latitude)
# This function is created to be applied to each indivdual cells in the latitude and longitude column to clean up.
# It's aim is to convert the latitude / longitude in string format into float format by type casting, thereby facilitating the scaler that will be applied
# later on.  If the cell is corrupted, it will return np.nan so that the dropna() method can be applied to remove those rows later on.

def clean_up(s):
    #take a string s, if the form of the strin gis is in %d\.%d (that is, for example, "51.97039", return float(s), otherwise return np.nan
    if(re.search(r"\d+.\d+", s) != None):
        return float(s)
    else:
        return np.nan

In [168]:
# reading the csv that contains ALL UK pub info
df_uk = pd.read_csv("pubs.csv")

In [170]:
df_uk.head()

Unnamed: 0,fas_id,name,address,postcode,easting,northing,latitude,longitude,local_authority
0,24,Anchor Inn,"Upper Street, Stratford St Mary, COLCHESTER, E...",CO7 6LW,604748,234405.0,51.97039,0.979328,Babergh
1,30,Angel Inn,"Egremont Street, Glemsford, SUDBURY, Suffolk",CO10 7SA,582888,247368.0,52.094427,0.668408,Babergh
2,63,Black Boy Hotel,"7 Market Hill, SUDBURY, Suffolk",CO10 2EA,587356,241327.0,52.038683,0.730226,Babergh
3,64,Black Horse,"Lower Street, Stratford St Mary, COLCHESTER, E...",CO7 6JS,604270,233920.0,51.966211,0.972091,Babergh
4,65,Black Lion,"Lion Road, Glemsford, SUDBURY, Suffolk",CO10 7RF,582750,248298.0,52.102815,0.666893,Babergh


In [172]:
df_uk.isna().sum()

fas_id             0
name               0
address            0
postcode           0
easting            0
northing           0
latitude           0
longitude          0
local_authority    2
dtype: int64

In [173]:
# We only want to look at the londen area, so we will extract only those row with "local_authority" of the following
# list obtained from wikipedia about the greater London area
greater_london_borough = ['City of London','City of Westminster','Kensington and Chelsea',
                            'Hammersmith and Fulham','Wandsworth','Lambeth','Southwark','Tower Hamlets',
                            'Hackney','Islington','Camden','Brent','Ealing','Hounslow','Richmond upon Thames',
                            'Kingston upon Thames','Merton','Sutton','Croydon','Bromley','Lewisham','Greenwich',
                            'Bexley','Havering','Barking and Dagenham','Redbridge','Newham','Waltham Forest',
                            'Haringey','Enfield','Barnet','Harrow','Hillingdon']

In [175]:
df_greater_london = df_uk[df_uk['local_authority'].isin(greater_london_borough)]

In [176]:
df_greater_london

Unnamed: 0,fas_id,name,address,postcode,easting,northing,latitude,longitude,local_authority
1866,17283,The Three Jolly Wheelers,"735 Chigwell Road, Woodford Green, Chigwell, E...",IG8 8AS,542653,192011.0,51.608869,0.058699,Redbridge
8958,83420,Aria Bar,"31 Longbridge Road, Barking",IG11 8TN,544481,184425.0,51.54024,0.081962,Barking and Dagenham
8959,83454,Barking Arms Ltd,"25-27 Station Parade, Barking",IG11 8TU,544404,184344.0,51.539531,0.080819,Barking and Dagenham
8960,83461,Barking football Club,"Barking Football Club, Lodge Avenue, Dagenham",RM8 2JR,546614,185058.0,51.545377,0.11296,Barking and Dagenham
8961,83464,Barking Indoor Bowls Club Ltd,"Indoor Bowls Hall, Barking Prk, Longbridge Roa...",IG11 8TA,544692,184653.0,51.542225,0.085096,Barking and Dagenham
...,...,...,...,...,...,...,...,...,...
13093,143305,Wandle Pub,"332 Garratt Lane, London",SW18 4EJ,525885,173177.0,51.443613,-0.190037,Wandsworth
13143,144229,Chippenham Public House,"207 Shirland Road, London",W9 2EX,525181,182528.0,51.527798,-0.196849,Brent
13248,145728,Metropolitan Public House,"60 Great Western Road, London",W11 1AB,524919,181765.0,51.520999,-0.200895,Kensington and Chelsea
13416,147338,The Sussex Public House,"20 Upper St Martin's Lane, London",WC2H 9DL,530078,180986.0,51.512826,-0.126866,Camden


In [177]:
# the latitude column is in the form of a string
df_greater_london[['latitude', 'longitude']].dtypes

latitude     object
longitude    object
dtype: object

In [180]:
# also some are corrupted values.
df_greater_london.iloc[3494:3498, 6]

12452    51.468773
12453    51.416417
12454           \N
12455    51.464044
Name: latitude, dtype: object

In [181]:
# applying the clean up function to the latitude column
df_greater_london.loc[:,'latitude'] = df_greater_london['latitude'].apply(clean_up)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [182]:
# applying the clean up function to the longitude column
df_greater_london.loc[:,'longitude'] = df_greater_london['longitude'].apply(clean_up)

In [183]:
df_greater_london[['latitude', 'longitude']].dtypes

latitude     float64
longitude    float64
dtype: object

In [186]:
df_greater_london.isna().sum()

fas_id             0
name               0
address            0
postcode           0
easting            0
northing           0
latitude           6
longitude          6
local_authority    0
dtype: int64

In [187]:
df_greater_london.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_greater_london.dropna(inplace = True)


In [188]:
df_greater_london.isna().sum()

fas_id             0
name               0
address            0
postcode           0
easting            0
northing           0
latitude           0
longitude          0
local_authority    0
dtype: int64

In [189]:
df_greater_london.to_csv("greater_london_pub.csv")

In [190]:
pd.options.display.max_rows = 200

In [112]:
# df3 = pd.merge(g_london_pub, london1, left_on ="name", right_on = "Name")

In [140]:
# df3.to_csv("df_with_review.csv")

In [191]:
df_greater_london.shape

(4134, 9)

In [192]:
df_for_calculation = df_greater_london[['fas_id', 'latitude','longitude']]

In [193]:
df_for_calculation.set_index("fas_id", inplace = True)

In [194]:
df_for_calculation

Unnamed: 0_level_0,latitude,longitude
fas_id,Unnamed: 1_level_1,Unnamed: 2_level_1
17283,51.608869,0.058699
83420,51.540240,0.081962
83454,51.539531,0.080819
83461,51.545377,0.112960
83464,51.542225,0.085096
...,...,...
143305,51.443613,-0.190037
144229,51.527798,-0.196849
145728,51.520999,-0.200895
147338,51.512826,-0.126866


In [195]:
from scipy.spatial.distance import euclidean
from sklearn.preprocessing import StandardScaler
from keras.preprocessing import image
import matplotlib.pyplot as plt

In [196]:
import numpy as np

In [198]:
# commencing the scaling operation on the lat and long in preparation for distance calculation
sds = StandardScaler()
df_scaled = pd.DataFrame(sds.fit_transform(g_london_pub_num_only), index = df_for_calculation.index, 
                         columns = df_for_calculation.columns)
df_scaled

Unnamed: 0_level_0,latitude,longitude
fas_id,Unnamed: 1_level_1,Unnamed: 2_level_1
17283,1.471299,1.328157
83420,0.498252,1.490129
83454,0.488200,1.482170
83461,0.571086,1.705956
83464,0.526396,1.511949
...,...,...
143305,-0.871762,-0.403696
144229,0.321845,-0.451125
145728,0.225446,-0.479296
147338,0.109566,0.036139


In [204]:
########### THE FOLLOWING CODE IS IN PROCESSING ###################





def pub_recommender(distance_method, fas_id, N, df_scaled):
    # create dataframe used to store distances between pubs
    df_distance = pd.DataFrame(data=df_scaled.index)
    
    # remove rows where index is equal to the inputted fas_id
    df_distance = df_distance[df_scaled.index != fas_id]
    
    # add a distance column that states the inputted pub's distance with every other pub
    df_distance['distance'] = df_distance["fas_id"].apply(lambda x: distance_method(df_scaled.loc[x],df_scaled.loc[fas_id]))
    
    # sort the pubs by distance and take N closes number of rows to put in the TopNRecommendation as the recommendations
    df_distance.sort_values(by='distance',inplace=True)
    
  
    # return dataframe with the inputted pub
    return(df_distance.head(N))

In [212]:
# makin a recommendation with a specific fas_id 83420, and requesting 10 recommendation
df_recommendation = pub_recommender(euclidean, 83420, 10, df_scaled)

In [210]:
for i in answer['fas_id']:
    print (i)

84283
84250
83454
83870
83464
83646
84273
84288
84281
83477
84247
129730
129734
129735
130276
84274
83461
129761
131556
130217
84282
130640
131560
131060
129896
130639
130703
131486
128564
130647
131499
83470
131528
128277
128709
129834
130642
130096
131055
129762
84145
131244
130645
83956
83623
128605
129721
131398
84256
131493
129728
83987
128660
129985
129851
130061
129681
131534
83909
83954
84120
130478
141046
84261
84248
131529
141096
129682
129707
130566
103936
105228
84121
131488
83481
131574
141060
131331
83632
131646
128308
130416
130436
128220
84265
83624
128568
131558
129741
129464
131491
131521
130470
129710
128089
130765
130823
131663
131665
129917


In [200]:
df_distance = pd.DataFrame(data=df_scaled.index)

In [201]:
df_distance

Unnamed: 0,fas_id
0,17283
1,83420
2,83454
3,83461
4,83464
...,...
4129,143305
4130,144229
4131,145728
4132,147338
