### Clustering London Boroughs by Ethnic Makeup

In [210]:
# IMPORT DATA
import pandas as pd
from sklearn.cluster import KMeans, MeanShift, DBSCAN
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
pd.options.display.max_colwidth = 1000

In [211]:
# LOAD DATA
boroughs = pd.read_excel('../data/raw/ethnic-groups-by-borough.xls', sheet_name=['2012', '2013', '2014', '2015',
                                                                                '2016', '2017'])

In [212]:
# FIX COLUMNS
borough_columns = ['area', 'white', 'asian', 'black', 'mixed_other', 'total']

In [213]:
# CREATE FULL DATAFRAME OF ALL BOROUGHS BY YEAR
london_boroughs = pd.DataFrame(columns=borough_columns)
for x in boroughs:
    temp_df = boroughs[x].iloc[3:35, 1:7]
    temp_df.columns = borough_columns
    temp_df['year'] = int(x)
    london_boroughs = pd.concat([temp_df, london_boroughs], axis=0).reset_index(drop=True)

In [214]:
# WHICH YEARS ARE WE USING?
london_boroughs['year'].value_counts()

2015.0    32
2014.0    32
2013.0    32
2012.0    32
2016.0    32
2017.0    32
Name: year, dtype: int64

In [215]:
# VISUALIZE DATAFRAME AFTER CREATION - FIX DATA ISSUES WITH DASHES ETC.
london_bor_unlabeled = london_boroughs.apply(lambda x: (x.replace('-', 0)))

In [216]:
# SET UP A TIME SERIES SPLITS
cluster_holdout = london_bor_unlabeled[london_bor_unlabeled.year >= 2016]
cluster_training = london_bor_unlabeled[london_bor_unlabeled.year < 2016]

In [217]:
# SET UP GROUPBY DICTIONARY
groupby_dict = {'area': 'unique',
'white': 'mean',
'black': 'mean',
'mixed_other': 'mean',
'asian': 'mean'}

In [220]:
# TRY KMEANS CLUSTERING WITH DEFAULT K OF 8 CLUSTERS
kmeans = KMeans()
cluster_training['cluster'] = kmeans.fit_predict(cluster_training.drop(['area','year'], axis=1))
cluster_training.groupby('cluster').agg(groupby_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,area,white,black,mixed_other,asian
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"[Bromley, Enfield, Lambeth, Wandsworth]",224250.0,41562.5,23062.5,24500.0
1,"[Greenwich, Hackney, Haringey, Hillingdon, Lewisham, Southwark, Waltham Forest]",157518.518519,50851.851852,27370.37037,36888.888889
2,"[Harrow, Hounslow, Redbridge, Tower Hamlets, Hillingdon]",119588.235294,17235.294118,22235.294118,108058.823529
3,"[Camden, Islington, Merton, Richmond upon Thames, Sutton, Westminster]",144208.333333,15541.666667,24375.0,24541.666667
4,"[Barnet, Croydon, Ealing]",213000.0,41636.363636,37909.090909,68181.818182
5,"[Bexley, Havering]",201125.0,18375.0,7625.0,10750.0
6,"[Barking and Dagenham, Hammersmith and Fulham, Kensington and Chelsea, Kingston upon Thames]",112937.5,20687.5,17812.5,20500.0
7,"[Brent, Newham, Ealing]",103777.777778,50777.777778,45666.666667,121777.777778


In [221]:
# PREDICTING ON OUR HOLD-OUT SET
cluster_holdout['cluster'] = kmeans.predict(cluster_holdout.drop(['area', 'year'], axis=1))
cluster_holdout.groupby('cluster').agg(groupby_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0_level_0,area,white,black,mixed_other,asian
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"[Bromley, Enfield, Lambeth, Lewisham, Southwark, Wandsworth]",216818.181818,48727.272727,28000.0,26272.727273
1,"[Camden, Greenwich, Hackney, Haringey, Waltham Forest, Hillingdon, Southwark]",164818.181818,44363.636364,34181.818182,35363.636364
2,"[Harrow, Hillingdon, Hounslow, Redbridge, Tower Hamlets]",127142.857143,19142.857143,23000.0,110571.428571
3,"[Islington, Merton, Richmond upon Thames, Sutton, Westminster, Camden]",145454.545455,17727.272727,28727.272727,26000.0
4,"[Barnet, Croydon]",231000.0,42750.0,40250.0,70000.0
5,"[Bexley, Havering]",205750.0,19250.0,8500.0,14750.0
6,"[Barking and Dagenham, Hammersmith and Fulham, Kensington and Chelsea, Kingston upon Thames]",113000.0,22750.0,18000.0,25250.0
7,"[Brent, Ealing, Newham, Redbridge, Tower Hamlets]",127250.0,39375.0,36375.0,126625.0


In [222]:
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr

ImportError: dlopen(/anaconda3/lib/python3.6/site-packages/rpy2/rinterface/_rinterface.cpython-36m-darwin.so, 2): Library not loaded: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libR.dylib
  Referenced from: /anaconda3/lib/python3.6/site-packages/rpy2/rinterface/_rinterface.cpython-36m-darwin.so
  Reason: Incompatible library version: _rinterface.cpython-36m-darwin.so requires version 3.5.0 or later, but libR.dylib provides version 3.4.0