# Banners task:

### Task

#### A company wants to place its 20 advertising banners effectively, so that as many people as possible can see them.

In [2]:
# Shell escape: list the notebook's working directory to see which files are available.
!ls

answer.txt  banners_profit.ipynb  data	task


In [3]:
# Shell escape: list the contents of the data/ directory (the raw check-in files live here).
!ls data

checkins.csv  checkins.dat  fsq


So we have a .dat file containing check-in data from www.foursquare.com, which shows where people tend to go.
We need to convert the .dat file into .csv so that the data is convenient to work with:

In [4]:
import csv

In [9]:
# Convert the pipe-delimited dump into CSV, keeping only usable rows.
with open('data/checkins.dat', 'r') as dat_file:
    # A row is kept when it splits into exactly six '|'-separated fields and
    # the fields at index 3 and 4 (the coordinate columns) are non-empty.
    # Any other line (wrong field count or missing coordinates) is dropped.
    newLines = [
        fields
        for fields in ([cell.strip() for cell in line.split('|')] for line in dat_file)
        if len(fields) == 6 and fields[3] and fields[4]
    ]

# newline='' lets the csv module control line endings itself; without it an
# extra blank line is written after every row on platforms that translate
# '\n' to '\r\n' (e.g. Windows).
with open('data/checkins.csv', 'w', newline='') as csv_file:
    csv.writer(csv_file).writerows(newLines)

Now let's see what useful information we can extract from the data:

In [5]:
import pandas as pd

In [6]:
# Load the converted check-ins; read_csv takes the column names from the first row of the file.
df = pd.read_csv('data/checkins.csv')

In [7]:
# Summary statistics of the numeric columns (sanity check of the coordinate ranges).
df.describe()

Unnamed: 0,id,user_id,venue_id,latitude,longitude
count,396634.0,396634.0,396634.0,396634.0,396634.0
mean,510471.1,564134.9,132755.9,37.09707,-86.354065
std,305492.7,486569.4,228700.3,8.77876,37.983677
min,16.0,1.0,1.0,-75.250973,-159.670833
25%,242279.2,169290.0,7620.0,33.800745,-111.926052
50%,501987.5,429820.0,28304.0,39.099275,-85.758456
75%,775817.5,849647.0,148552.0,40.802071,-74.05653
max,1021981.0,2153361.0,1143011.0,78.21859,178.42424


In [8]:
# Peek at the first rows to see the column layout.
df.head()

Unnamed: 0,id,user_id,venue_id,latitude,longitude,created_at
0,984222,15824,5222,38.895112,-77.036366,2012-04-21 17:43:47
1,984234,44652,5222,33.800745,-84.41052,2012-04-21 17:43:43
2,984291,105054,5222,45.523452,-122.676207,2012-04-21 17:39:22
3,984318,2146539,5222,40.764462,-111.904565,2012-04-21 17:35:46
4,984232,93870,380645,33.448377,-112.074037,2012-04-21 17:38:18


In [9]:
import numpy as np
from sklearn.cluster import MeanShift

We will use the MeanShift clustering algorithm to find the centers of crowds:

In [10]:
# The bandwidth is expressed in the units of the fitted features.
# NOTE(review): the features are latitude/longitude, so 0.1 is presumably 0.1 degrees — confirm this is the intended radius.
ms = MeanShift(bandwidth=0.1)

In [11]:
X = df.values
print(X.shape)

# We need only the coordinates. Select them by column name rather than by
# position, and as a float array: df.values is an object array because the
# frame also holds the textual created_at column, so slicing X would give
# object-dtype coordinates.
coordinates = df[['latitude', 'longitude']].values.astype(float)
print(coordinates.shape)
print(coordinates[:5])

(396634, 6)
(396634, 2)
[[38.895111799999995 -77.0363658]
 [33.800745 -84.41051999999999]
 [45.5234515 -122.6762071]
 [40.764462 -111.90456499999999]
 [33.4483771 -112.07403729999999]]


In [12]:
# Fit on the first 100001 rows only.
# NOTE(review): presumably a subset is used to keep MeanShift's runtime manageable — confirm.
ms.fit(coordinates[:100001])

MeanShift(bandwidth=0.1, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

In [13]:
# One row per cluster found by MeanShift; the two columns are in the same
# order as the fitted features (latitude, longitude).
centers = ms.cluster_centers_
print(centers.shape)
print(centers[0])

(3230, 2)
[ 40.7177164  -73.99183542]


We want to know how many check-ins (people) fall into each cluster, because we need the most "crowded" places:

In [14]:
# labels[i] is the cluster index assigned to the i-th fitted point.
labels = ms.labels_
print(labels.shape)
# unique_labels is sorted ascending; counts[k] is the number of points carrying unique_labels[k].
unique_labels, counts = np.unique(labels, return_counts=True)
print(counts[:5])

(100001,)
[12506  4692  3994  3363  3527]


In [15]:
# Two-column table: column 0 is the cluster label, column 1 is its size.
centers_counts = np.column_stack((unique_labels, counts))
print(centers_counts)

[[    0 12506]
 [    1  4692]
 [    2  3994]
 ..., 
 [ 3227     1]
 [ 3228     1]
 [ 3229     1]]


We don't need clusters that contain 15 or fewer people:

In [16]:
# Keep only clusters with more than MIN_CLUSTER_SIZE members. A boolean mask
# on the size column (column 1) avoids a Python-level loop over the rows and
# keeps the (n, 2) shape even when no cluster passes the threshold.
MIN_CLUSTER_SIZE = 15
interesting_centers = centers_counts[centers_counts[:, 1] > MIN_CLUSTER_SIZE]
print(interesting_centers.shape)
print(interesting_centers[:10])

(591, 2)
[[    0 12506]
 [    1  4692]
 [    2  3994]
 [    3  3363]
 [    4  3527]
 [    5  2409]
 [    6  2297]
 [    7  1601]
 [    8  1526]
 [    9  1378]]


#### We know the coordinates of the company's offices:

In [17]:
# Office locations, one (latitude, longitude) row per office.
# A plain ndarray is used instead of np.matrix: iterating over a matrix yields
# 2-D (1, 2) rows, whereas iterating over an ndarray yields 1-D points, which
# is what the per-point distance functions expect.
offices = np.array([[33.751277, -118.188740],
                    [25.867736, -80.324116],
                    [51.503016, -0.075479],
                    [52.378894, 4.885084],
                    [39.366487, 117.036146],
                    [-33.868457, 151.205134]])
print(offices.shape)

(6, 2)


In [18]:
# Coordinates of the sufficiently large clusters. Column 0 of
# interesting_centers holds cluster labels, which are row indices into
# `centers`, so the rows can be picked directly instead of testing every
# index for membership (which rescans the label array for each center).
profit_centers = centers[interesting_centers[:, 0]]

In [19]:
print(profit_centers.shape)
print(profit_centers[:5])

(591, 2)
[[  40.7177164   -73.99183542]
 [  33.44943805 -112.00213969]
 [  33.44638027 -111.90188756]
 [  41.87824378  -87.62984336]
 [  37.68868157 -122.40933037]]


Now we can find the 20 crowd centers closest to the offices and place our banners there:

In [20]:
from scipy import spatial

# Coerce the offices to a plain 2-D float array so that every row is a 1-D
# point, whatever array type `offices` is. Rows of an np.matrix are 2-D, and
# euclidean() rejects non-1-D input in current SciPy releases.
office_points = np.asarray(offices, dtype=float).reshape(-1, 2)

# One (distance, center_index) pair per (center, office) combination, so that
# sorting the list ranks centers by their distance to an office.
# NOTE(review): this is a plain Euclidean distance on the raw coordinate
# values, not a geodesic distance — confirm that is acceptable for the task.
all_distances = [
    (spatial.distance.euclidean(center, office_point), num)
    for num, center in enumerate(profit_centers)
    for office_point in office_points
]

In [22]:
# Tuples sort by their first element, so this orders the pairs by ascending distance
# (ties fall back to the center index).
ordered = sorted(all_distances)

In [23]:
# The 20 smallest (distance, center_index) pairs — the candidate banner locations.
print(ordered[:20])

[(0.007834758163107854, 408), (0.009353316185992226, 373), (0.02267406615838222, 417), (0.05005829482278787, 58), (0.07084773242717578, 51), (0.13410903336184657, 29), (0.16740596425035667, 167), (0.1888759606018508, 92), (0.1957794564776363, 87), (0.21181053682436796, 42), (0.22223329073179776, 291), (0.27130075950667704, 316), (0.29497888680045686, 119), (0.3022701186924605, 55), (0.3047305030784069, 27), (0.3148837903362732, 11), (0.33881047025113176, 32), (0.3408456533220572, 158), (0.37868750125029754, 17), (0.3867062248427277, 47)]


In [24]:
# ordered[0] is the (distance, center_index) pair with the smallest distance;
# take the index from it instead of hard-coding a number copied from a
# previous run's output, which would go stale if the clustering changes.
best_center = profit_centers[ordered[0][1]]
with open('answer.txt', 'w') as fout:
    fout.write(str(best_center[0]) + ' ' + str(best_center[1]))

<b>Answer</b>: Coordinates of the closest sufficiently "crowded" place to put the banner:

In [28]:
# Look the winning center up through the sorted distances rather than a
# hard-coded index, so the printed answer always matches the current ranking.
print(profit_centers[ordered[0][1]])

[ -33.86063043  151.20477593]
