In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found under the Kaggle input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locations of the zipped competition CSVs.
train_filepath = '/kaggle/input/facebook-v-predicting-check-ins/train.csv.zip'
test_filepath = '/kaggle/input/facebook-v-predicting-check-ins/test.csv.zip'

# Read both files with `row_id` as the index (set directly at parse time).
df = pd.read_csv(train_filepath, index_col='row_id')
holdout_df = pd.read_csv(test_filepath, index_col='row_id')

# Preview the first rows of the holdout frame.
holdout_df.head()

In [None]:
# Draw a fixed-size random subset of the training rows. `random_state` pins the
# draw so that a fresh "Restart & Run All" reproduces the same sample.
# NOTE(review): the following cells build X / y from the full `df`, not from
# this sample -- confirm which of the two was intended.
df_train_sample = df.sample(n=100000, random_state=42)

In [None]:
# Names of the input columns and of the label column.
X_cols = ['x', 'y', 'accuracy', 'time']
y_cols = ['place_id']

# Feature frame and single-column label frame, both taken from the training data.
X, y = df[X_cols], df[y_cols]

In [None]:
from sklearn.cluster import KMeans
# Cluster the raw (unscaled) features into six groups.
# `random_state` fixes the centroid initialisation so the labels are repeatable
# across kernel restarts.
model = KMeans(n_clusters=6, random_state=42)

# fit_predict fits the model and returns the label of every training row in one
# call, instead of a separate predict pass over the same data.
clusters = model.fit_predict(X)

# Attach the labels to a copy of the features for inspection.
clustering_df = X.assign(cluster=clusters)
clustering_df.head()

In [None]:
# Same clustering idea, but the features are standardised before k-means runs.
# `random_state` makes the cluster assignment reproducible.
first_pl = Pipeline(steps=[
    ('std_scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=5, random_state=42))
])

# Pipeline.fit_predict fits every step and returns the final estimator's labels
# for the training rows, leaving `first_pl` fitted for later use.
new_clusters = first_pl.fit_predict(X)

In [None]:
# Attach the labels from the scaled pipeline to a copy of the features.
clustering_df = X.assign(cluster=new_clusters)

# One accuracy-vs-time scatter per cluster. The clusters are taken from the
# labels themselves (groupby yields them in ascending order), so this cell
# keeps working if the number of clusters in the pipeline is changed.
for cluster, current_cluster in clustering_df.groupby('cluster'):
    print('CLUSTER: ', cluster)
    fig, ax = plt.subplots()
    ax.scatter(current_cluster['time'], current_cluster['accuracy'])
    ax.set_xlabel('time')
    ax.set_ylabel('accuracy')
    ax.set_title(f'Cluster {cluster}')
    plt.show()
    # Release the figure so a long loop does not accumulate open figures.
    plt.close(fig)

In [None]:
# Add a single numeric "grid location" column built from x and the rounded y:
#     grid_loc = x * 1000 + 9 * 1000 * round(y)
feature_df = X.assign(grid_loc=X['x'] * 1000 + 9 * 1000 * X['y'].round())

# Show the rows ordered by the new column. The sorted frame is only displayed;
# `feature_df` itself keeps its original row order.
feature_df.sort_values(by='grid_loc', ascending=True)

In [None]:
# Scatter the place id against the engineered grid location.
fig, ax = plt.subplots()
ax.scatter(feature_df['grid_loc'], y.values)
ax.set_xlabel('Grid Location')
ax.set_ylabel('Place ID')
plt.show()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class GridConverter(BaseEstimator, TransformerMixin):
    """Append a combined x/y "grid location" column to a feature matrix.

    The appended column is ``x * 1000 + 9 * 1000 * y``, where ``x`` and ``y``
    are the first and second columns of the input. The transformer learns
    nothing from the data and takes no constructor arguments.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (required by the transformer API)."""
        return self

    def transform(self, X):
        """Return ``X`` with the grid-location column appended on the right.

        Parameters
        ----------
        X : array-like with at least two columns
            Column 0 is used as the x coordinate and column 1 as the y
            coordinate.

        Returns
        -------
        numpy.ndarray
            The input values with one extra trailing column.
        """
        x_coord, y_coord = 0, 1
        # Convert first so that positional slicing also works when a DataFrame
        # (rather than an ndarray) is passed in.
        X = np.asarray(X)
        new_col = X[:, x_coord] * 1000 + (9 * 1000 * X[:, y_coord])
        return np.c_[X, new_col]

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Supervised pipeline: standardise, append the grid-location column, then
# classify with 5-nearest-neighbours.
# NOTE(review): the grid column is computed after StandardScaler, i.e. from the
# standardised x / y rather than the raw coordinates -- confirm this order is
# intended.
second_pl = Pipeline(steps=[
    ('std_scaler', StandardScaler()),
    ('grid_converter', GridConverter()),
    ('knn', KNeighborsClassifier(n_neighbors=5))
])

# `y` is a single-column frame; flatten it to 1-D so the classifier receives
# the label vector in the shape it expects (avoids a DataConversionWarning).
second_pl.fit(X, y.values.ravel())

# Select the feature columns explicitly so the holdout matrix has the same
# columns, in the same order, as the training matrix.
predictions = second_pl.predict(holdout_df[X_cols])

# Keep the holdout features and the predicted label side by side.
predictions_df = holdout_df.assign(predictions=predictions)
predictions_df.head()

In [None]:
# Take the predicted labels and name the series `place_id` so that the CSV
# header reads `row_id,place_id`. Passing a scalar to Series.rename sets the
# series name; the previous dict-based call would have relabelled index entries
# and its result was discarded, so the column kept the name `predictions`.
new_df = predictions_df['predictions'].rename('place_id')
new_df.to_csv('/kaggle/working/df.csv')

In [None]:
# Display the object that was written to the submission CSV as the cell output.
new_df