In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings

# Notebook-wide presentation settings.
warnings.simplefilter('ignore')          # suppress every warning category for the whole session
sns.set_style('whitegrid')               # white background with grid lines on all seaborn plots
pd.options.display.max_columns = None    # never truncate columns when a frame is displayed

In [None]:
# Importing the dataset.
#
# Declaring the dtypes up front makes pandas load each column into the right
# numeric type instead of inferring it (or falling back to strings, which can
# take up a lot of memory).
#
# The widths are spelled out explicitly: a plain `int` can map to a 32-bit
# integer on some platform / NumPy combinations, which is too narrow for the
# ~10-digit place_id values. `time` is listed as well so that every column of
# the training file has a declared type.
types = {'row_id': np.int64,
         'x': np.float64,
         'y': np.float64,
         'accuracy': np.int64,
         'time': np.int64,
         'place_id': np.int64}

df_train = pd.read_csv('../input/train.csv', dtype=types, index_col='row_id')
# NOTE(review): the test file is still loaded with inferred dtypes; its schema
# is not visible here, so confirm the columns before reusing `types` for it.
df_test = pd.read_csv('../input/test.csv', index_col='row_id')
df_train.head(3)

In [None]:
# Plain-text overview of the training frame: size, column names, number of
# distinct places, dtypes, the info() summary, shape and describe() statistics.
print('Reading train data')
print('\nSize of training data: ' + str(df_train.shape))
print('Columns:' + str(df_train.columns.values))
# nunique() counts the distinct place_ids directly, without first copying the
# whole column into a Python list and set.
print('Number of places: ' + str(df_train['place_id'].nunique()))
print('\n')
print('dtypes')
print('\n')
print(df_train.dtypes)
print('\n')
print('Info: ')
print('\n')
# info() has to be *called*: it writes its summary to stdout itself and
# returns None, so it is not wrapped in print(). Printing the bare attribute
# would only show the bound-method repr.
df_train.info()
print('Shape: ')
print('\n')
print(df_train.shape)
print('\n')
print('numerical columns statistcs')
print('\n')
print(df_train.describe())

Exploring the columns with Permutation and Random Sampling
To select a random subset without replacement, one way is to slice off the first k elements of an array returned by permutation, where k is the desired subset size.

In [None]:
# Draw a random ordering of the positions 0..4, then display the first five
# rows of the training frame in that order (positional lookup).
sampler = np.random.permutation(5)
df_train.iloc[sampler]

In [None]:
# Random subset without replacement: shuffle every row position, keep the
# first three, and look those rows up positionally.
randomSample = df_train.iloc[np.random.permutation(df_train.shape[0])[:3]]
randomSample

row_id

The primary key for our data.

x, y

We're told this is a 10 km x 10 km square and, indeed, x and y run between 0 and 10, so it’s probably fair to assume the units of x and y are kilometers. The precision of x and y is 0.0001 km, which is 10 cm (that’s about 4 inches in Menlo Park).
accuracy

This is interesting. accuracy varies between 1-1033 with an average of 82.8. It's given as an integer which is odd.
time

Again an integer. Based on previous analyses, we suspect its units are minutes.

place_id

place_id ranges between $10^9$ and $10^{10}$. There are $9\times10^9$ possible unique values but only $2.9\times10^7$ were used, 0.3% of the availability.

In [None]:
# How many place_id entries exist in total, how many of them are distinct,
# and the ratio between the two.
place_ids = df_train['place_id']
nb_total = place_ids.count()
nb_unique = place_ids.nunique()

print('Number place_ids: {}'.format(nb_total))
print('Unique place_ids: {}'.format(nb_unique))
print("Average number of duplicates: %.1f" % (nb_total/nb_unique))

Part 2 - Quick visualisations

In [None]:
# One stacked panel per column, each showing that column's distribution and
# titled with the column name.
# NOTE(review): distplot is deprecated in newer seaborn releases; confirm the
# installed version before upgrading and switch to histplot/displot then.
plot_columns = ['x', 'y', 'accuracy', 'time', 'place_id']
f, axarr = plt.subplots(len(plot_columns), figsize=(15, 25))

for ax, column in zip(axarr, plot_columns):
    sns.distplot(df_train[column], ax=ax)
    ax.set_title(column)

plt.tight_layout()
plt.show()

The two dips in time in the training set are curious; when looking at counts per unit time, they might need to be normalised.

OK, so most places appear around 100 times.