In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import confusion_matrix

In [11]:
import re

def atoi(text):
    return int(text) if text.isdigit() else text

def natural_keys(text):
    '''
    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    return [ atoi(c) for c in re.split(r'(\d+)', text) ]

---

In [12]:
date_parser = lambda x: pd.datetime.strptime(x, "%d.%m.%Y %H:%M:%S")
df_predictions = pd.read_csv('kaggle_data_01.csv', parse_dates=[1], date_parser=date_parser)

In [13]:
df_predictions.head()

Unnamed: 0,person_id,date,Id,Prediction1
0,person_9,2017-06-02 10:03:36,1,3.0
1,person_9,2017-06-09 08:34:08,2,3.0
2,person_9,2017-06-16 10:19:03,3,3.0
3,person_9,2017-06-23 12:08:57,4,3.0
4,person_9,2017-06-30 12:48:39,5,4.0


In [14]:
df_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 4 columns):
person_id      414 non-null object
date           414 non-null datetime64[ns]
Id             414 non-null int64
Prediction1    258 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 13.0+ KB


In [15]:
persons = df_predictions['person_id'].unique()

In [16]:
for person in persons:
    df_predictions[person] = pd.Series(data=(df_predictions['person_id'] == person).astype('int64'), index=df_predictions.index)
df_predictions.drop(columns=['person_id', 'Id'], inplace=True)
df_predictions.set_index('date', inplace=True)
df_predictions.head()

Unnamed: 0_level_0,Prediction1,person_9,person_5,person_1,person_2,person_10,person_6,person_12,person_7,person_8,person_3,person_11,person_13,person_4
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2017-06-02 10:03:36,3.0,1,0,0,0,0,0,0,0,0,0,0,0,0
2017-06-09 08:34:08,3.0,1,0,0,0,0,0,0,0,0,0,0,0,0
2017-06-16 10:19:03,3.0,1,0,0,0,0,0,0,0,0,0,0,0,0
2017-06-23 12:08:57,3.0,1,0,0,0,0,0,0,0,0,0,0,0,0
2017-06-30 12:48:39,4.0,1,0,0,0,0,0,0,0,0,0,0,0,0


---

In [6]:
date_parser = lambda x: pd.datetime.strptime(x, "%Y-%m-%d")

df_data = pd.read_csv('kaggle_data_02.csv', header=None, parse_dates=[1], date_parser=date_parser)
header = list(np.arange(df_data.shape[1]))
header[0] = 'person_id'
header[1] = 'date'
df_data.columns = header

In [7]:
df_data.head()

Unnamed: 0,person_id,date,2,3,4,5,6,7,8,9,...,458,459,460,461,462,463,464,465,466,467
0,person_1,2017-06-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,person_1,2017-06-02,29.0,12.0,37.0,32.0,128.0,16.0,2.0,15.0,...,16.0,0.0,57.0,15.0,16.0,3.0,0.0,21.0,7.0,10.0
2,person_1,2017-06-03,10.0,4.0,10.0,8.0,34.0,4.0,1.0,4.0,...,4.0,0.0,11.0,4.0,4.0,1.0,2.0,19.0,6.0,6.0
3,person_1,2017-06-04,3.0,3.0,3.0,6.0,40.0,3.0,1.0,3.0,...,3.0,0.0,6.0,3.0,3.0,0.0,1.0,16.0,4.0,4.0
4,person_1,2017-06-05,4.0,3.0,12.0,11.0,62.0,4.0,1.0,4.0,...,4.0,0.0,17.0,4.0,4.0,2.0,2.0,20.0,6.0,7.0


In [8]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2379 entries, 0 to 2378
Columns: 468 entries, person_id to 467
dtypes: datetime64[ns](1), float64(464), int64(2), object(1)
memory usage: 8.5+ MB


----

In [10]:
df = df_predictions.groupby(['person_id']).count()
new_persons = df.loc[df['Prediction1'] == 0].index.values
known_persons = df.loc[df['Prediction1'] != 0].index.values

In [11]:
predictions = []
for person in persons:
    df = df_predictions[df_predictions['person_id'] == person].drop(columns=['person_id', 'Id'])
    df.set_index('date', inplace=True)
    predictions.append(df)

In [12]:
all_data = []
for person in persons:
    df = df_data[df_data['person_id'] == person].drop(columns=['person_id'])
    df.set_index('date', inplace=True)
    header = list(np.arange(df.shape[1]))
    df.columns = header
    all_data.append(df)

In [26]:
feature_matrices = []
label_vectors = []

for idx, (prediction, data) in enumerate(zip(predictions, all_data)):
    features = []
    for idx_date, date_value in enumerate(prediction.index):
        start = end if (idx_date > 0) else (date_value - pd.to_timedelta('7 days'))
        end = date_value
        mask = (data.index > start) & (data.index <= end)
        features.append(data.loc[mask])
    
    feature_matrices.append(features)
    label_vectors.append(prediction.values)

---

In [41]:
label_vectors[12]

array([[3.],
       [3.],
       [4.],
       [3.],
       [3.],
       [4.],
       [3.],
       [3.],
       [3.],
       [4.],
       [3.],
       [3.],
       [3.],
       [4.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.],
       [3.]])

In [14]:
for check in data_by_checks:
    for data in check:
        labels = data[data.columns[-1]].values


In [38]:
data = data_by_checks[0][0]
feature_matrix = data.values

In [43]:
labels = predictions[0][0]
feature_matrix.shape[0]

KeyError: 0

In [42]:
labels

Unnamed: 0_level_0,Prediction1
date,Unnamed: 1_level_1
2017-06-02 10:03:36,3.0
2017-06-09 08:34:08,3.0
2017-06-16 10:19:03,3.0
2017-06-23 12:08:57,3.0
2017-06-30 12:48:39,4.0
2017-07-07 13:33:31,3.0
2017-07-14 05:14:13,3.0
2017-07-21 05:10:07,3.0
2017-07-28 05:19:23,3.0
2017-08-04 05:38:31,3.0


In [36]:
from sklearn.model_selection import train_test_split

In [37]:
train_feature_matrix, test_feature_matrix, train_labels, test_labels = train_test_split(feature_matrix, labels, test_size=0.2, random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [7, 2]

In [25]:
known_persons_data = data[data['person_id'].isin(known_persons)]

In [37]:
target[target['person_id'] == 'person_8']['date'].dt.weekday

208    4
209    4
210    4
211    4
212    4
213    4
214    5
215    4
216    4
217    4
218    4
219    5
220    4
221    4
222    4
223    5
224    5
225    4
226    4
227    4
228    4
229    5
230    4
231    4
232    4
233    4
Name: date, dtype: int64

In [67]:
data[data['person_id'] == 'person_1']['date']

0     2017-06-01
1     2017-06-02
2     2017-06-03
3     2017-06-04
4     2017-06-05
5     2017-06-06
6     2017-06-07
7     2017-06-08
8     2017-06-09
9     2017-06-10
10    2017-06-11
11    2017-06-12
12    2017-06-13
13    2017-06-14
14    2017-06-15
15    2017-06-16
16    2017-06-17
17    2017-06-18
18    2017-06-19
19    2017-06-20
20    2017-06-21
21    2017-06-22
22    2017-06-23
23    2017-06-24
24    2017-06-25
25    2017-06-26
26    2017-06-27
27    2017-06-28
28    2017-06-29
29    2017-06-30
         ...    
153   2017-11-01
154   2017-11-02
155   2017-11-03
156   2017-11-04
157   2017-11-05
158   2017-11-06
159   2017-11-07
160   2017-11-08
161   2017-11-09
162   2017-11-10
163   2017-11-11
164   2017-11-12
165   2017-11-13
166   2017-11-14
167   2017-11-15
168   2017-11-16
169   2017-11-17
170   2017-11-18
171   2017-11-19
172   2017-11-20
173   2017-11-21
174   2017-11-22
175   2017-11-23
176   2017-11-24
177   2017-11-25
178   2017-11-26
179   2017-11-27
180   2017-11-

In [82]:
data[data['person_id'] == 'person_1'].iloc[4][2:].max()

8051.0

In [None]:
for person in known_persons:
    data[data['person_id'] == person].