In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import confusion_matrix

---

In [2]:
date_parser = lambda x: pd.datetime.strptime(x, "%d.%m.%Y %H:%M:%S")
df_predictions = pd.read_csv('kaggle_data_01.csv', parse_dates=[1], date_parser=date_parser)

In [3]:
df_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 4 columns):
person_id      414 non-null object
date           414 non-null datetime64[ns]
Id             414 non-null int64
Prediction1    258 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 13.0+ KB


In [4]:
persons = df_predictions['person_id'].unique()

df = df_predictions.groupby(['person_id']).count()
known_persons = df.loc[df['Prediction1'] != 0].index.values
target_persons = df.loc[df['Prediction1'] == 0].index.values

---

In [5]:
date_parser = lambda x: pd.datetime.strptime(x, "%Y-%m-%d")

df_data = pd.read_csv('kaggle_data_02.csv', header=None, parse_dates=[1], date_parser=date_parser)
header = list(np.arange(df_data.shape[1] - 2))
header = ['person_id', 'date'] + header
df_data.columns = header

In [6]:
df_data.head()

Unnamed: 0,person_id,date,0,1,2,3,4,5,6,7,...,456,457,458,459,460,461,462,463,464,465
0,person_1,2017-06-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,person_1,2017-06-02,29.0,12.0,37.0,32.0,128.0,16.0,2.0,15.0,...,16.0,0.0,57.0,15.0,16.0,3.0,0.0,21.0,7.0,10.0
2,person_1,2017-06-03,10.0,4.0,10.0,8.0,34.0,4.0,1.0,4.0,...,4.0,0.0,11.0,4.0,4.0,1.0,2.0,19.0,6.0,6.0
3,person_1,2017-06-04,3.0,3.0,3.0,6.0,40.0,3.0,1.0,3.0,...,3.0,0.0,6.0,3.0,3.0,0.0,1.0,16.0,4.0,4.0
4,person_1,2017-06-05,4.0,3.0,12.0,11.0,62.0,4.0,1.0,4.0,...,4.0,0.0,17.0,4.0,4.0,2.0,2.0,20.0,6.0,7.0


---

one hot encoding attempt

In [7]:
for person in persons:
    df_data[person] = pd.Series(data=(df_data['person_id'] == person).astype('float'), index=df_data.index)

features

In [8]:
X_labels = np.delete(df_data.columns.values, [0, 1])

In [9]:
df_X = df_data[df_data['person_id'].isin(known_persons)]
X = df_X[X_labels].values
X[np.isnan(X)] = 0

In [10]:
df_X_target = df_data[df_data['person_id'].isin(target_persons)]
X_target = df_X_target[X_labels].values
X_target[np.isnan(X_target)] = 0

answers

In [11]:
def get_prediction(person, date):
    
    df = df_predictions[df_predictions['person_id'] == person]
    
    for idx in range(df.shape[0]):
        start_date = end_date if (idx > 0) else (df.iloc[idx]['date'] - pd.to_timedelta('7 days'))
        end_date = df.iloc[idx]['date']
        mask = (date > start_date) & (date <= end_date)
        if mask:
            return df.iloc[idx]['Prediction1'] 
    return 0

In [12]:
y = np.zeros(X.shape[0])
y_target = np.zeros(X_target.shape[0])

In [13]:
for idx in range(X.shape[0]):
    y[idx] = get_prediction(df_X.iloc[idx]['person_id'], df_X.iloc[idx]['date'])
    #print('person = {}, date = {}, val = {}'.format(df_X.iloc[idx]['person_id'], df_X.iloc[idx]['date'], y[idx]))

---

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Logistic Regression

In [16]:
clf = LogisticRegression(solver='saga', max_iter=200, multi_class='multinomial')

param_grid = {'C': np.arange(1, 5), 'penalty': ['l1', 'l2']}

search = GridSearchCV(clf, param_grid, n_jobs=-1, cv=5, refit=True, scoring='accuracy')
search.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': array([1, 2, 3, 4]), 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [17]:
search.best_params_

{'C': 3, 'penalty': 'l2'}

In [18]:
accuracy_score(y_train, search.best_estimator_.predict(X_train))

0.5635245901639344

In [24]:
optimal_clf = LogisticRegression(C=3, penalty='l2', solver='saga', max_iter=200, multi_class='multinomial')
optimal_clf.fit(X_train, y_train)
pred_prob = optimal_clf.predict_proba(X_test)



In [25]:
pred_prob

array([[1.82303191e-03, 3.93654722e-02, 5.55126205e-02, 8.20919688e-01,
        8.01838760e-02, 2.19531131e-03],
       [4.78618194e-03, 8.15590737e-02, 3.25747731e-02, 2.02872916e-02,
        7.94969185e-01, 6.58234950e-02],
       [2.73111315e-04, 8.71674652e-03, 7.01162711e-01, 4.12115116e-02,
        2.46791657e-01, 1.84426270e-03],
       ...,
       [1.28463536e-01, 2.54449341e-02, 1.27131907e-01, 6.32734254e-01,
        5.36400920e-02, 3.25852765e-02],
       [4.08065181e-03, 1.30562908e-02, 6.02840759e-01, 2.14474663e-01,
        1.64015926e-01, 1.53170940e-03],
       [3.79482808e-02, 3.37482089e-01, 1.89276665e-01, 2.04731216e-01,
        1.96347207e-01, 3.42145424e-02]])

---

K-Neighbors Classifier

In [20]:
clf = KNeighborsClassifier()

param_grid = {'weights': ['uniform', 'distance'], 'n_neighbors': np.arange(10, 20), 'metric': ['manhattan', 'euclidean']}

search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=3)
search.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   52.4s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.5min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'weights': ['uniform', 'distance'], 'n_neighbors': array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19]), 'metric': ['manhattan', 'euclidean']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=3)

In [21]:
search.best_params_

{'metric': 'manhattan', 'n_neighbors': 13, 'weights': 'distance'}

In [22]:
optimal_clf = KNeighborsClassifier(n_neighbors=13, metric='manhattan', weights='distance')
optimal_clf.fit(X_train, y_train)
pred_prob = optimal_clf.predict_proba(X_test)

In [23]:
pred_prob

array([[0.        , 0.        , 0.        , 0.92900969, 0.07099031,
        0.        ],
       [0.07118421, 0.0733502 , 0.        , 0.        , 0.71389433,
        0.14157125],
       [0.        , 0.        , 0.76070257, 0.15971696, 0.07958047,
        0.        ],
       ...,
       [0.15659164, 0.07815148, 0.15334241, 0.53662788, 0.07528658,
        0.        ],
       [0.        , 0.        , 0.46211895, 0.30677005, 0.23111101,
        0.        ],
       [0.159373  , 0.07960921, 0.15239564, 0.3808546 , 0.07592782,
        0.15183974]])

---