In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
date_parser = lambda x: pd.datetime.strptime(x, "%d.%m.%Y %H:%M:%S")
df_targets = pd.read_csv('kaggle_data_01.csv', parse_dates=[1], date_parser=date_parser)

In [3]:
persons = df_targets['person_id'].unique()

df = df_targets.groupby(['person_id']).count()
known_persons = df.loc[df['Prediction1'] != 0].index.values
unknown_persons = df.loc[df['Prediction1'] == 0].index.values

---

In [4]:
date_parser = lambda x: pd.datetime.strptime(x, "%Y-%m-%d")

df_data = pd.read_csv('kaggle_data_02.csv', header=None, parse_dates=[1], date_parser=date_parser)
header = list(np.arange(df_data.shape[1] - 2))
header = ['person_id', 'date'] + header
df_data.columns = header

---

In [5]:
# Function to calculate missing values by column
def missing_values_table(df):
        mis_val = df.isnull().sum()       
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing', 1 : '% of Total'})
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values('% of Total', ascending=False).round(1)
        return mis_val_table_ren_columns

In [6]:
df_missing_known = missing_values_table(df_data[df_data['person_id'].isin(known_persons)])
df_missing_known.head(10)

Unnamed: 0,Missing,% of Total
138,347,19.0
74,212,11.6
100,161,8.8
40,95,5.2
160,81,4.4
166,62,3.4
256,49,2.7
180,47,2.6
190,44,2.4
98,38,2.1


In [7]:
df_missing_unknown = missing_values_table(df_data[df_data['person_id'].isin(unknown_persons)])
df_missing_unknown.head(10)

Unnamed: 0,Missing,% of Total
90,195,35.5
160,102,18.6
84,29,5.3
76,15,2.7
88,11,2.0
80,9,1.6
86,9,1.6
190,9,1.6
74,5,0.9
16,4,0.7


In [8]:
# столбцы, в которых много пропущенных значений, может быть стоит убрать из рассмотрения
# остальные значения обнулим

df_data.fillna(0);

In [9]:
# пронормируем фичи
from sklearn.preprocessing import StandardScaler

columns = list(np.arange(df_data.shape[1] - 2))
df_data_scaler = df_data[columns]
scaler = StandardScaler()
df_data_scaler = pd.DataFrame(scaler.fit_transform(df_data_scaler), columns=columns)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [10]:
df_data_scaler['person_id'] = df_data['person_id']
df_data_scaler['date'] = df_data['date']

---

In [11]:
# разобьем выборку по временным интервалам между выставлениями оценок для каждого пользователя

# интервалы времени между оценками по пользователю
# возвращаем даты начала и окончания диапазона + Id записи

def get_ranges(data, person):
    ranges = []
    df = data[data['person_id'] == person]
    for idx in range(df.shape[0]):
        start_date = end_date if (idx > 0) else (df.iloc[idx]['date'] - pd.to_timedelta('7 days'))
        end_date = df.iloc[idx]['date']
        ranges.append([start_date, end_date, df.iloc[idx]['Id']])
    return ranges

In [12]:
# данные фич по пользователю за заданный интервал времени
# dates[0] - start date
# dates[1] - end date

def get_range_data(data, person, dates):
    df = data[data['person_id'] == person]
    mask = (df['date'] > dates[0]) & (df['date'] < dates[1])
    return df[mask]

In [229]:
# Усредняем каждую фичу за интервал между выставлениями оценок пользователем

def data_mean(persons, targets, data):
    X = []
    Ids = []
    for person in persons:
        ranges = get_ranges(targets, person)
        for ra in ranges:
            da = get_range_data(data, person, ra)              
            xa = da.mean() if not da.empty else 0
            X.append(xa)
    return [X, Ids]

---

In [222]:
# выделим выборки целевых значений

y_true = df_targets[df_targets['person_id'].isin(known_persons)]
y_target = df_targets[df_targets['person_id'].isin(unknown_persons)]

In [223]:
def answer_to_file(predicted, Ids, file_name):    
    df_answer = pd.DataFrame(data=Ids, columns=['Id'])
    df_answer['Prediction1'] = predicted
    df_answer.to_csv(file_name, index=False)

---

In [224]:
# Вариант 0. Предсказывать медианное значение по исходной выборке

y_pred = y_true['Prediction1'].quantile(q=0.50) * np.ones(y_target.shape[0])
answer_to_file(y_pred, y_target['Id'].values, 'baseline.csv')

---

In [230]:
# Вариант 1. Усредняем каждую фичу за интервал между выставлениями оценок пользователем. Дальше - используем линейную регрессию

# Добавляем вектор из единиц для коэффициента w0 линейной регрессии.
df_data_scaler['bias'] = 1

X_true = data_mean(known_persons, df_targets, df_data_scaler)
X_target = data_mean(unknown_persons, df_targets, df_data_scaler)

In [226]:
# удаляем оценки, по которым нет предшествующих данных

#y_true = y_true.drop(y_true[y_true['Id'].isin(Ids_true)].index)
#y_target = y_target.drop(y_target[y_target['Id'].isin(Ids_target)].index)

In [241]:
# обнуляем пустые значения фич

X_true = np.array(X_true)
X_true[np.isnan(X_true)] = 0

X_target = np.array(X_target)
X_target[np.isnan(X_target)] = 0

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [214]:
X_train, X_test, y_train, y_test = train_test_split(X_true, y_true['Prediction1'].values, test_size=0.2, random_state=42)

---
Linear Regression

In [175]:
from sklearn.linear_model import LinearRegression

In [192]:
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [193]:
linear_reg.score(X_test, y_test)

-1.4209208003885672

In [194]:
y_pred = linear_reg.predict(X_target)

In [195]:
y_pred.round()

array([ 4.,  4.,  4.,  4.,  3.,  6.,  6.,  3.,  6.,  6.,  4.,  7.,  6.,
        6.,  5.,  9., 10., 10.,  5.,  5.,  4.,  4.,  4.,  4.,  5.,  4.,
        4.,  4.,  3.,  6.,  6.,  3.,  6.,  6.,  4.,  7.,  6.,  6.,  5.,
        9., 10., 10.,  5.,  5.,  4.,  4.,  4.,  4.,  5.,  3.,  5.,  4.,
        4.,  2.,  4.,  1.,  3.,  3.,  6.,  3.,  4.,  4.,  2.,  3.,  4.,
        1.,  6.,  2.,  4.,  2.,  2.,  1.,  3.,  3.,  2.,  2.,  5.,  4.,
        4.,  2.,  4.,  1.,  3.,  3.,  6.,  3.,  4.,  4.,  2.,  3.,  4.,
        1.,  6.,  2.,  4.,  2.,  2.,  1.,  3.,  3.,  2.,  2., 10.,  2.,
        4.,  2.,  3.,  4.,  4.,  7.,  5.,  3.,  0.,  4.,  1.,  2.,  3.,
        2.,  5.,  3.,  3.,  3.,  4.,  4.,  4.,  5.,  4.,  4.,  2.,  4.,
        2.,  3.,  4.,  4.,  7.,  5.,  3.,  0.,  4.,  1.,  2.,  3.,  2.,
        5.,  3.,  3.,  3.,  4.,  4.,  4.,  5.,  4.,  4.])

In [74]:
answer_to_file(y_pred, y_target['Id'].values, 'linear_regression.csv')

---
Random Forest Regressor

In [202]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [189]:
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [197]:
forest_reg.score(X_test, y_test)

0.09767736821244544

---
Gradient boosting

In [198]:
from sklearn import ensemble
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
boost_reg = ensemble.GradientBoostingRegressor()

param_grid = {'C': np.arange(1, 5), 'penalty': ['l1', 'l2']}

search = GridSearchCV(logistic_reg, param_grid, n_jobs=-1, cv=5, refit=True, scoring='accuracy')
search.fit(X_train, y_train)

In [199]:
boost_reg = ensemble.GradientBoostingRegressor()
boost_reg.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [200]:
boost_reg.score(X_test, y_test)

0.042049635747543834

---
Logistic Regression

In [215]:
from sklearn.linear_model import LogisticRegression

In [216]:
logistic_reg = LogisticRegression(solver='saga', multi_class='multinomial')

param_grid = {'C': np.arange(1, 5), 'penalty': ['l1', 'l2']}

search = GridSearchCV(logistic_reg, param_grid, n_jobs=-1, cv=5, refit=True, scoring='accuracy')
search.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': array([1, 2, 3, 4]), 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [217]:
search.best_params_

{'C': 1, 'penalty': 'l2'}

In [205]:
accuracy_score(y_train, search.best_estimator_.predict(X_train))

0.9170731707317074

In [218]:
logistic_reg = LogisticRegression(C=1, penalty='l2', solver='saga', multi_class='multinomial')
logistic_reg.fit(X_train, y_train)
pred_prob = logistic_reg.predict_proba(X_test)



In [219]:
y_pred = logistic_reg.predict(X_target)

In [220]:
answer_to_file(y_pred, y_target['Id'].values, 'logistic_regression.csv')

In [5]:
# есть подозрение, что нужно пронормировать данные, но это не точно

---

In [3]:
# делаем one hot encoding, не знаю, насколько это здесь нужно

In [7]:
for person in persons:
    df_data[person] = pd.Series(data=(df_data['person_id'] == person).astype('float'), index=df_data.index)

features

In [8]:
X_labels = np.delete(df_data.columns.values, [0, 1])

In [9]:
df_X = df_data[df_data['person_id'].isin(known_persons)]
X = df_X[X_labels].values
X[np.isnan(X)] = 0

In [10]:
df_X_target = df_data[df_data['person_id'].isin(target_persons)]
X_target = df_X_target[X_labels].values
X_target[np.isnan(X_target)] = 0

answers

In [11]:
def get_prediction(person, date):
    
    df = df_predictions[df_predictions['person_id'] == person]
    
    for idx in range(df.shape[0]):
        start_date = end_date if (idx > 0) else (df.iloc[idx]['date'] - pd.to_timedelta('7 days'))
        end_date = df.iloc[idx]['date']
        mask = (date > start_date) & (date <= end_date)
        if mask:
            return df.iloc[idx]['Prediction1'] 
    return 0

In [12]:
y = np.zeros(X.shape[0])
y_target = np.zeros(X_target.shape[0])

In [13]:
for idx in range(X.shape[0]):
    y[idx] = get_prediction(df_X.iloc[idx]['person_id'], df_X.iloc[idx]['date'])
    #print('person = {}, date = {}, val = {}'.format(df_X.iloc[idx]['person_id'], df_X.iloc[idx]['date'], y[idx]))

---

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Logistic Regression

In [16]:
clf = LogisticRegression(solver='saga', max_iter=200, multi_class='multinomial')

param_grid = {'C': np.arange(1, 5), 'penalty': ['l1', 'l2']}

search = GridSearchCV(clf, param_grid, n_jobs=-1, cv=5, refit=True, scoring='accuracy')
search.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': array([1, 2, 3, 4]), 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [17]:
search.best_params_

{'C': 3, 'penalty': 'l2'}

In [18]:
accuracy_score(y_train, search.best_estimator_.predict(X_train))

0.5635245901639344

In [24]:
optimal_clf = LogisticRegression(C=3, penalty='l2', solver='saga', max_iter=200, multi_class='multinomial')
optimal_clf.fit(X_train, y_train)
pred_prob = optimal_clf.predict_proba(X_test)



In [25]:
pred_prob

array([[1.82303191e-03, 3.93654722e-02, 5.55126205e-02, 8.20919688e-01,
        8.01838760e-02, 2.19531131e-03],
       [4.78618194e-03, 8.15590737e-02, 3.25747731e-02, 2.02872916e-02,
        7.94969185e-01, 6.58234950e-02],
       [2.73111315e-04, 8.71674652e-03, 7.01162711e-01, 4.12115116e-02,
        2.46791657e-01, 1.84426270e-03],
       ...,
       [1.28463536e-01, 2.54449341e-02, 1.27131907e-01, 6.32734254e-01,
        5.36400920e-02, 3.25852765e-02],
       [4.08065181e-03, 1.30562908e-02, 6.02840759e-01, 2.14474663e-01,
        1.64015926e-01, 1.53170940e-03],
       [3.79482808e-02, 3.37482089e-01, 1.89276665e-01, 2.04731216e-01,
        1.96347207e-01, 3.42145424e-02]])

---

K-Neighbors Classifier

In [20]:
clf = KNeighborsClassifier()

param_grid = {'weights': ['uniform', 'distance'], 'n_neighbors': np.arange(10, 20), 'metric': ['manhattan', 'euclidean']}

search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=3)
search.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   52.4s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.5min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'weights': ['uniform', 'distance'], 'n_neighbors': array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19]), 'metric': ['manhattan', 'euclidean']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=3)

In [21]:
search.best_params_

{'metric': 'manhattan', 'n_neighbors': 13, 'weights': 'distance'}

In [22]:
optimal_clf = KNeighborsClassifier(n_neighbors=13, metric='manhattan', weights='distance')
optimal_clf.fit(X_train, y_train)
pred_prob = optimal_clf.predict_proba(X_test)

In [23]:
pred_prob

array([[0.        , 0.        , 0.        , 0.92900969, 0.07099031,
        0.        ],
       [0.07118421, 0.0733502 , 0.        , 0.        , 0.71389433,
        0.14157125],
       [0.        , 0.        , 0.76070257, 0.15971696, 0.07958047,
        0.        ],
       ...,
       [0.15659164, 0.07815148, 0.15334241, 0.53662788, 0.07528658,
        0.        ],
       [0.        , 0.        , 0.46211895, 0.30677005, 0.23111101,
        0.        ],
       [0.159373  , 0.07960921, 0.15239564, 0.3808546 , 0.07592782,
        0.15183974]])

---