In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from pprint import pprint

import sys
import warnings
warnings.filterwarnings('ignore')
import tqdm
from tqdm import tqdm

from sklearn.metrics import mean_absolute_error, mean_squared_error

import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs
from scipy.optimize import minimize

import matplotlib.pyplot as plt
import pickle

In [2]:
# Read the zipped training CSV; the DATE column is parsed to datetimes on load.
data = pd.read_csv(
    'train.csv.zip',
    parse_dates=['DATE'],
)

## Данные

In [13]:
from functions import GenerateFis
from score_submission import scorer, score_series
from functions.date_split import split_month_test

In [14]:
# Run the project helper on the raw data; its result is indexable and the first
# element is the frame used for all modelling below.
# NOTE(review): presumably one frame per ATM, judging by the name `atm0` — confirm.
dfs = GenerateFis.GenerateFis(data)
atm0 = dfs[0]

In [15]:
# Split `atm0` into train/test frames with the project helper.
# Per the author's note the arguments mean: 30 days of test, 100 days of train.
train_df, test_df = split_month_test(atm0, 30, 100)

In [16]:
# Features are every column of `atm0` except the date and the target itself.
feature_cols = atm0.columns.difference(['DATE', 'CLIENT_OUT'])

X_train = train_df[feature_cols]
y_train = train_df['CLIENT_OUT']

X_test = test_df[feature_cols]
y_test = test_df['CLIENT_OUT']

## Предсказание

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor

In [8]:
# Линейная регрессия с Lasso и Ridge регуляризацией

In [24]:
# Baseline: ElasticNet with default penalties, scored on the held-out frame.
model = ElasticNet(random_state=0).fit(X_train, y_train)
y_pred = model.predict(X_test)
# Same rows as the test frame, with the target replaced by the predictions.
pred_df = test_df.assign(CLIENT_OUT=y_pred)
scorer(test_df, pred_df)

5628.981880119268

In [23]:
# Linear model trained with stochastic gradient descent, scored like the baseline.
# NOTE(review): the recorded score for this cell is 0.0, which looks suspicious;
# SGD is sensitive to feature scale, so check whether the features need
# standardising before trusting this number.
model = SGDRegressor(random_state=0).fit(X_train, y_train)
y_pred = model.predict(X_test)
pred_df = test_df.assign(CLIENT_OUT=y_pred)
scorer(test_df, pred_df)

0.0

In [None]:
# Serialise whatever estimator `model` currently refers to.
with open("sgd.pickle", 'wb') as sgd_file:
    pickle.dump(model, sgd_file)

In [21]:
# When the notebook is run top to bottom, `model` is the SGDRegressor at this
# point, so dumping `model` would store the wrong estimator under the ElasticNet
# file name. Fit a dedicated ElasticNet (same settings as the baseline cell) and
# persist that instead; `model` itself is left untouched.
elastic_model = ElasticNet(random_state=0).fit(X_train, y_train)
with open("elastic.pickle", 'wb') as elastic_file:
    pickle.dump(elastic_model, elastic_file)

In [9]:
# Sweep the Lasso penalty strength and record the hold-out score per setting.
# NOTE(review): the `normalize=` argument was removed in scikit-learn 1.2, so
# this cell needs an older release to run as written.
alphas = [1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 1]
metrics = {}
for alpha in tqdm(alphas):
    label = 'Lasso with alpha = {}'.format(alpha)
    model = Lasso(alpha=alpha, normalize=True).fit(X_train, y_train)
    y_pred = model.predict(X_test)
    pred_df = test_df.assign(CLIENT_OUT=y_pred)
    metrics[label] = scorer(test_df, pred_df)

100%|██████████| 6/6 [00:18<00:00,  3.14s/it]


In [10]:
# Report the configuration with the lowest score (lower is assumed to be better,
# as the original use of `min` implies). A bare `min(metrics)` compares the
# dictionary keys as strings and therefore returns the alphabetically first
# label, not the best-scoring one; select by value instead.
best_label = min(metrics, key=metrics.get)
best_label, metrics[best_label]

('Lasso with alpha = 0.0001', 5514.556374725053)

In [11]:
# Refit Lasso at a fixed alpha so the fitted model can be saved.
# NOTE(review): 0.0001 matches the label printed by the selection cell; verify
# that it really is the best-scoring alpha before relying on this model.
model = Lasso(normalize=True, alpha=0.0001)
model.fit(X=X_train, y=y_train)

Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [12]:
# Serialise the current `model` (expected to be the Lasso fitted just above).
with open("lasso_n.pickle", 'wb') as lasso_file:
    pickle.dump(model, lasso_file)

In [None]:
# Same sweep with Ridge; the scores are added to the `metrics` dict and the
# `alphas` list created by the Lasso cell is reused.
for alpha in alphas:
    label = 'Ridge with alpha = {}'.format(alpha)
    model = Ridge(alpha=alpha, normalize=True).fit(X_train, y_train)
    y_pred = model.predict(X_test)
    pred_df = test_df.assign(CLIENT_OUT=y_pred)
    metrics[label] = scorer(test_df, pred_df)

In [None]:
# Report the configuration with the lowest score across everything recorded in
# `metrics` (lower is assumed to be better, as the original use of `min` implies).
# A bare `min(metrics)` compares the keys as strings, so a "Lasso ..." label
# would always win over any "Ridge ..." label regardless of score; select by
# value instead.
best_label = min(metrics, key=metrics.get)
best_label, metrics[best_label]

In [None]:
# Решающий лес

In [None]:
"""
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 400, num = 4)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 60, num = 4)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)
"""

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest itself as well: the search's random_state only fixes which
# parameter combinations are sampled, not how each forest is grown, so without
# this the scores and the selected model change from run to run.
rf = RandomForestRegressor(random_state=42)
# 50 sampled parameter combinations, each evaluated with 3-fold cross-validation.
rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=random_grid,
                               n_iter=50,
                               cv=3,
                               verbose=2,
                               random_state=42,
                               n_jobs=40)
rf_random.fit(X_train, y_train)
print(rf_random.best_params_)
# Estimator built with the best parameter combination found by the search.
rf_clf = rf_random.best_estimator_

In [None]:
# Best mean cross-validated score found by the randomized search.
rf_random.best_score_

In [None]:
# NOTE(review): `rf_clf` is the search's `best_estimator_`; with scikit-learn's
# default `refit=True` it is already fitted on X_train, which would make this
# refit redundant — confirm before removing.
rf_clf.fit(X_train, y_train)

```
rf_random.best_params_:
{'bootstrap': False,
 'max_depth': 60,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 400}
 ```

In [None]:
# Serialise the random forest selected by the randomized search.
with open("rf_rcv.pickle", 'wb') as rcv_file:
    pickle.dump(rf_clf, rcv_file)

In [None]:
# Score the selected random forest on the held-out frame.
# `fit` returns the estimator itself, so `model` and `rf_clf` are the same object.
model = rf_clf.fit(X_train, y_train)
y_pred = model.predict(X_test)
pred_df = test_df.assign(CLIENT_OUT=y_pred)
scorer(test_df, pred_df)

In [None]:
# Serialise `rf_clf` again under a second file name.
with open("rf_clf.pickle", 'wb') as forest_file:
    pickle.dump(rf_clf, forest_file)

In [None]:
import xgboost as xgb

# Base estimator handed to the grid search. The cell referenced `xgb1` without
# ever creating it, which raised NameError on a fresh kernel.
xgb1 = xgb.XGBRegressor()

# Exhaustive grid: every combination of the lists below is evaluated.
# NOTE(review): newer XGBoost releases renamed 'reg:linear' to
# 'reg:squarederror' and replaced `silent` with `verbosity`; the values are kept
# as written to match the environment this notebook was run in.
parameters = {'nthread': [4],  # when use hyperthread, xgboost may become slower
              'objective': ['reg:linear'],
              'learning_rate': [.03, 0.05, .07],  # so called `eta` value
              'max_depth': range(3, 10, 2),
              'min_child_weight': range(1, 6, 2),
              'silent': [1],
              'subsample': [i / 10.0 for i in range(6, 10)],
              'colsample_bytree': [i / 10.0 for i in range(6, 10)],
              'n_estimators': [500]}

xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv=3,
                        n_jobs=40,
                        verbose=True)

xgb_grid.fit(X_train, y_train)

print(xgb_grid.best_score_)
print(xgb_grid.best_params_)
# Estimator built with the best parameter combination found by the grid search.
xgb_clf = xgb_grid.best_estimator_

In [None]:
# Score the selected XGBoost model on the held-out frame.
# `fit` returns the estimator itself, so `model` and `xgb_clf` are the same object.
model = xgb_clf.fit(X_train, y_train)
y_pred = model.predict(X_test)
pred_df = test_df.assign(CLIENT_OUT=y_pred)
scorer(test_df, pred_df)

In [None]:
def prediction(train, test, estimator=None):
    """Fit an estimator on `train` and return its predictions for `test`.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame. The 'ATM_ID' column is used as the target and every
        other column as a feature.
    test : pandas.DataFrame
        Feature frame, passed to ``predict`` unchanged.
    estimator : object with ``fit`` and ``predict``, optional
        Estimator to train. When omitted, the notebook-level ``model`` is used,
        as in the original cell.

    Returns
    -------
    Whatever ``estimator.predict(test)`` returns.

    Notes
    -----
    NOTE(review): the rest of the notebook predicts 'CLIENT_OUT'; using
    'ATM_ID' as the target here looks unintended — confirm before relying on it.
    """
    estimator = model if estimator is None else estimator
    X_train = train.loc[:, train.columns != 'ATM_ID']
    y_train = train['ATM_ID']
    estimator.fit(X_train, y_train)
    # The original predicted with an undefined `lr`; use the estimator that was
    # just fitted.
    return estimator.predict(test)