In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pprint import pprint

import sys
import warnings
warnings.filterwarnings('ignore')
import tqdm
from tqdm import tqdm

from sklearn.metrics import mean_absolute_error, mean_squared_error

import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs
from scipy.optimize import minimize

import matplotlib.pyplot as plt

In [2]:
# Read the zipped training CSV; the DATE column is parsed into datetimes on load.
data = pd.read_csv(
    'train.csv.zip',
    parse_dates=['DATE'],
)

## Данные

In [3]:
from functions import GenerateFis
from score_submission import scorer, score_series
from functions.date_split import split_month_test

In [4]:
# Run the project helper over the raw table.
# NOTE(review): the return type is not visible here; presumably one frame per ATM,
# since a later cell binds dfs[0] to `atm0` — confirm.
dfs = GenerateFis.GenerateFis(data)

In [7]:
# Work with the entry at index 0 only (presumably the frame of a single ATM — confirm).
atm0 = dfs[0]

In [11]:
# Hold-out split done by the project helper; it returns the train and test frames in that order.
train_df, test_df = split_month_test(atm0, 30, 100) # 30 days test, 100 days train

In [13]:
# Features are every column except the date stamp and the target; CLIENT_OUT is the target.
feature_cols = atm0.columns.difference(['DATE', 'CLIENT_OUT'])

X_train = train_df[feature_cols]
y_train = train_df.CLIENT_OUT

X_test = test_df[feature_cols]
y_test = test_df.CLIENT_OUT

## Предсказание

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

In [8]:
# Linear regression with Lasso and Ridge regularization

In [21]:
# Sweep the L1 penalty strength and record the hold-out score of every fit in `metrics`.
# NOTE(review): `normalize=True` is only accepted by older scikit-learn releases —
# confirm against the version this notebook is pinned to.
alphas = [1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 1]
metrics = {}
for alpha in tqdm(alphas):
    label = 'Lasso with alpha = {}'.format(alpha)
    model = Lasso(alpha=alpha, normalize=True).fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Score a copy of the test frame whose CLIENT_OUT column holds the predictions.
    pred_df = test_df.assign(CLIENT_OUT=y_pred)
    metrics[label] = scorer(test_df, pred_df)

100%|██████████| 6/6 [00:19<00:00,  3.17s/it]


In [22]:
# `min(metrics)` on its own compares the dict's keys (the label strings) alphabetically,
# so pick the label by its recorded score instead.
# Assumes a lower scorer value is better, as the original use of `min` implies — confirm.
best_label = min(metrics, key=metrics.get)
best_label, metrics[best_label]

('Lasso with alpha = 0.0001', 5206.638936451033)

In [25]:
# Same sweep with an L2 penalty; results are added to the existing `metrics` dict.
# NOTE(review): `normalize=True` is only accepted by older scikit-learn releases —
# confirm against the version this notebook is pinned to.
for alpha in alphas:
    label = 'Ridge with alpha = {}'.format(alpha)
    model = Ridge(alpha=alpha, normalize=True).fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Score a copy of the test frame whose CLIENT_OUT column holds the predictions.
    pred_df = test_df.assign(CLIENT_OUT=y_pred)
    metrics[label] = scorer(test_df, pred_df)

In [28]:
# `min(metrics)` on its own compares the dict's keys (the label strings) alphabetically,
# so pick the label by its recorded score instead.
# Assumes a lower scorer value is better, as the original use of `min` implies — confirm.
best_label = min(metrics, key=metrics.get)
best_label, metrics[best_label]

('Lasso with alpha = 0.0001', 5206.638936451033)

In [169]:
# Random forest

In [19]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the random-forest hyperparameter sweep.
# NOTE(review): 'auto' for max_features is not accepted by every scikit-learn release —
# confirm against the version this notebook is pinned to.
n_estimators = list(range(200, 2001, 200))       # number of trees: 200, 400, ..., 2000
max_features = ['auto', 'sqrt']                  # features considered at every split
max_depth = list(range(10, 111, 10)) + [None]    # depth caps 10..110, plus None
min_samples_split = [2, 5, 10]                   # minimum samples required to split a node
min_samples_leaf = [1, 2, 4]                     # minimum samples required at a leaf
bootstrap = [True, False]                        # whether each tree is trained on a bootstrap sample

# Assemble the grid under the keyword names the estimator expects, then show it.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [20]:
from sklearn.ensemble import RandomForestRegressor

# Randomised hyperparameter search for the forest: 100 sampled configurations,
# each evaluated with 3-fold cross-validation.
# The forest itself is seeded as well: the search's `random_state` only fixes which
# configurations are sampled, so without a seeded estimator the fitted trees (and
# therefore the scores and the chosen model) change from run to run.
rf = RandomForestRegressor(random_state=42)
rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=random_grid,
                               n_iter=100,
                               cv=3,
                               verbose=2,
                               random_state=42,
                               n_jobs=40)  # NOTE(review): worker count is hard-coded for the original machine
rf_random.fit(X_train, y_train)
print(rf_random.best_params_)
# Best configuration, refitted on the full training set by the search.
# (The name says "clf", but this is a regressor.)
rf_clf = rf_random.best_estimator_

TypeError: __init__() got an unexpected keyword argument 'metrics'

In [37]:
# Mean cross-validated score of the best configuration found by the search.
# No `scoring` argument was passed to the search, so this is the estimator's default score.
rf_random.best_score_

0.6566244016726913

In [42]:
# Fit the selected forest on the training window again and score it on the hold-out period.
model = rf_clf
y_pred = model.fit(X_train, y_train).predict(X_test)
# Score a copy of the test frame whose CLIENT_OUT column holds the predictions.
pred_df = test_df.assign(CLIENT_OUT=y_pred)
scorer(test_df, pred_df)

0.37716315878868484

In [47]:
# `pickle` is not imported anywhere else in the notebook, so import it here;
# without it this cell fails with NameError on a fresh kernel.
import pickle

# Persist the fitted forest to disk.
with open("rf_clf.pickle", 'wb') as rf_file:
    pickle.dump(rf_clf, rf_file)

In [38]:
import xgboost as xgb
# GridSearchCV is used below but was never imported in the notebook.
from sklearn.model_selection import GridSearchCV

# Base estimator for the search; the original cell referenced `xgb1` without defining it.
xgb1 = xgb.XGBRegressor()

# Exhaustive grid over the listed values (single-element lists pin a setting).
# NOTE(review): newer xgboost releases rename some of these settings
# ('reg:linear', 'silent', 'nthread') — confirm against the installed version.
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:linear'],
              'learning_rate': [.03, 0.05, .07], #so called `eta` value
              'max_depth': range(3,10,2),
              'min_child_weight': range(1,6,2),
              'silent': [1],
              'subsample': [i/10.0 for i in range(6,10)],
              'colsample_bytree': [i/10.0 for i in range(6,10)],
              'n_estimators': [500]}

xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv = 3,
                        n_jobs = 40,
                        verbose=True)

xgb_grid.fit(X_train, y_train)

print(xgb_grid.best_score_)
print(xgb_grid.best_params_)
# Best configuration, refitted on the full training set by the search.
xgb_clf = xgb_grid.best_estimator_

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Fit the selected XGBoost model on the training window again and score it on the hold-out period.
model = xgb_clf
y_pred = model.fit(X_train, y_train).predict(X_test)
# Score a copy of the test frame whose CLIENT_OUT column holds the predictions.
pred_df = test_df.assign(CLIENT_OUT=y_pred)
scorer(test_df, pred_df)

In [163]:
def prediction(train, test, estimator=None):
    """Fit an estimator on `train` and return its predictions for `test`.

    The original body fitted the notebook-level `model` but then predicted with a
    different, undefined name (`lr`); the same estimator is now used for both steps.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame. The 'ATM_ID' column is used as the target and every
        other column as a feature.
        NOTE(review): the rest of the notebook models CLIENT_OUT — confirm that
        'ATM_ID' is the intended target here.
    test : pandas.DataFrame
        Feature frame passed to `predict` unchanged.
    estimator : object with `fit(X, y)` and `predict(X)`, optional
        Estimator to train. When omitted, the notebook-level `model` is used,
        as in the original code.

    Returns
    -------
    Whatever `estimator.predict(test)` returns.
    """
    if estimator is None:
        # Backward-compatible default: fall back to the notebook's global estimator.
        estimator = model
    X_train = train.loc[:, train.columns != 'ATM_ID']
    y_train = train['ATM_ID']
    estimator.fit(X_train, y_train)
    return estimator.predict(test)