In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.stats as stats
import seaborn as sns
from IPython.display import Markdown, display

from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.multiclass import OneVsRestClassifier

from data_process_tourniquet import DataTransform, PREDICTIONS_DIR
from df_addons import memory_compression
from works_weekends import WorkHolidays

# Show every column when a DataFrame is rendered (no column truncation).
pd.set_option('display.max_columns', None)

# Font sizes shared by all matplotlib figures drawn in this notebook.
params = {
    'axes.titlesize': 15,   # axes title
    'axes.labelsize': 14,   # axis labels
    # tick labels on the X and Y axes
    **{f'{axis}tick.labelsize': 12 for axis in ('x', 'y')},
}
plt.rcParams.update(params)

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter presumably also hides scikit-learn's
# convergence warnings for the max_iter-limited fits below; confirm that is intended.
__import__('warnings').filterwarnings("ignore")

In [2]:
# Location of the competition files. The original machine-specific path stays
# the default, but it can now be overridden without editing the notebook by
# setting the TOURNIQUET_DATA_DIR environment variable.
file_dir = Path(os.environ.get('TOURNIQUET_DATA_DIR', r'D:\python-txt\tourniquet'))
file_train = file_dir.joinpath('train.csv')

all_df = pd.read_csv(file_train, parse_dates=['timestamp'], index_col='row_id')

# Q1 uses the six raw features; nothing is one-hot encoded yet.
model_columns = ['gate_id', 'hour', 'min', 'day', 'month', 'weekday']
cat_columns = []

data_cls = DataTransform(category_columns=cat_columns)
data_cls.exclude_columns = []

all_df = data_cls.preprocess(all_df, model_columns=model_columns)

# Rows with user_id > -1 go to the training frame, rows with user_id < 0 to test_df.
train_df = all_df[all_df.user_id > -1]
test_df = all_df[all_df.user_id < 0]

Исходный размер датасета в памяти равен 2.29 мб.
Конечный размер датасета в памяти равен 0.54 мб.
Экономия памяти = 76.6%


In [3]:
# Bare last expression: the notebook renders train_df as this cell's output.
train_df

Unnamed: 0_level_0,user_id,gate_id,hour,min,day,month,weekday
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,18,7,9,8,29,7,4
1,18,9,9,9,29,7,4
2,18,9,9,9,29,7,4
3,18,5,9,10,29,7,4
4,18,5,9,10,29,7,4
...,...,...,...,...,...,...,...
37513,6,11,20,38,31,12,5
37514,6,6,20,39,31,12,5
37515,6,6,20,39,31,12,5
37516,6,9,20,39,31,12,5


### Валидация с 2022-11-01 (включительно) по конец обучающей выборки.

### Таргет - user_id.

### Оценка качества - процент правильных ответов (accuracy * 100).

### Используйте последнюю версию sklearn.

Q1. Постройте логистическую регрессию, оптимизатор liblinear, остальные параметры дефолтные, на 6 признаках: ['gate_id', 'hour', 'min', 'day', 'month','dayofweek']. Признаки делаем из "ts", в частности "min" - это минуты из времени. Используем масштабирование StandardScaler.

  
Каково будет качество на обучающей выборке и на валидации (округлите до целых и выберите ближайший ответ). 10 баллов.

In [4]:
# Q1 baseline: split the target column off the feature columns.
target = train_df['user_id']
train = train_df.drop(columns=['user_id'])

X_train, X_valid, y_train, y_valid = data_cls.train_test_split(train, target)

# Scaling statistics are learned on the training part only and reused for validation.
scaler = StandardScaler().fit(X_train)
X_train, X_valid = scaler.transform(X_train), scaler.transform(X_valid)

model = LogisticRegression(solver="liblinear")
model.fit(X_train, y_train)

# Accuracy on each split, in the order (train, validation).
train_score, valid_score = (
    accuracy_score(labels, model.predict(features))
    for labels, features in ((y_train, X_train), (y_valid, X_valid))
)
print(f'train_score = {train_score:.7%}, valid_score = {valid_score:.7%}')

train_score = 9.6668406%, valid_score = 4.7828529%


Q2: Постройте логистическую регрессию на дефолтных параметрах, но random_state=1, solver=saga, и 13 признаках:
['gate_id', 'hour', 'min', 'day', 'month', 'dayofweek','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'].  Используем масштабирование StandardScaler.
Отметьте все правильные ответы. 10 баллов.
- Качество на валидации будет лучше, чем у регрессии в Q1 - НЕТ
- Регрессия без признака 'Sunday' будет хуже на валидационной выборке - ДА
- Регуляризация "elasticnet 0.5" хуже дефолтной - НЕТ

In [5]:
# Q2: rebuild the feature table, this time with 'dayofweek' passed as a
# categorical column (the preprocessing expands it into dayofweek_1..7).
all_df = pd.read_csv(file_train, index_col='row_id', parse_dates=['timestamp'])

cat_columns = ['dayofweek']
model_columns = ['gate_id', 'hour', 'min', 'day', 'month', 'dayofweek', 'weekday']

data_cls = DataTransform(category_columns=cat_columns)
data_cls.exclude_columns = []

all_df = data_cls.preprocess(all_df, model_columns=model_columns)

# Keep only the rows with user_id > -1, then separate target and features.
train_df = all_df.loc[all_df['user_id'] > -1]
target = train_df['user_id']
train = train_df.drop(columns=['user_id'])

X_train, X_valid, y_train, y_valid = data_cls.train_test_split(train, target)

Трансформирую колонку: dayofweek
Исходный размер датасета в памяти равен 2.54 мб.
Конечный размер датасета в памяти равен 0.79 мб.
Экономия памяти = 69.0%


In [6]:
# Bare last expression: the notebook renders the Q2 feature table as this cell's output.
train

Unnamed: 0_level_0,gate_id,hour,min,day,month,weekday,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,dayofweek_7
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,7,9,8,29,7,4,0,0,0,0,1,0,0
1,9,9,9,29,7,4,0,0,0,0,1,0,0
2,9,9,9,29,7,4,0,0,0,0,1,0,0
3,5,9,10,29,7,4,0,0,0,0,1,0,0
4,5,9,10,29,7,4,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37513,11,20,38,31,12,5,0,0,0,0,0,1,0
37514,6,20,39,31,12,5,0,0,0,0,0,1,0
37515,6,20,39,31,12,5,0,0,0,0,0,1,0
37516,9,20,39,31,12,5,0,0,0,0,0,1,0


In [7]:
# Q2.1: saga solver with default regularisation on the split left by the previous cell.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)

model = LogisticRegression(random_state=1, solver="saga")
model.fit(X_train, y_train)

train_score, valid_score = (
    accuracy_score(labels, model.predict(features))
    for labels, features in ((y_train, X_train), (y_valid, X_valid))
)

# The Q1 line is a hard-coded copy of that cell's printed result, shown for comparison.
print('Q1. train_score = 9.6668406%, valid_score = 4.7828529%',
      f'Q2.1 train_score = {train_score:.7%}, valid_score = {valid_score:.7%}',
      sep='\n')

Q2.1 train_score = 10.5239621%, valid_score = 4.6892550%


In [8]:
# Q2.2: same model with the 'dayofweek_7' indicator removed
# (presumably the quiz's 'Sunday' feature).
target = train_df['user_id']
train = train_df.drop(columns=['user_id', 'dayofweek_7'])

X_train, X_valid, y_train, y_valid = data_cls.train_test_split(train, target)

scaler = StandardScaler().fit(X_train)
X_train, X_valid = scaler.transform(X_train), scaler.transform(X_valid)

model = LogisticRegression(random_state=1, solver="saga")
model.fit(X_train, y_train)

train_score, valid_score = (
    accuracy_score(labels, model.predict(features))
    for labels, features in ((y_train, X_train), (y_valid, X_valid))
)
print(f'Q2.2 train_score = {train_score:.7%}, valid_score = {valid_score:.7%}')

Q2.2 train_score = 10.5239621%, valid_score = 4.6798952%


In [9]:
# Q2.3: full Q2 feature set again, now with elastic-net regularisation (l1_ratio=0.5).
target = train_df['user_id']
train = train_df.drop(columns=['user_id'])

X_train, X_valid, y_train, y_valid = data_cls.train_test_split(train, target)

scaler = StandardScaler().fit(X_train)
X_train, X_valid = scaler.transform(X_train), scaler.transform(X_valid)

model = LogisticRegression(penalty='elasticnet', l1_ratio=0.5,
                           solver="saga", random_state=1)
model.fit(X_train, y_train)

train_score, valid_score = (
    accuracy_score(labels, model.predict(features))
    for labels, features in ((y_train, X_train), (y_valid, X_valid))
)
print(f'Q2.3 train_score = {train_score:.7%}, valid_score = {valid_score:.7%}')
# Reference result of the default penalty (Q2.1):
# train_score = 10.5239621%, valid_score = 4.6892550%

Q2.3 train_score = 10.5425952%, valid_score = 4.6892550%


Q3. Отметьте все правильные ответы для регрессии из Q2. 10 баллов
- Регуляризация "l1" помогает при некоторых оптимизаторах - НЕТ
- "Multinomial" подход не хуже, а часто лучше "One-vs-rest" - ДА
- Признак 'dayofweek' не улучшает качество модели - ДА

In [10]:
# Q3.1: compare the l1 and l2 penalties for the liblinear and saga solvers.
# Uses the scaled X_train / X_valid left in the kernel by the preceding cell.
solver_penalty_grid = [(solver_name, penalty_name)
                       for solver_name in ('liblinear', 'saga')
                       for penalty_name in ('l1', 'l2')]

for solver, penalty in solver_penalty_grid:
    model = LogisticRegression(penalty=penalty, solver=solver, random_state=1)
    model.fit(X_train, y_train)

    train_score, valid_score = (
        accuracy_score(labels, model.predict(features))
        for labels, features in ((y_train, X_train), (y_valid, X_valid))
    )
    print(f'solver: {solver} penalty: {penalty} '
          f'train_score = {train_score:.7%}, valid_score = {valid_score:.7%}')

solver: liblinear penalty: l1 train_score = 10.5016024%, valid_score = 4.6237364%
solver: liblinear penalty: l2 train_score = 10.4307968%, valid_score = 4.7454137%
solver: saga penalty: l1 train_score = 10.5388686%, valid_score = 4.6330962%
solver: saga penalty: l2 train_score = 10.5239621%, valid_score = 4.6892550%


In [None]:
# Q3.2: multinomial versus one-vs-rest logistic regression (saga, random_state=1).
#
# The `multi_class` constructor argument is deprecated since scikit-learn 1.5 and
# scheduled for removal, so the two strategies are built explicitly instead:
#   * a plain LogisticRegression with saga fits the multinomial model for a
#     target with more than two classes;
#   * OneVsRestClassifier wraps the same estimator to fit one binary model per class.
# Uses the scaled X_train / X_valid left in the kernel by the preceding cell.
strategies = {
    'multinomial': LogisticRegression(solver='saga', random_state=1),
    'ovr': OneVsRestClassifier(LogisticRegression(solver='saga', random_state=1)),
}

for multi_class, model in strategies.items():
    model.fit(X_train, y_train)

    train_score = accuracy_score(y_train, model.predict(X_train))
    valid_score = accuracy_score(y_valid, model.predict(X_valid))
    print(f'Q3.2 multi_class: {multi_class} '
          f'train_score = {train_score:.7%}, valid_score = {valid_score:.7%}')

In [11]:
# Q3.3: fit the same saga model twice, first with the numeric 'weekday' column
# and then without it, so the two accuracy lines can be compared directly.
# The loop ends on the "without weekday" variant, so `train`, the split, `scaler`
# and `model` keep that variant's state afterwards.
target = train_df['user_id']

for run_label, dropped_columns in (('  c weekday', ['user_id']),
                                   ('без weekday', ['user_id', 'weekday'])):
    train = train_df.drop(columns=dropped_columns)

    X_train, X_valid, y_train, y_valid = data_cls.train_test_split(train, target)

    scaler = StandardScaler().fit(X_train)
    X_train, X_valid = scaler.transform(X_train), scaler.transform(X_valid)

    model = LogisticRegression(random_state=1, solver="saga")
    model.fit(X_train, y_train)

    train_score, valid_score = (
        accuracy_score(labels, model.predict(features))
        for labels, features in ((y_train, X_train), (y_valid, X_valid))
    )
    print(f'Q3.3 {run_label} train_score = {train_score:.7%}, valid_score = {valid_score:.7%}')

Q3.3   c weekday train_score = 10.5239621%, valid_score = 4.6892550%
Q3.3 без weekday train_score = 10.5239621%, valid_score = 4.6892550%


In [12]:
# Bare last expression: renders the feature table left by the previous cell
# (the variant without the 'weekday' column).
train

Unnamed: 0_level_0,gate_id,hour,min,day,month,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,dayofweek_7
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,7,9,8,29,7,0,0,0,0,1,0,0
1,9,9,9,29,7,0,0,0,0,1,0,0
2,9,9,9,29,7,0,0,0,0,1,0,0
3,5,9,10,29,7,0,0,0,0,1,0,0
4,5,9,10,29,7,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
37513,11,20,38,31,12,0,0,0,0,0,1,0
37514,6,20,39,31,12,0,0,0,0,0,1,0
37515,6,20,39,31,12,0,0,0,0,0,1,0
37516,9,20,39,31,12,0,0,0,0,0,1,0


Q4. Постройте логистическую регрессию на дефолтных параметрах и 33 признаках:
['gate_-1', 'gate_0', 'gate_1', 'gate_3', 'gate_4', 'gate_5', 'gate_6', 'gate_7', 'gate_8', 'gate_9', 'gate_10', 'gate_11', 'gate_12', 'gate_13', 'gate_14', 'gate_15', 'gate_16', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'hour', 'min', 'day'].
Используем масштабирование StandardScaler.
Отметьте все правильные ответы. 10 баллов.
- В валидационной выборке больше 700 истинных предсказаний - ДА
- Оптимизатор saga дает такое же качество как lbfgs - НЕТ
- user1 чаще всего приходит в четверг - ДА

In [13]:
# Q4: gate, day of week and month are passed as categorical columns;
# hour, min and day stay numeric.
all_df = pd.read_csv(file_train, index_col='row_id', parse_dates=['timestamp'])

cat_columns = ['gate_id', 'dayofweek', 'month']
model_columns = ['gate_id', 'hour', 'min', 'day', 'month', 'dayofweek']

data_cls = DataTransform(category_columns=cat_columns)
data_cls.exclude_columns = []

all_df = data_cls.preprocess(all_df, model_columns=model_columns)

print(all_df.columns.tolist())

train_df = all_df.loc[all_df['user_id'] > -1]
target = train_df['user_id']
train = train_df.drop(columns=['user_id'])

X_train, X_valid, y_train, y_valid = data_cls.train_test_split(train, target)

# Unscaled copy of the validation features plus the true label, for later inspection.
valid = X_valid.assign(user_id=y_valid)

scaler = StandardScaler().fit(X_train)
X_train, X_valid = scaler.transform(X_train), scaler.transform(X_valid)

# Q4.1: all-default logistic regression.
model = LogisticRegression()
model.fit(X_train, y_train)

valid_pred = model.predict(X_valid)
train_score = accuracy_score(y_train, model.predict(X_train))
valid_score = accuracy_score(y_valid, valid_pred)
# Number of validation rows whose predicted user matches the true one.
print(f'Q4.1 кол-во истинных предсказаний = {(y_valid == valid_pred).sum()}')

Трансформирую колонку: gate_id
Трансформирую колонку: dayofweek
Трансформирую колонку: month
Исходный размер датасета в памяти равен 2.5 мб.
Конечный размер датасета в памяти равен 1.5 мб.
Экономия памяти = 40.0%
['user_id', 'hour', 'min', 'day', 'gate_id_-1', 'gate_id_0', 'gate_id_1', 'gate_id_3', 'gate_id_4', 'gate_id_5', 'gate_id_6', 'gate_id_7', 'gate_id_8', 'gate_id_9', 'gate_id_10', 'gate_id_11', 'gate_id_12', 'gate_id_13', 'gate_id_14', 'gate_id_15', 'gate_id_16', 'dayofweek_1', 'dayofweek_2', 'dayofweek_3', 'dayofweek_4', 'dayofweek_5', 'dayofweek_6', 'dayofweek_7', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12']
Q4.1 кол-во истинных предсказаний = 791


In [14]:
# Q4.2: compare the lbfgs and saga solvers on the Q4 features.
# NOTE(review): no random_state is passed here, so the saga result may vary
# between runs; confirm whether a seed should be fixed.
for solver in ('lbfgs', 'saga'):
    model = LogisticRegression(solver=solver)
    model.fit(X_train, y_train)

    valid_pred = model.predict(X_valid)
    train_score = accuracy_score(y_train, model.predict(X_train))
    valid_score = accuracy_score(y_valid, valid_pred)
    print(f'solver: {solver} '
          f'train_score = {train_score:.7%}, valid_score = {valid_score:.7%}')

    # NOTE(review): 'id_pred' is overwritten on every pass, so later cells see
    # the predictions of the LAST solver in the tuple above.
    valid['id_pred'] = valid_pred

solver: saga train_score = 14.4518149%, valid_score = 7.3661550%
solver: lbfgs train_score = 14.6418723%, valid_score = 7.4035942%


In [15]:
# For each one-hot day-of-week column, count the validation rows that were
# predicted as user 1 and fall on that day.
for col in [f'dayofweek_{day_no}' for day_no in range(1, 8)]:
    predicted_user1_on_day = valid.loc[valid['id_pred'].eq(1) & valid[col].eq(1)]
    print(predicted_user1_on_day.groupby(col)['day'].count())

dayofweek_1
1    29
Name: day, dtype: int64
dayofweek_2
1    15
Name: day, dtype: int64
dayofweek_3
1    14
Name: day, dtype: int64
dayofweek_4
1    33
Name: day, dtype: int64
dayofweek_5
1    31
Name: day, dtype: int64
Series([], Name: day, dtype: int64)
Series([], Name: day, dtype: int64)


In [16]:
# Q4.3: how user 1's passages are distributed over the days of the week.
all_df = pd.read_csv(file_train, index_col='row_id', parse_dates=['timestamp'])

# No one-hot encoding here; 'date' is requested so distinct visit days can be counted.
cat_columns = []
model_columns = ['gate_id', 'date', 'hour', 'min', 'day', 'month', 'dayofweek']

data_cls = DataTransform(category_columns=cat_columns)
data_cls.exclude_columns = []

all_df = data_cls.preprocess(all_df, model_columns=model_columns)
print(all_df.columns.tolist())

train_df = all_df.loc[all_df['user_id'] > -1]

# Rows of user 1 per day of week.
print(train_df.loc[train_df['user_id'] == 1].groupby('dayofweek')['gate_id'].count())

# Distinct (user, date) pairs: number of different dates user 1 appears on, per day of week.
df = train_df.loc[:, ['user_id', 'date', 'dayofweek']].drop_duplicates()
print(df.loc[df['user_id'] == 1].groupby('dayofweek')['date'].count())

Исходный размер датасета в памяти равен 2.58 мб.
Конечный размер датасета в памяти равен 0.61 мб.
Экономия памяти = 76.2%
['user_id', 'gate_id', 'date', 'hour', 'min', 'day', 'month', 'dayofweek']
dayofweek
1    228
2    237
3    226
4    285
5    260
6     63
Name: gate_id, dtype: int64
dayofweek
1    19
2    18
3    18
4    19
5    19
6     5
Name: date, dtype: int64


In [17]:
# Для регрессии из Q4 с оптимизатором saga при random_state=1 отметьте все правильные ответы.
# - по сравнению с другими юзерами, user1 любит приходить в субботу и не любит приходить в
#   воскресенье - НЕТ, но можно сказать "ДА" на трейне, но на валидации его нет в субботу
# - больше всего заходов предсказали у user55 - ДА если считать правильно предсказали
# - среди всех предсказанных хуже всего предсказали user39, если оценивать по доле правильно
#   предсказанных заходов - ДА

# Q5.1: rebuild the Q4 feature table and fit it with saga (random_state=1).
all_df = pd.read_csv(file_train, index_col='row_id', parse_dates=['timestamp'])

cat_columns = ['gate_id', 'dayofweek', 'month']
model_columns = ['gate_id', 'hour', 'min', 'day', 'month', 'dayofweek']

data_cls = DataTransform(category_columns=cat_columns)
data_cls.exclude_columns = []

all_df = data_cls.preprocess(all_df, model_columns=model_columns)

# Column count minus one (everything except user_id), followed by the column names.
print(all_df.shape[1] - 1, all_df.columns.tolist())

train_df = all_df.loc[all_df['user_id'] > -1]
target = train_df['user_id']
train = train_df.drop(columns=['user_id'])

X_train, X_valid, y_train, y_valid = data_cls.train_test_split(train, target)

# Unscaled copy of the validation features plus the true label, for the group-bys below.
valid = X_valid.assign(user_id=y_valid)

scaler = StandardScaler().fit(X_train)
X_train, X_valid = scaler.transform(X_train), scaler.transform(X_valid)

model = LogisticRegression(random_state=1, solver='saga')
model.fit(X_train, y_train)

valid_pred = model.predict(X_valid)
train_score = accuracy_score(y_train, model.predict(X_train))
valid_score = accuracy_score(y_valid, valid_pred)
print(f'Q train_score = {train_score:.7%}, valid_score = {valid_score:.7%}')

# Predicted user for every validation row, stored next to the true label.
valid['id_pred'] = valid_pred

Трансформирую колонку: gate_id
Трансформирую колонку: dayofweek
Трансформирую колонку: month
Исходный размер датасета в памяти равен 2.5 мб.
Конечный размер датасета в памяти равен 1.5 мб.
Экономия памяти = 40.0%
33 ['user_id', 'hour', 'min', 'day', 'gate_id_-1', 'gate_id_0', 'gate_id_1', 'gate_id_3', 'gate_id_4', 'gate_id_5', 'gate_id_6', 'gate_id_7', 'gate_id_8', 'gate_id_9', 'gate_id_10', 'gate_id_11', 'gate_id_12', 'gate_id_13', 'gate_id_14', 'gate_id_15', 'gate_id_16', 'dayofweek_1', 'dayofweek_2', 'dayofweek_3', 'dayofweek_4', 'dayofweek_5', 'dayofweek_6', 'dayofweek_7', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12']
Q train_score = 14.4518149%, valid_score = 7.3661550%


In [18]:
# True users among validation rows flagged dayofweek_6 (presumably Saturday), busiest first.
grp = (
    valid.loc[valid['dayofweek_6'].eq(1)]
    .groupby(['dayofweek_6', 'user_id'], as_index=False)
    .agg(counts=('day', 'count'))
)
grp.sort_values(by='counts', ascending=False)

Unnamed: 0,dayofweek_6,user_id,counts
2,1,6,62
1,1,1,52
11,1,29,34
16,1,39,30
20,1,50,29
14,1,35,27
17,1,47,25
15,1,37,25
19,1,49,21
13,1,33,18


In [19]:
# True users among validation rows where dayofweek_7 is 0, busiest first.
# NOTE(review): the filter keeps every row that is NOT flagged in dayofweek_7;
# if the goal was to look at that day itself the condition would be == 1. Confirm.
grp = (
    valid.loc[valid['dayofweek_7'].eq(0)]
    .groupby(['dayofweek_7', 'user_id'], as_index=False)
    .agg(counts=('day', 'count'))
)
grp.sort_values(by='counts', ascending=False)

Unnamed: 0,dayofweek_7,user_id,counts
45,0,55,796
31,0,37,578
1,0,1,557
5,0,6,527
10,0,12,519
29,0,35,513
13,0,17,482
27,0,33,481
0,0,0,456
26,0,32,430


In [20]:
# Predicted users among validation rows flagged dayofweek_6 (presumably Saturday), largest first.
grp = (
    valid.loc[valid['dayofweek_6'].eq(1)]
    .groupby(['dayofweek_6', 'id_pred'], as_index=False)
    .agg(counts=('day', 'count'))
)
grp.sort_values(by='counts', ascending=False)

Unnamed: 0,dayofweek_6,id_pred,counts
0,1,18,271
7,1,49,54
1,1,19,53
4,1,32,37
6,1,37,20
5,1,33,12
2,1,25,2
8,1,50,2
3,1,29,1


In [21]:
# Predicted users among validation rows where dayofweek_7 is 0, largest first.
# NOTE(review): as with the true-user table, this selects rows NOT flagged in
# dayofweek_7; confirm that == 0 (rather than == 1) is intended.
grp = (
    valid.loc[valid['dayofweek_7'].eq(0)]
    .groupby(['dayofweek_7', 'id_pred'], as_index=False)
    .agg(counts=('day', 'count'))
)
grp.sort_values(by='counts', ascending=False)

Unnamed: 0,dayofweek_7,id_pred,counts
19,0,37,2113
5,0,12,1942
7,0,15,1872
26,0,55,1808
13,0,27,478
8,0,18,435
22,0,47,410
25,0,53,339
9,0,19,296
23,0,49,264


In [22]:
# Q5.2 and Q5.3: number of correctly predicted rows per user, largest first.
grp = (
    valid.loc[valid['user_id'] == valid['id_pred']]
    .groupby(['id_pred'], as_index=False)
    .agg(counts=('day', 'count'))
)
grp.sort_values(by='counts', ascending=False)

Unnamed: 0,id_pred,counts
15,55,246
11,37,162
3,12,120
4,15,117
1,1,35
14,49,35
6,19,24
2,3,16
10,33,8
8,26,6


In [23]:
%%time

# Q6. Постройте логистическую регрессию на дефолтных параметрах и 50 признаках:
# ['gate_-1', 'gate_0', 'gate_1', 'gate_3', 'gate_4', 'gate_5', 'gate_6', 'gate_7', 'gate_8',
# 'gate_9', 'gate_10', 'gate_11', 'gate_12', 'gate_13', 'gate_14', 'gate_15', 'gate_16',
# 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'month_7',
# 'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'hour_6', 'hour_7', 'hour_8',
# 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16',
# 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'min', 'day'].
# Используем масштабирование StandardScaler.
# Самый точный оптимизатор-решатель (при max_iter=100, random_state=1). 10 баллов - 'saga'

# Q6 feature table: 'hour' joins gate, day of week and month as a categorical column.
all_df = pd.read_csv(file_train, index_col='row_id', parse_dates=['timestamp'])

cat_columns = ['gate_id', 'dayofweek', 'month', 'hour']
model_columns = ['gate_id', 'dayofweek', 'month', 'hour', 'min', 'day']

data_cls = DataTransform(category_columns=cat_columns)
data_cls.exclude_columns = []

all_df = data_cls.preprocess(all_df, model_columns=model_columns)

# Column count minus one (everything except user_id), followed by the column names.
print(all_df.shape[1] - 1, all_df.columns.tolist())

train_df = all_df.loc[all_df['user_id'] > -1]
target = train_df['user_id']
train = train_df.drop(columns=['user_id'])

X_train, X_valid, y_train, y_valid = data_cls.train_test_split(train, target)

# Scaling statistics come from the training part only.
scaler = StandardScaler().fit(X_train)
X_train, X_valid = scaler.transform(X_train), scaler.transform(X_valid)

# Q6: fit the same model with every solver and rank the solvers by validation accuracy.
result = []
for solver in ('lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'):
    try:
        model = LogisticRegression(solver=solver,
                                   max_iter=100,
                                   random_state=1).fit(X_train, y_train)
    except ValueError as err:
        # scikit-learn reports an unknown or unsupported solver/option combination
        # as a ValueError. Only that case is skipped, and the real reason is shown;
        # any other failure (or a keyboard interrupt) is no longer swallowed.
        print(f"Оптимизатор {solver} пропущен: {err}")
        continue

    train_scor = accuracy_score(y_train, model.predict(X_train))
    valid_scor = accuracy_score(y_valid, model.predict(X_valid))
    print(f'solver: {solver} train_score = {train_scor:.7%}, valid_score = {valid_scor:.7%}')
    result.append((solver, train_scor, valid_scor))

# Best validation accuracy first, one (solver, train, valid) tuple per line.
print(*sorted(result, key=lambda x: x[-1], reverse=True), sep='\n')

Трансформирую колонку: gate_id
Трансформирую колонку: dayofweek
Трансформирую колонку: month
Трансформирую колонку: hour
Исходный размер датасета в памяти равен 2.86 мб.
Конечный размер датасета в памяти равен 2.11 мб.
Экономия памяти = 26.2%
50 ['user_id', 'min', 'day', 'gate_id_-1', 'gate_id_0', 'gate_id_1', 'gate_id_3', 'gate_id_4', 'gate_id_5', 'gate_id_6', 'gate_id_7', 'gate_id_8', 'gate_id_9', 'gate_id_10', 'gate_id_11', 'gate_id_12', 'gate_id_13', 'gate_id_14', 'gate_id_15', 'gate_id_16', 'dayofweek_1', 'dayofweek_2', 'dayofweek_3', 'dayofweek_4', 'dayofweek_5', 'dayofweek_6', 'dayofweek_7', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23']
solver: lbfgs train_score = 17.6902437%, valid_score = 9.3036316%
solver: liblinear train_score = 17.6119848%, valid_score = 9.4814

In [24]:
# Q7. Отметьте все правильные ответы для регрессии из Q6 и оптимизатора saga. 10 баллов
# - Регрессия переобучилась по сравнению с регрессией из Q4 - НЕТ
# - random_state=2 лучше, чем random_state=1 для качества на обучающей выборке - ДА
# - Качество стало лучше, чем у регрессии в Q4 - ДА

# Q7: rebuild the Q6 feature table, then compare saga with random_state 1 and 2.
all_df = pd.read_csv(file_train, index_col='row_id', parse_dates=['timestamp'])

cat_columns = ['gate_id', 'dayofweek', 'month', 'hour']
model_columns = ['gate_id', 'dayofweek', 'month', 'hour', 'min', 'day']

data_cls = DataTransform(category_columns=cat_columns)
data_cls.exclude_columns = []

all_df = data_cls.preprocess(all_df, model_columns=model_columns)

print(all_df.shape[1] - 1, all_df.columns.tolist())

train_df = all_df.loc[all_df['user_id'] > -1]
target = train_df['user_id']
train = train_df.drop(columns=['user_id'])

X_train, X_valid, y_train, y_valid = data_cls.train_test_split(train, target)

scaler = StandardScaler().fit(X_train)
X_train, X_valid = scaler.transform(X_train), scaler.transform(X_valid)

# Hard-coded copy of the Q4 saga result, printed as the comparison baseline.
print('Q4: solver: saga train_score = 14.4518149%, valid_score = 7.3661550%')

for seed in (1, 2):
    model = LogisticRegression(random_state=seed, max_iter=100, solver='saga')
    model.fit(X_train, y_train)

    train_scor, valid_scor = (
        accuracy_score(labels, model.predict(features))
        for labels, features in ((y_train, X_train), (y_valid, X_valid))
    )
    print(f'Q7. seed: {seed} train_score = {train_scor:.7%}, valid_score = {valid_scor:.7%}')

Трансформирую колонку: gate_id
Трансформирую колонку: dayofweek
Трансформирую колонку: month
Трансформирую колонку: hour
Исходный размер датасета в памяти равен 2.86 мб.
Конечный размер датасета в памяти равен 2.11 мб.
Экономия памяти = 26.2%
50 ['user_id', 'min', 'day', 'gate_id_-1', 'gate_id_0', 'gate_id_1', 'gate_id_3', 'gate_id_4', 'gate_id_5', 'gate_id_6', 'gate_id_7', 'gate_id_8', 'gate_id_9', 'gate_id_10', 'gate_id_11', 'gate_id_12', 'gate_id_13', 'gate_id_14', 'gate_id_15', 'gate_id_16', 'dayofweek_1', 'dayofweek_2', 'dayofweek_3', 'dayofweek_4', 'dayofweek_5', 'dayofweek_6', 'dayofweek_7', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23']
Q4: solver: saga train_score = 14.4518149%, valid_score = 7.3661550%
Q7. seed: 1 train_score = 17.4107476%, valid_score = 9.5376264

In [25]:
# Q8. Отметьте все правильные ответы для регрессии из Q6 и оптимизатора saga. 10 баллов
# - Качество будет лучше на обучающей выборке, если уменьшить параметр C до 0.6 - НЕТ
# - class_weight='balanced' улучшает качество на обучающей выборке - НЕТ
# - max_iter=200 ухудшает качество на валидационной выборке - ДА
# - max_iter=80 ухудшает качество на обучающей выборке - НЕТ

# Q8: rebuild the Q6 feature table, then vary C, class_weight and max_iter for saga.
all_df = pd.read_csv(file_train, index_col='row_id', parse_dates=['timestamp'])

cat_columns = ['gate_id', 'dayofweek', 'month', 'hour']
model_columns = ['gate_id', 'dayofweek', 'month', 'hour', 'min', 'day']

data_cls = DataTransform(category_columns=cat_columns)
data_cls.exclude_columns = []

all_df = data_cls.preprocess(all_df, model_columns=model_columns)

print(all_df.shape[1] - 1, all_df.columns.tolist())

train_df = all_df.loc[all_df['user_id'] > -1]
target = train_df['user_id']
train = train_df.drop(columns=['user_id'])

X_train, X_valid, y_train, y_valid = data_cls.train_test_split(train, target)

scaler = StandardScaler().fit(X_train)
X_train, X_valid = scaler.transform(X_train), scaler.transform(X_valid)


def _fit_saga(**overrides):
    """Fit saga (max_iter=100, random_state=1 unless overridden) on the current split.

    Returns the fitted model together with its train and validation accuracy.
    """
    settings = {'solver': 'saga', 'max_iter': 100, 'random_state': 1, **overrides}
    fitted = LogisticRegression(**settings).fit(X_train, y_train)
    return (fitted,
            accuracy_score(y_train, fitted.predict(X_train)),
            accuracy_score(y_valid, fitted.predict(X_valid)))


# Q8.1: default C (1.0) versus the stronger regularisation C=0.6.
for C_ in (1.0, 0.6):
    model, train_scor, valid_scor = _fit_saga(C=C_)
    print(f'Q8.1 C: {C_} train_score = {train_scor:.7%}, valid_score = {valid_scor:.7%}')

# Q8.2: unweighted classes versus class_weight='balanced'.
for class_weight in (None, 'balanced'):
    model, train_scor, valid_scor = _fit_saga(class_weight=class_weight)
    print(f'Q8.2 class_weight: {class_weight} train_score = {train_scor:.7%}, valid_score = {valid_scor:.7%}')

# Q8.3: more (200) and fewer (80) iterations than the max_iter=100 used above.
for max_iter in (200, 80):
    model, train_scor, valid_scor = _fit_saga(max_iter=max_iter)
    print(f'Q8.3 max_iter={max_iter} '
          f'train_score = {train_scor:.7%}, valid_score = {valid_scor:.7%}')

Трансформирую колонку: gate_id
Трансформирую колонку: dayofweek
Трансформирую колонку: month
Трансформирую колонку: hour
Исходный размер датасета в памяти равен 2.86 мб.
Конечный размер датасета в памяти равен 2.11 мб.
Экономия памяти = 26.2%
50 ['user_id', 'min', 'day', 'gate_id_-1', 'gate_id_0', 'gate_id_1', 'gate_id_3', 'gate_id_4', 'gate_id_5', 'gate_id_6', 'gate_id_7', 'gate_id_8', 'gate_id_9', 'gate_id_10', 'gate_id_11', 'gate_id_12', 'gate_id_13', 'gate_id_14', 'gate_id_15', 'gate_id_16', 'dayofweek_1', 'dayofweek_2', 'dayofweek_3', 'dayofweek_4', 'dayofweek_5', 'dayofweek_6', 'dayofweek_7', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23']
Q8.1 C: 1.0 train_score = 17.4107476%, valid_score = 9.5376264%
Q8.1 C: 0.6 train_score = 17.4032943%, valid_score = 9.5376264%
Q8.

Q9. Улучшится ли качество модели на валидационной выборке, если в качестве признаков использовать предыдущее наблюдение?
Оцените для регрессии из Q6 и оптимизатора saga (random_state = 1). 10 баллов

- Да, т.к. есть зависимость от предыдущего турникета - *ДА*
- Нет, т.к. модель переобучилась
- Нет, качество такое же
- Непонятно

In [26]:
# Q10. Проверьте, помогает ли качеству регрессии из Q6 и оптимизатора saga больше данных,
# оцените разницу между 60%, 80% и 100% от train. 10 баллов
# - Помогает, и на обучающей и на валидации
# - Помогает
# - Не помогает, т.к. получаем почти те же значения
# - Непонятно

# Q10: rebuild the Q6 feature table and refit saga for several train_size values.
all_df = pd.read_csv(file_train, index_col='row_id', parse_dates=['timestamp'])

cat_columns = ['gate_id', 'dayofweek', 'month', 'hour']
model_columns = ['gate_id', 'dayofweek', 'month', 'hour', 'min', 'day']

data_cls = DataTransform(category_columns=cat_columns)
data_cls.exclude_columns = []

all_df = data_cls.preprocess(all_df, model_columns=model_columns)

print(all_df.shape[1] - 1, all_df.columns.tolist())

train_df = all_df.loc[all_df['user_id'] > -1]
target = train_df['user_id']
train = train_df.drop(columns=['user_id'])

# NOTE(review): how DataTransform.train_test_split interprets train_size (and what
# ends up in the validation part at 1.0) is defined outside this notebook; verify there.
for train_size in (0.6, 0.8, 1.0):
    X_train, X_valid, y_train, y_valid = data_cls.train_test_split(
        train, target, train_size=train_size)
    print(f'Размер X_train = {X_train.shape}, X_valid = {X_valid.shape}')

    # The scaler is refitted for every split so its statistics match the current training part.
    scaler = StandardScaler().fit(X_train)
    X_train, X_valid = scaler.transform(X_train), scaler.transform(X_valid)

    model = LogisticRegression(random_state=1, max_iter=100, solver='saga')
    model.fit(X_train, y_train)

    train_scor, valid_scor = (
        accuracy_score(labels, model.predict(features))
        for labels, features in ((y_train, X_train), (y_valid, X_valid))
    )
    print(f'Q10 train_size: {train_size} '
          f'train_score = {train_scor:.7%}, valid_score = {valid_scor:.7%}')

Трансформирую колонку: gate_id
Трансформирую колонку: dayofweek
Трансформирую колонку: month
Трансформирую колонку: hour
Исходный размер датасета в памяти равен 2.86 мб.
Конечный размер датасета в памяти равен 2.11 мб.
Экономия памяти = 26.2%
50 ['user_id', 'min', 'day', 'gate_id_-1', 'gate_id_0', 'gate_id_1', 'gate_id_3', 'gate_id_4', 'gate_id_5', 'gate_id_6', 'gate_id_7', 'gate_id_8', 'gate_id_9', 'gate_id_10', 'gate_id_11', 'gate_id_12', 'gate_id_13', 'gate_id_14', 'gate_id_15', 'gate_id_16', 'dayofweek_1', 'dayofweek_2', 'dayofweek_3', 'dayofweek_4', 'dayofweek_5', 'dayofweek_6', 'dayofweek_7', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23']
Размер X_train = (22510, 50), X_valid = (15008, 50)
Q10 train_size: 0.6 train_score = 17.9342514%, valid_score = 10.3211620%
Размер