<a href="https://colab.research.google.com/github/sashavorot/multiclass-turnikets/blob/main/turnstiles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install catboost

In [None]:
!pip install workalendar

In [None]:
import pandas as pd
import numpy as np

In [None]:
import zipfile

In [None]:
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import datetime
from datetime import date, timedelta
from workalendar.europe import Russia

In [None]:
from pprint import pprint

In [None]:
# Read the test and train tables from CSV files in the working directory.
df_test_orig = pd.read_csv("test.csv")
df_train_orig = pd.read_csv("train.csv")

# Work on copies so the frames as loaded stay available unchanged.
df_test = df_test_orig.copy()
df_train = df_train_orig.copy()

In [None]:
# Remove the 'Unnamed: 0' column from both frames; frames without it are left as they are.
for frame in (df_test, df_train):
  frame.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

In [None]:
# When the test frame has a 'lost_id' column, store its values under 'user_id'
# (the column the rest of the notebook works with) and remove 'lost_id'.
if 'lost_id' in df_test.columns:
  df_test['user_id'] = df_test.pop('lost_id')

df_test.info()

In [None]:
def time_div(df):
  """Split the 'ts' string column into separate 'date' and 'time' columns.

  The first 10 characters of 'ts' become 'date'; the remainder, without the
  leading date/time separator (a space or 'T'), becomes 'time'. The 'ts'
  column is then removed.

  The frame is modified in place and also returned. A frame without a 'ts'
  column is returned untouched, so calling the function twice is harmless.
  """
  if 'ts' not in df.columns:
    return df

  df['date'] = df['ts'].str[:10]
  # The slice from index 10 starts with the separator between date and time;
  # strip it so 'time' does not carry a stray leading character.
  df['time'] = df['ts'].str[10:].str.lstrip(' T')

  df.drop(columns=['ts'], inplace=True)

  return df

In [None]:
# Replace the raw timestamp column with 'date' / 'time' columns in both frames.
df_test, df_train = (time_div(frame) for frame in (df_test, df_train))

In [None]:
def get_weekends(year):
    """Return the set of 'YYYY-MM-DD' strings for every Saturday and Sunday of `year`."""
    first_day = date(year, 1, 1)
    days_in_year = (date(year, 12, 31) - first_day).days + 1

    every_day = (first_day + timedelta(days=offset) for offset in range(days_in_year))

    # date.weekday() numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
    return {day.strftime('%Y-%m-%d') for day in every_day if day.weekday() >= 5}

**fug** - frequency user-gate

---


Связь user_id с его приоритетами в турникетах: для каждого пользователя турникеты (gate_id) упорядочиваются по убыванию частоты проходов.

In [None]:
def dct_user_group(df):
  """Map every user_id in `df` to the list of that user's gate_id values, in row order."""
  return {
    user_id: rows['gate_id'].tolist()
    for user_id, rows in df.groupby('user_id')
  }

In [None]:
def train_test_split_unique(df, test_size=0.4, random_state=None):
  """Randomly split `df` so that every user_id is present in the train part.

  Args:
    df: frame with a 'user_id' column.
    test_size: forwarded to sklearn's train_test_split (default 0.4, the
      value that used to be hard-coded here).
    random_state: forwarded to train_test_split; pass an int to make the
      split reproducible.

  Returns:
    [train, test]. Rows of users that the random split placed entirely in
    the test part are moved to train, so no row is in both parts.
  """
  train, test = train_test_split(df, test_size=test_size, random_state=random_state)

  # train and test together cover df, so a user absent from train has all of
  # its rows in test. Move those rows instead of copying them: copying would
  # leave them in both parts and evaluate the model on rows it was trained on.
  only_in_test = ~test['user_id'].isin(train['user_id'])
  if only_in_test.any():
    train = pd.concat([train, test[only_in_test]])
    test = test[~only_in_test]

  return [train, test]

In [None]:
def fug_sort(dct):
  """Replace each list in `dct` with its distinct values, most frequent first.

  `dct` is updated in place and returned. Values that occur equally often
  keep the order in which they first appear in the list.
  """
  for user, visited in dct.items():
    counts = {}
    for gate in visited:
      counts[gate] = counts.get(gate, 0) + 1

    # sorted() is stable (also with reverse=True), so ties keep first-seen order.
    dct[user] = sorted(counts, key=counts.get, reverse=True)

  return dct

In [None]:
# Create the reference split here: no earlier cell defines `train`, so on a
# fresh kernel this cell used to fail with a NameError.
train, test = train_test_split_unique(df_train)

# Gates of every user ordered by how often the user passed them:
# *_100 is built from the whole training frame, *_80 from the `train` split only.
dct_train_100 = dct_user_group(df_train)
dct_train_100 = fug_sort(dct_train_100)

dct_train_80 = dct_user_group(train)
dct_train_80 = fug_sort(dct_train_80)

In [None]:
def jaccard_distance(set1, set2):
  """Return the Jaccard distance between two sets: 1 - |intersection| / |union|.

  When both sets are empty (the union has no elements) the distance is 1.
  """
  union_size = len(set1.union(set2))
  if union_size <= 0:
    return 1

  shared_size = len(set1.intersection(set2))
  return 1 - (shared_size / union_size)

In [None]:
fug_preds = {}
fug_confidences = {}

# Build each reference user's gate set once instead of on every comparison.
# Reference users without any gates are never candidates.
train_gate_sets = {}
for train_user, train_gates in dct_train_80.items():
    train_set = set(train_gates)
    if train_set:
        train_gate_sets[train_user] = train_set

# For every user, find the reference user whose gate set is closest
# (smallest Jaccard distance).
for user, gates in dct_train_100.items():
    gates_set = set(gates)

    if not gates_set:
        continue

    best_match = None
    best_distance = float("inf")

    for train_user, train_set in train_gate_sets.items():
        distance = jaccard_distance(gates_set, train_set)

        # Strict '<' keeps the first reference user on ties.
        if distance < best_distance:
            best_distance = distance
            best_match = train_user

            # The distance cannot go below 0, so an exact match ends the search.
            if best_distance == 0:
                break

    fug_preds[user] = best_match
    # Confidence is the Jaccard similarity to the chosen reference user.
    fug_confidences[user] = 1 - best_distance

In [None]:
# Attach to every row the fug match of the row's user and the match confidence.
for column, lookup in (("fug_predict", fug_preds), ("fug_confidence", fug_confidences)):
    df_train[column] = df_train["user_id"].map(lookup)

In [None]:
# Share of rows whose fug match is the row's own user.
acc = accuracy_score(y_true=df_train['user_id'], y_pred=df_train['fug_predict'])
print(acc)

In [None]:
# Re-split the training frame now that it carries the fug_* columns.
arr = train_test_split_unique(df_train)
train = arr[0]
test = arr[1]

# Features are every column except the target and the raw date/time strings.
# NOTE(review): the fug_predict / fug_confidence columns kept in X_train were
# looked up from user_id, i.e. from the target itself — confirm this is intended.
X_train = train.drop(columns=["user_id", "date", "time"])
y_train = train["user_id"]

# Multi-class CatBoost model that predicts user_id.
model = CatBoostClassifier(iterations=500, depth=6, learning_rate=0.1, loss_function='MultiClass', verbose=100)
model.fit(X_train, y_train)

In [None]:
# Same feature selection as for training, applied to the held-out part.
X_test = test.drop(columns=["user_id", "date", "time"])
y_test = test["user_id"]

# Flatten the predictions to one label per row in case the model returns
# them as a single-column 2-D array.
y_pred = model.predict(X_test).astype(int).ravel()
# accuracy_score takes the true labels first, then the predictions.
acc = accuracy_score(y_test, y_pred)

print(acc)

Таким образом, с добавлением фичи fug точность стала 55%.

**us** - user schedule

---

Связь user_id с его рабочим расписанием: для каждого пользователя считается доля проходов, приходящаяся на каждый день недели.

In [None]:
def dct_user_group_us(df):
  """Map every user_id in `df` to the weekdays of that user's rows.

  The 'date' column must hold 'YYYY-MM-DD' strings. Each value of the
  returned dict is a list of ints (Monday is 0, Sunday is 6), one per row
  of the user, in row order.
  """
  # Parse the whole column in one vectorised call instead of running strptime
  # row by row (this also avoids shadowing the imported `date` name).
  weekdays = pd.to_datetime(df['date'], format="%Y-%m-%d").dt.weekday

  return {
    user_id: days.tolist()
    for user_id, days in weekdays.groupby(df['user_id'])
  }

In [None]:
def us_frequency(dct):
  """Replace each list of weekday indices in `dct` with a 7-item share profile.

  Position i of a profile is the fraction of the list's entries equal to i
  (weekday indices 0..6). `dct` is updated in place and returned. An empty
  list yields seven zeros instead of raising ZeroDivisionError.
  """
  for key, arr in dct.items():
    counts = [0] * 7
    for day in arr:
      counts[day] += 1

    total = len(arr)
    dct[key] = [count / total if total else 0.0 for count in counts]

  return dct

In [None]:
def us_sort(dct):
  """Replace each list in `dct` with its distinct values, most frequent first.

  `dct` is updated in place and returned; equally frequent values keep the
  order of their first occurrence.
  """
  for owner in dct:
    tally = {}
    for item in dct[owner]:
      tally[item] = tally.get(item, 0) + 1

    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    dct[owner] = [item for item, _ in ranked]

  return dct

In [None]:
# Per-user weekday share profiles: *_100 from the whole training frame,
# *_80 from the current `train` split.
dct_train_100 = us_frequency(dct_user_group_us(df_train))
dct_train_80 = us_frequency(dct_user_group_us(train))

In [None]:
us_preds = {}
us_confidences = {}

# Convert every reference profile to an array once, not on every comparison.
train_profiles = {
    train_user: np.array(train_days)
    for train_user, train_days in dct_train_80.items()
}

# For every user, find the reference user with the closest weekday profile
# (smallest sum of squared differences).
for user, days in dct_train_100.items():
    profile = np.array(days)

    best_match = None
    best_distance = float("inf")

    for train_user, train_profile in train_profiles.items():
        distance = np.sum((profile - train_profile) ** 2)

        # Strict '<' keeps the first reference user on ties.
        if distance < best_distance:
            best_distance = distance
            best_match = train_user

    us_preds[user] = best_match
    us_confidences[user] = 1 - best_distance

In [None]:
# Attach to every row the schedule match of the row's user and the match confidence.
for column, lookup in (("us_predict", us_preds), ("us_confidence", us_confidences)):
    df_train[column] = df_train["user_id"].map(lookup)

In [None]:
# Show the first 100 rows. The row limit is raised only for this output, so
# later cells do not start printing entire frames.
with pd.option_context('display.max_rows', 100):
    display(df_train.head(100))

In [None]:
# Rows whose schedule match is not the row's own user.
is_mismatch = df_train['user_id'].ne(df_train['us_predict'])
non_matching_rows = df_train.loc[is_mismatch]

shown_columns = ['user_id', 'gate_id', 'date', 'time', 'us_predict']
print(non_matching_rows[shown_columns])

In [None]:
# Share of rows whose schedule match is the row's own user.
acc = accuracy_score(y_true=df_train['user_id'], y_pred=df_train['us_predict'])
print(acc)

In [None]:
# Re-split the training frame now that it also carries the us_* columns.
arr = train_test_split_unique(df_train)
train = arr[0]
test = arr[1]

# Features are every column except the target and the raw date/time strings.
# NOTE(review): the *_predict / *_confidence columns kept in X_train were
# looked up from user_id, i.e. from the target itself — confirm this is intended.
X_train = train.drop(columns=["user_id", "date", "time"])
y_train = train["user_id"]

# Multi-class CatBoost model that predicts user_id (same settings as the earlier fit).
model = CatBoostClassifier(iterations=500, depth=6, learning_rate=0.1, loss_function='MultiClass', verbose=100)
model.fit(X_train, y_train)

In [None]:
# Same feature selection as for training, applied to the held-out part.
X_test = test.drop(columns=["user_id", "date", "time"])
y_test = test["user_id"]

# Flatten the predictions to one label per row in case the model returns
# them as a single-column 2-D array.
y_pred = model.predict(X_test).astype(int).ravel()
# accuracy_score takes the true labels first, then the predictions.
acc = accuracy_score(y_test, y_pred)

print(acc)

In [None]:
# Pack answer.csv into answer.zip in the working directory.
# NOTE(review): no visible cell writes answer.csv — confirm the file exists
# before this cell runs, otherwise zf.write() fails.
with zipfile.ZipFile("answer.zip", "w") as zf:
    zf.write("answer.csv")

In [None]:
def freq_per_day(df):
  """Count the rows of `df` for every calendar day of one year.

  The year is taken from the 'date' value ('YYYY-MM-DD' string) of the first
  row. The result maps each day of that year to its number of rows; days
  without rows map to 0 and rows dated in another year are not counted.
  An empty frame yields an empty dict.
  """
  if len(df) == 0:
    return {}

  # Positional access: df['date'][0] is a label lookup and raises KeyError
  # when the index has no label 0 (e.g. after filtering or shuffling).
  year = int(df['date'].iloc[0][:4])
  d = date(year, 1, 1)

  frq_dct = {}

  while d.year == year:
    frq_dct[d.strftime('%Y-%m-%d')] = 0
    d += timedelta(days=1)

  # Count all dates in one pass, then keep the ones that belong to the year.
  for day, count in df['date'].value_counts().items():
    if day in frq_dct:
      frq_dct[day] = int(count)

  return frq_dct

In [None]:
def woh_workers(woh, df):
  """Return, for each day of `woh` that occurs in `df`, the users seen on it.

  Args:
    woh: iterable of 'YYYY-MM-DD' strings (the days of interest).
    df: frame with 'date' and 'user_id' columns.

  Returns:
    dict mapping each such day (in ascending order) to the list of distinct
    user_id values of its rows, in order of first appearance. Days of `woh`
    without rows in `df` are left out.
  """
  # Filter once and group once instead of rescanning the frame for every day.
  rows_on_days = df[df['date'].isin(list(woh))]

  return {
    day: rows['user_id'].unique().tolist()
    for day, rows in rows_on_days.groupby('date')
  }

In [None]:
# Start the set of days off with every Saturday and Sunday of 2022.
# NOTE(review): 2023 holidays are added to this set later, but 2023 weekends
# are not — confirm whether they are needed.
woh = get_weekends(2022)

In [None]:
# Public-holiday days off according to the production calendar for 2022
holidays_2022 = {
    '2022-01-03', '2022-01-04', '2022-01-05', '2022-01-06', '2022-01-07',  # New Year holidays
    '2022-02-23',               # Defender of the Fatherland Day
    '2022-03-07', '2022-03-08', # International Women's Day
    '2022-05-02', '2022-05-03', # Spring and Labour Day
    '2022-05-09', '2022-05-10', # Victory Day
    '2022-06-13',               # Russia Day
    '2022-11-04',               # National Unity Day
}

# Public-holiday days off according to the production calendar for 2023
holidays_2023 = {
    '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06',  # New Year holidays
    '2023-02-23', '2023-02-24',  # Defender of the Fatherland Day
    '2023-03-08',                 # International Women's Day
    '2023-05-01',                 # Spring and Labour Day
    '2023-05-08', '2023-05-09',   # Victory Day
    '2023-06-12',                 # Russia Day
    '2023-08-04',                 # no idea why this day is marked red — NOTE(review): confirm it is really a day off
    '2023-11-06',                 # National Unity Day
}

In [None]:
# Add the public holidays of both years to the set of days off (in-place set union).
woh |= holidays_2023 | holidays_2022

In [None]:
# Number of training rows per calendar day of the year of the first training row.
frequency_per_day = freq_per_day(df_train)

In [None]:
# Users seen in the training frame on each day off (shown as the cell output).
woh_workers(woh, df_train)