# <center> Финальное задание </center>
## <center> Предсказание пола клиента по транзакциям</center>

## Описание задачи
#### Ваше задание - предсказать пол клиента, основываясь на его транзакционных исторических данных. Выполнение финального задания - это маленький шаг в большую Data Science-вселенную, поэтому отнеситесь к нему максимально серьёзно :)
#### В роли метрики выступает [ROC AUC](https://dyakonov.org/2017/07/28/auc-roc-%D0%BF%D0%BB%D0%BE%D1%89%D0%B0%D0%B4%D1%8C-%D0%BF%D0%BE%D0%B4-%D0%BA%D1%80%D0%B8%D0%B2%D0%BE%D0%B9-%D0%BE%D1%88%D0%B8%D0%B1%D0%BE%D0%BA/), который и нужно будет оптимизировать.

In [None]:
# Install the third-party packages (xgboost, joblib) that the import cell below relies on.
!pip install xgboost
!pip install joblib



In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import re
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import joblib

# tqdm._tqdm_notebook is deprecated (tqdm itself warns about it); tqdm.notebook
# is the supported module that exposes the same tqdm_notebook name.
from tqdm.notebook import tqdm_notebook

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  from tqdm._tqdm_notebook import tqdm_notebook


In [None]:
# Mount Google Drive at /content/drive so the data folder used below is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder the input CSV files are read from.
PATH_DATA = '/content/drive/MyDrive/data'
# File that fit_predict dumps the trained xgboost model to.
MODEL_PATH = "model.pkl"

In [None]:
# Load the data
# Lookup tables: ';'-separated files indexed by their code column.
tr_mcc_codes = pd.read_csv(os.path.join(PATH_DATA, 'mcc_codes.csv'), sep=';', index_col='mcc_code')
tr_types = pd.read_csv(os.path.join(PATH_DATA, 'trans_types.csv'), sep=';', index_col='trans_type')

# Transactions and the train/test client lists, all indexed by client_id.
transactions = pd.read_csv(os.path.join(PATH_DATA, 'transactions.csv'), index_col='client_id')
gender_train = pd.read_csv(os.path.join(PATH_DATA, 'train.csv'), index_col='client_id')
gender_test = pd.read_csv(os.path.join(PATH_DATA, 'test.csv'), index_col='client_id')
# Inner joins on the index keep only transactions whose client_id is in the respective list.
transactions_train = transactions.join(gender_train, how='inner')
transactions_test = transactions.join(gender_test, how='inner')

In [None]:
# Replace every missing value in the training frame with the sentinel -1.
transactions_train = transactions_train.fillna(-1)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.9/776.9 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

import matplotlib.pyplot as plt

from matplotlib.ticker import MaxNLocator

from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from torchmetrics.functional import accuracy

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from multiprocessing import cpu_count

In [None]:
# NOTE(review): LEN_OF_FEATURES is not referenced by any other visible cell.
LEN_OF_FEATURES = 60
# Remove the 'Unnamed: 0' column from the training frame.
transactions_train = transactions_train.drop('Unnamed: 0', axis=1)
# Separate the 'gender' label from the remaining columns.
features, target = transactions_train.drop('gender', axis=1), transactions_train['gender']


In [None]:
# NOTE(review): none of the frames printed in this notebook shows a column named
# 'g'; confirm it exists, since DataFrame.drop raises KeyError for a missing label.
transactions_train = transactions_train.drop('g', axis=1)

In [None]:
# Every column name except the first and the last one.
FEATURE_COLUMNS = transactions_train.columns.tolist()[1:-1]
# Echo the list as the cell output.
FEATURE_COLUMNS

['mcc_code', 'trans_type', 'amount', 'term_id', 'trans_city']

In [None]:
# client_id becomes a regular column so it can serve as the primary sort key.
transactions_train = transactions_train.reset_index()

# trans_time is a "<day> <HH:MM:SS>" string at this point, so sorting on it
# directly is lexicographic (day "101" would come before day "76"). Sort on the
# integer day first; within a single day the zero-padded clock part already
# compares correctly as text.
transactions_train['_trans_day'] = (
    transactions_train['trans_time'].str.split(' ').str[0].astype(int)
)
transactions_train = transactions_train.sort_values(['client_id', '_trans_day', 'trans_time'])

# Remove the helper column and restore client_id as the index.
transactions_train = transactions_train.drop(columns='_trans_day').set_index('client_id')

In [None]:
# Display the sorted training frame as the cell output.
transactions_train

Unnamed: 0_level_0,trans_time,mcc_code,trans_type,amount,term_id,trans_city,gender
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0002cf30347684df542e1a931f356875,10 14:24:11,6011,2010,-2168.99,-1,Saint Petersburg,0
0002cf30347684df542e1a931f356875,10 14:25:35,6011,2010,-722.25,-1,Saint Petersburg,0
0002cf30347684df542e1a931f356875,101 12:27:31,5912,1010,-143.56,-1,Saint Petersburg,0
0002cf30347684df542e1a931f356875,101 12:39:38,5411,1110,-135.98,-1,Saint Petersburg,0
0002cf30347684df542e1a931f356875,102 11:00:51,6011,2010,-3615.32,-1,Saint Petersburg,0
...,...,...,...,...,...,...,...
fffedf876a0ea3d39e54b706165a4826,76 15:14:18,6011,2010,-2168.90,-1,Saint Petersburg,1
fffedf876a0ea3d39e54b706165a4826,79 10:11:50,6011,2010,-2529.91,-1,Saint Petersburg,1
fffedf876a0ea3d39e54b706165a4826,80 12:45:53,6011,2010,-1589.90,-1,Saint Petersburg,1
fffedf876a0ea3d39e54b706165a4826,91 07:18:17,6011,2010,-5784.00,-1,Saint Petersburg,1


In [None]:
def convert_time_to_seconds(time_str):
    """Convert a "<day> <HH:MM:SS>" stamp into a number of seconds.

    Args:
        time_str: Text made of an integer day number, one space and a
            colon-separated clock reading (hours, minutes, seconds).

    Returns:
        ``day * 86400 + hours * 3600 + minutes * 60 + seconds`` as an int.

    Raises:
        ValueError: If the text does not split into exactly two parts on a
            space, or one of the fields is not an integer.
        IndexError: If the clock part has fewer than three fields.
    """
    day_text, clock_text = time_str.split(" ")
    day_offset = int(day_text) * (24 * 60 * 60)
    fields = clock_text.split(":")
    # Only the first three clock fields are read; the weights turn them into seconds.
    clock_offset = int(fields[0]) * 3600 + int(fields[1]) * 60 + int(fields[2])
    return day_offset + clock_offset

# Apply the function to the "trans_time" column and store the result in a new "trans_time_seconds" column
transactions_train['trans_time_seconds'] = transactions_train['trans_time'].apply(convert_time_to_seconds)

# The textual column is dropped once its numeric counterpart exists.
transactions_train = transactions_train.drop('trans_time', axis =1)
# The DataFrame now contains the new "trans_time_seconds" column with the time in seconds
print(transactions_train)

                                  mcc_code  trans_type   amount term_id  \
client_id                                                                 
0002cf30347684df542e1a931f356875      6011        2010 -2168.99     NaN   
0002cf30347684df542e1a931f356875      6011        2010  -722.25     NaN   
0002cf30347684df542e1a931f356875      5912        1010  -143.56     NaN   
0002cf30347684df542e1a931f356875      5411        1110  -135.98     NaN   
0002cf30347684df542e1a931f356875      6011        2010 -3615.32     NaN   
...                                    ...         ...      ...     ...   
fffedf876a0ea3d39e54b706165a4826      6011        2010 -2168.90     NaN   
fffedf876a0ea3d39e54b706165a4826      6011        2010 -2529.91     NaN   
fffedf876a0ea3d39e54b706165a4826      6011        2010 -1589.90     NaN   
fffedf876a0ea3d39e54b706165a4826      6011        2010 -5784.00     NaN   
fffedf876a0ea3d39e54b706165a4826      6011        2010 -2892.89     NaN   

                        

In [None]:
# Display the training frame as the cell output.
transactions_train

In [None]:
from sklearn.preprocessing import LabelEncoder

# One fitted LabelEncoder per categorical column, keyed by column name.
label_encoders = {}
categorical_features = ['trans_type','term_id', 'trans_city']

for feature in categorical_features:
    encoder = LabelEncoder()
    # Values are cast to text first so every entry of the column has one type.
    as_text = transactions_train[feature].astype(str)
    transactions_train[feature] = encoder.fit_transform(as_text)
    label_encoders[feature] = encoder

# The categorical columns now hold integer codes instead of their raw values.

In [None]:
# trans_city is removed from the frame, so it is not part of the sequences built below.
transactions_train = transactions_train.drop('trans_city', axis =1)
# transactions_train = transactions_train.drop('term_id', axis =1)

# Display the resulting frame as the cell output.
transactions_train

Unnamed: 0_level_0,mcc_code,trans_type,amount,term_id,gender,trans_time_seconds
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0002cf30347684df542e1a931f356875,6011,10,-2168.99,285308,0,915851
0002cf30347684df542e1a931f356875,6011,10,-722.25,285308,0,915935
0002cf30347684df542e1a931f356875,5912,1,-143.56,285308,0,8771251
0002cf30347684df542e1a931f356875,5411,4,-135.98,285308,0,8771978
0002cf30347684df542e1a931f356875,6011,10,-3615.32,285308,0,8852451
...,...,...,...,...,...,...
fffedf876a0ea3d39e54b706165a4826,6011,10,-2168.90,285308,1,6621258
fffedf876a0ea3d39e54b706165a4826,6011,10,-2529.91,285308,1,6862310
fffedf876a0ea3d39e54b706165a4826,6011,10,-1589.90,285308,1,6957953
fffedf876a0ea3d39e54b706165a4826,6011,10,-5784.00,285308,1,7888697


In [None]:
import numpy as np

max_seq_length = 40  # Maximum sequence length
padding_value = 0   # Value used for padding

# Dictionary mapping each client to [fixed-length feature matrix, gender label]
# NOTE(review): padding_value 0 is also a valid LabelEncoder code, so padded rows
# are not distinguishable from real rows by value alone.
sequences = {}
grouped_transactions = transactions_train.groupby(transactions_train.index)

for client, client_data in tqdm(grouped_transactions):
    # All rows of one client carry the same label; min() collapses them to a scalar.
    gender_val = client_data.gender.min()
    client_data = client_data.drop('gender', axis=1)

    if len(client_data) < max_seq_length:
        # Short sequence: append padding rows until it has max_seq_length rows
        padding = np.full((max_seq_length - len(client_data), client_data.shape[1]), padding_value)
        padded_data = np.concatenate((client_data.values, padding), axis=0)
        sequences[client] = [padded_data, gender_val]
    else:
        # Sequence with at least max_seq_length rows: keep only the first max_seq_length rows
        sequences[client] = [client_data.iloc[:max_seq_length].values, gender_val]


In [None]:
# Ad-hoc inspection: show the feature matrix stored at position 213.
test = list(sequences.values())
test[213][0]

In [None]:
class TransactionsDataset(Dataset):
    """Dataset over per-client ``[feature_matrix, gender]`` entries.

    Args:
        sequences: Mapping whose values are ``[matrix, gender]`` pairs, where
            ``matrix`` is a 2-D array with at least five columns. Only the
            values are kept, in the mapping's iteration order.
    """

    def __init__(self, sequences):
        super().__init__()
        self.sequences = list(sequences.values())

    def __len__(self):
        """Return the number of stored clients."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the five feature columns and the label of entry ``idx``.

        Returns:
            A 6-tuple of float32 tensors: columns 0-4 of the matrix (each of
            length ``matrix.shape[0]``) followed by the gender label as a
            0-dim tensor.
        """
        item = self.sequences[idx]
        sequence, gender_val = item[0], item[1]

        # torch.tensor always copies, so the stored array is never aliased.
        columns = tuple(
            torch.tensor(np.asarray(sequence[:, col], dtype=np.float32))
            for col in range(5)
        )
        # torch.Tensor(<int>) would interpret the label as a size and return an
        # empty/uninitialised tensor; torch.tensor stores the value itself.
        label = torch.tensor(float(gender_val), dtype=torch.float32)
        return (*columns, label)


In [None]:
# Wrap the per-client sequences and show one sample as a smoke check.
dataset = TransactionsDataset(sequences)
dataset[1]

In [None]:
# Fraction of the dataset held out for evaluation.
test_size = 0.1

BATCH_SIZE = 4
# random_split receives fractions here (0.9 / 0.1) rather than absolute lengths.
train_dataset, test_dataset = random_split(dataset, [1-test_size, test_size])

# Only the training loader shuffles; the evaluation loader keeps a fixed order.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
import torch.nn as nn

class GenderClassifierLSTM(nn.Module):
    """Three chained LSTM blocks followed by a linear layer and a sigmoid.

    Args:
        input_size: Feature size of each time step fed to the first LSTM.
        hidden_size: Hidden size shared by all three LSTM blocks.
        num_layers: Number of stacked layers inside each LSTM block.
        num_classes: Output size of the final linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Modules are created in the order lstm1, lstm2, lstm3, fc.
        self.lstm1 = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.lstm2 = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.lstm3 = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        self.sigmoid = nn.Sigmoid()

    def _zero_state(self, x):
        """Build an all-zero (hidden, cell) pair sized for batch ``x`` on its device."""
        shape = (self.num_layers, x.size(0), self.hidden_size)
        return torch.zeros(shape).to(x.device), torch.zeros(shape).to(x.device)

    def forward(self, x):
        """Run ``x`` (batch first) through the three LSTMs and classify the last step.

        Returns:
            Sigmoid of the linear layer applied to the last time step of the
            third LSTM's output.
        """
        hidden_seq = x
        # Every block starts from its own fresh zero state.
        for block in (self.lstm1, self.lstm2, self.lstm3):
            hidden_seq, _ = block(hidden_seq, self._zero_state(x))

        last_step = hidden_seq[:, -1, :]
        return self.sigmoid(self.fc(last_step))


In [None]:
def validate_model(model, test_loader, device):
    """Return the accuracy (in percent) of ``model`` over ``test_loader``.

    Args:
        model: Module called as ``model(sequence)``; column 0 of its output is
            thresholded at 0.5 to obtain the predicted label.
        test_loader: Iterable yielding ``(sequence, gender_val)`` batches.
        device: Device both tensors of each batch are moved to.

    Returns:
        ``100 * correct / evaluated`` as a float, or ``0.0`` when the loader
        yields no samples.
    """
    # Remember the caller's mode so evaluation restores it instead of forcing train mode.
    was_training = model.training
    model.eval()

    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for sequence, gender_val in test_loader:
            sequence, gender_val = sequence.to(device), gender_val.to(device)

            outputs = model(sequence)

            predicted_labels = (outputs[:, 0] > 0.5).to(torch.int64)
            total_correct += (predicted_labels == gender_val).sum().item()
            # Count what was actually evaluated: the batch size is not fixed
            # and the last batch may be shorter than the others.
            total_samples += predicted_labels.numel()

    model.train(was_training)
    if total_samples == 0:
        return 0.0
    return 100 * total_correct / total_samples


In [None]:
# Number of distinct values per column (missing values, if any, count as a value).
# NOTE(review): mcc_code is not label-encoded above, so its raw codes exceed this
# count; that matters if the sizes are used as embedding table sizes.
vocab_size_mcc_code = transactions_train['mcc_code'].nunique(dropna=False)
vocab_size_trans_type = transactions_train['trans_type'].nunique(dropna=False)
vocab_size_term_id = transactions_train['term_id'].nunique(dropna=False)
# vocab_size_trans_city = len(transactions_train.trans_city.unique())
embedding_dim = 64

In [None]:
# Directory the per-epoch checkpoints are written to.
checkpoint_dir = "checkpoints/"

# Ensure the directory exists
os.makedirs(checkpoint_dir, exist_ok=True)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): GenderClassifierLSTM as defined in this file accepts four
# arguments (input_size, hidden_size, num_layers, num_classes); this call passes
# eight, so it only works with a different class definition — confirm which one
# is intended.
model = GenderClassifierLSTM(4, 128, 4, 2, vocab_size_mcc_code, vocab_size_trans_type,
                 vocab_size_term_id, embedding_dim)

# criterion = nn.CrossEntropyLoss() 
# Binary cross-entropy on probabilities.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
model.to(device)
# Multiply the learning rate by 0.1 every 50 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)
print(device)

cuda


In [None]:
# Number of batches per training epoch.
len(train_dataloader)

1701

In [None]:
# Train for N_EPOCHS epochs; after each epoch step the LR scheduler, measure the
# accuracy on the held-out loader and write a checkpoint.
N_EPOCHS = 250
for epoch in range(N_EPOCHS):
    epoch_loss = 0.0
    # Batches are unpacked as (sequence, gender_val), the same layout
    # validate_model reads from its loader.
    for sequence, gender_val in tqdm(train_dataloader):
        sequence = sequence.to(device)
        # BCELoss expects floating-point targets on the same device as the output.
        gender_val = gender_val.to(device).float()

        outputs = model(sequence)
        # Column 0 is the probability validate_model thresholds, so it is also
        # the value the loss is computed on.
        loss = criterion(outputs[:, 0], gender_val)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    scheduler.step()
    val_accuracy = validate_model(model, test_dataloader, device)

    # Report the measured validation accuracy (in percent).
    print(f"Epoch: {epoch}\tLoss: {epoch_loss / len(train_dataloader)}\t accuracy: {val_accuracy}")

    # One checkpoint file per epoch; 'loss' is the summed (not averaged) epoch loss.
    checkpoint_filename = os.path.join(checkpoint_dir, f"{epoch}_checkpoint_detection.pth")
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': epoch_loss,
    }, checkpoint_filename)

In [None]:
# Helper functions for building the classifier, evaluating it and
# producing predictions for the test clients

# Cross-validation score (mean ROC AUC on the training data)
def cv_score(params, train, y_true):
    """Run 5-fold stratified xgboost cross-validation and print the best round.

    Args:
        params: Booster parameters passed to ``xgb.cv``. The result is read
            from the 'test-auc-mean' / 'test-auc-std' columns, so the
            parameters presumably need to request the AUC metric.
        train: Feature data wrapped into an ``xgb.DMatrix``.
        y_true: Labels for ``train``.

    Returns:
        None; the best mean AUC, its standard deviation and the row index of
        that round are printed.
    """
    dtrain = xgb.DMatrix(train, y_true)
    cv_res = xgb.cv(
        params,
        dtrain,
        num_boost_round=10000,
        nfold=5,
        stratified=True,
        early_stopping_rounds=10,
        maximize=True,
    )
    index_argmax = cv_res['test-auc-mean'].argmax()
    best_round = cv_res.loc[index_argmax]
    print('Cross-validation, ROC AUC: {:.3f}+-{:.3f}, Trees: {}'.format(
        best_round['test-auc-mean'], best_round['test-auc-std'], index_argmax))

# Fit the model and return the classification results for the test clients
def fit_predict(params, num_trees, train, test, target):
    """Train an xgboost model, predict for ``test`` and dump the model to disk.

    Args:
        params: Booster parameters. When 'eta' is present its value is also
            passed as 'learning_rate'. The dict itself is not modified.
        num_trees: Number of boosting rounds.
        train: DataFrame of training features; its column names become the
            feature names.
        test: DataFrame of test features with the same columns as ``train``.
        target: Labels for ``train``.

    Returns:
        ``(clf, submission)``: the trained booster and a DataFrame indexed like
        ``test`` with a single 'probability' column.

    Side effects:
        Writes the booster to ``MODEL_PATH`` with ``joblib.dump``.
    """
    # Work on a copy so the caller's dict is left untouched, and only mirror
    # 'eta' when it was actually supplied instead of raising KeyError.
    params = dict(params)
    if 'eta' in params:
        params['learning_rate'] = params['eta']

    feature_names = list(train.columns)
    clf = xgb.train(params, xgb.DMatrix(train.values, target, feature_names=feature_names),
                    num_boost_round=num_trees, maximize=True)
    y_pred = clf.predict(xgb.DMatrix(test.values, feature_names=feature_names))
    submission = pd.DataFrame(index=test.index, data=y_pred, columns=['probability'])

    joblib.dump(clf, MODEL_PATH)
    return clf, submission

# Plot the feature importances. A feature's importance is the number of splits
# it takes part in; the larger it is, the more useful the feature probably is
def draw_feature_importances(clf, top_k=10):
    """Draw a horizontal bar chart of the ``top_k`` highest-scoring features.

    Args:
        clf: Object whose ``get_score()`` returns a feature-name -> score mapping.
        top_k: Number of highest-scoring features to show.
    """
    plt.figure(figsize=(10, 10))

    # Ascending sort puts the highest score last, i.e. on the top bar.
    ranked = sorted(clf.get_score().items(), key=lambda x: x[1])
    importances = dict(ranked[-top_k:])
    bar_count = len(importances)
    y_pos = np.arange(bar_count)

    plt.barh(y_pos, list(importances.values()), align='center', color='green')
    plt.yticks(y_pos, importances.keys(), fontsize=12)
    plt.xticks(fontsize=12)
    plt.xlabel('Feature importance', fontsize=15)
    plt.title('Features importances, Sberbank Gender Prediction', fontsize=18)
    plt.ylim(-0.5, bar_count - 0.5)
    plt.show()