### 1. Install and import the required packages

In [None]:
!pip install transformers sentence-transformers datasets

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m908.4 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models
from transformers import BertTokenizer
from transformers import get_linear_schedule_with_warmup
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
import time
import datetime
import random
import numpy as np
import pandas as pd

### 2. Use Google Colab's GPU for training

In [None]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


### **3.** Load and preview the Semantic Textual Similarity Benchmark (STSB) dataset

In [None]:
# Load the Russian ("ru") version of the STSB dataset
dataset = load_dataset("stsb_multi_mt", "ru")

Downloading builder script:   0%|          | 0.00/7.43k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/19.0k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.98k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/333k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/101k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 5749
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1379
    })
    dev: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1500
    })
})


In [None]:
print("A sample from the STSB dataset's training split:")
print(dataset['train'][98])

A sample from the STSB dataset's training split:
{'sentence1': 'Мужчина режет картошку.', 'sentence2': 'Женщина чистит картошку.', 'similarity_score': 2.200000047683716}


### **4.** Define the dataset loader class


In [None]:
# Instantiate the BERT tokenizer
# You can use larger variants of the model, here we're using the base model
# The checkpoint name is the same one the BertForSTS model class loads its encoder from.
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

In [None]:
class STSBDataset(torch.utils.data.Dataset):
    """Map-style dataset of sentence pairs with normalized similarity labels.

    Each item is a tuple ``(encoded_pair, label)`` where ``encoded_pair`` is the
    tokenizer output for ``[sentence1, sentence2]`` and ``label`` is the row's
    ``similarity_score`` divided by ``max_score``.

    Args:
        dataset: Iterable of rows exposing the keys 'sentence1', 'sentence2'
            and 'similarity_score'.
        tokenizer: Optional callable used to encode a sentence pair. When left
            as None, the notebook-level ``tokenizer`` is looked up at call time,
            which is what the class did before this parameter existed.
        max_length: Padding/truncation length passed to the tokenizer.
        max_score: Divisor used to normalize the similarity scores.
    """

    def __init__(self, dataset, tokenizer=None, max_length=128, max_score=5.0):
        self.tokenizer = tokenizer
        self.max_length = max_length

        self.first_sentences = []
        self.second_sentences = []
        self.normalized_similarity_scores = []

        # Read every row exactly once instead of iterating the dataset per column.
        for row in dataset:
            self.first_sentences.append(row['sentence1'])
            self.second_sentences.append(row['sentence2'])
            # Normalize the similarity scores in the dataset
            self.normalized_similarity_scores.append(row['similarity_score'] / max_score)

        self.concatenated_sentences = [[str(x), str(y)] for x, y in zip(self.first_sentences, self.second_sentences)]

    def __len__(self):
        return len(self.concatenated_sentences)

    def get_batch_labels(self, idx):
        """Return the normalized score at ``idx`` as a tensor."""
        return torch.tensor(self.normalized_similarity_scores[idx])

    def get_batch_texts(self, idx):
        """Tokenize the sentence pair at ``idx``, padded/truncated to ``max_length``."""
        # No local named `tokenizer` exists here, so the fallback resolves to the notebook global.
        encode = self.tokenizer if self.tokenizer is not None else tokenizer
        return encode(self.concatenated_sentences[idx], padding='max_length', max_length=self.max_length, truncation=True, return_tensors="pt")

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y


def collate_fn(texts):
    """Split a batched encoding into one feature dict per batch entry.

    `texts` must expose 'input_ids' and 'attention_mask'; the two are iterated
    in lockstep and each step yields a dict with those same two keys.
    """
    all_ids = texts['input_ids']
    all_masks = texts['attention_mask']

    features = []
    for ids, mask in zip(all_ids, all_masks):
        features.append({'input_ids': ids, 'attention_mask': mask})
    return features

### 5. Define the model class based on BERT

In [None]:
class BertForSTS(torch.nn.Module):
    """Sentence encoder: a RuBERT transformer followed by a pooling layer.

    The transformer and pooling modules are stored both as direct attributes
    (`bert`, `pooling_layer`) and inside the `sts_bert` SentenceTransformer
    pipeline, so attribute names and assignment order are kept stable for
    saved state dicts.
    """

    def __init__(self):
        super().__init__()
        encoder = models.Transformer('DeepPavlov/rubert-base-cased', max_seq_length=128)
        self.bert = encoder
        # The pooling layer is sized from the transformer's word-embedding dimension.
        self.pooling_layer = models.Pooling(encoder.get_word_embedding_dimension())
        self.sts_bert = SentenceTransformer(modules=[self.bert, self.pooling_layer])

    def forward(self, input_data):
        """Run the pipeline on `input_data` and return its 'sentence_embedding' entry."""
        features = self.sts_bert(input_data)
        return features['sentence_embedding']

In [None]:
# Instantiate the model and move it to GPU
model = BertForSTS()
model.to(device)

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForSTS(
  (bert): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (pooling_layer): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (sts_bert): SentenceTransformer(
    (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  )
)

### 6. Define the Cosine Similarity loss function

In [None]:
class CosineSimilarityLoss(torch.nn.Module):
    """Loss between the cosine similarity of embedding pairs and target scores.

    Args:
        loss_fn: Module comparing predicted similarities with the labels
            (mean squared error by default).
        transform_fn: Module applied to the cosine similarities before the
            loss (identity by default).
    """

    def __init__(self,  loss_fn=torch.nn.MSELoss(), transform_fn=torch.nn.Identity()):
        super(CosineSimilarityLoss, self).__init__()
        self.loss_fn = loss_fn
        self.transform_fn = transform_fn
        self.cos_similarity = torch.nn.CosineSimilarity(dim=1)

    def forward(self, inputs, labels):
        """Compute the loss for one batch.

        Args:
            inputs: Iterable whose items hold the two embeddings of a pair at
                index 0 and index 1.
            labels: Tensor with one target similarity per pair.

        Returns:
            The value produced by ``loss_fn``.
        """
        emb_1 = torch.stack([inp[0] for inp in inputs])
        emb_2 = torch.stack([inp[1] for inp in inputs])
        outputs = self.transform_fn(self.cos_similarity(emb_1, emb_2))
        # reshape(-1) flattens like squeeze() did for (B,) and (B, 1) labels, but keeps
        # a single-pair batch 1-D so the target size still matches `outputs`.
        return self.loss_fn(outputs, labels.reshape(-1))

### 7. Prepare the training and validation data split

In [None]:
# Wrap the dataset's predefined 'train' and 'dev' splits for training and validation.
train_ds = STSBDataset(dataset['train'])
val_ds = STSBDataset(dataset['dev'])

# No extra split is carved out here: the sizes below are those of the provided splits.
train_size = len(train_ds)
val_size = len(val_ds)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

5,749 training samples
1,500 validation samples


In [None]:
batch_size = 8

# Training loader: batches of `batch_size` drawn in shuffled order.
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4)

# Validation loader: same batch size, no shuffling.
validation_dataloader = DataLoader(val_ds, batch_size=batch_size, num_workers=4)



### 8. Define the Optimizer and Scheduler

In [None]:
# AdamW over every model parameter, with a fixed base learning rate of 1e-6.
optimizer = AdamW(model.parameters(), lr=1e-6)

In [None]:
epochs = 8

# The schedule covers every optimizer step: batches per epoch times the number of epochs.
total_steps = epochs * len(train_dataloader)

# Linear schedule with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

### 9. Define a helper function for formatting the elapsed training time as `hh:mm:ss`

In [None]:
# Turns a duration in seconds into the string form of a timedelta (e.g. 0:05:56)
def format_time(elapsed):
    """Round `elapsed` seconds to a whole second and format it via `datetime.timedelta`."""
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

### 10. Define the training function, and start the training loop

In [None]:
def _embed_pairs(batch):
    """Move one tokenized batch to `device` and embed each sentence pair with `model`.

    `batch` is modified in place: its 'input_ids' and 'attention_mask' entries are
    replaced by their `.to(device)` versions. Returns a list holding one model
    output per feature dict produced by `collate_fn`.
    """
    batch['input_ids'] = batch['input_ids'].to(device)
    batch['attention_mask'] = batch['attention_mask'].to(device)
    return [model(feature) for feature in collate_fn(batch)]


def train():
    """Fine-tune the notebook-level `model` and validate it after every epoch.

    Relies on names defined in earlier cells: `model`, `optimizer`, `scheduler`,
    `epochs`, `train_dataloader`, `validation_dataloader` and `device`.

    Returns:
        A tuple ``(model, training_stats)`` where ``training_stats`` is a list
        with one dict per epoch containing the keys 'epoch', 'Training Loss',
        'Valid. Loss', 'Training Time' and 'Validation Time'.
    """
    seed_val = 42

    criterion = CosineSimilarityLoss()
    criterion = criterion.to(device)

    # Seed every random number generator the notebook imports.
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)

    # We'll store a number of quantities such as training and validation loss,
    # and timings.
    training_stats = []
    total_t0 = time.time()

    for epoch_i in range(0, epochs):

        # ========================================
        #               Training
        # ========================================

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()

        total_train_loss = 0

        model.train()

        # For each batch of training data...
        for train_data, train_label in tqdm(train_dataloader):

            model.zero_grad()

            output = _embed_pairs(train_data)

            loss = criterion(output, train_label.to(device))
            total_train_loss += loss.item()

            loss.backward()
            # Clip the gradient norm to 1.0 before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            # The scheduler advances once per batch, matching `total_steps`.
            scheduler.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.5f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))

        # ========================================
        #               Validation
        # ========================================

        print("")
        print("Running Validation...")

        t0 = time.time()

        model.eval()

        total_eval_loss = 0

        # Evaluate data for one epoch
        for val_data, val_label in tqdm(validation_dataloader):

            # No gradients are needed for either the forward pass or the loss here.
            with torch.no_grad():
                output = _embed_pairs(val_data)
                loss = criterion(output, val_label.to(device))

            total_eval_loss += loss.item()

        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader)

        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.5f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        # Record all statistics from this epoch.
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    return model, training_stats

In [None]:
# Launch the training
model, training_stats = train()


Training...


100%|██████████| 719/719 [05:55<00:00,  2.02it/s]



  Average training loss: 0.05242
  Training epoch took: 0:05:56

Running Validation...


100%|██████████| 188/188 [00:26<00:00,  7.18it/s]


  Validation Loss: 0.04468
  Validation took: 0:00:26

Training...


100%|██████████| 719/719 [05:54<00:00,  2.03it/s]



  Average training loss: 0.04015
  Training epoch took: 0:05:54

Running Validation...


100%|██████████| 188/188 [00:26<00:00,  7.21it/s]


  Validation Loss: 0.04012
  Validation took: 0:00:26

Training...


100%|██████████| 719/719 [05:54<00:00,  2.03it/s]



  Average training loss: 0.03570
  Training epoch took: 0:05:55

Running Validation...


100%|██████████| 188/188 [00:26<00:00,  7.18it/s]


  Validation Loss: 0.03828
  Validation took: 0:00:26

Training...


100%|██████████| 719/719 [05:53<00:00,  2.03it/s]



  Average training loss: 0.03250
  Training epoch took: 0:05:54

Running Validation...


100%|██████████| 188/188 [00:25<00:00,  7.23it/s]


  Validation Loss: 0.03669
  Validation took: 0:00:26

Training...


100%|██████████| 719/719 [05:54<00:00,  2.03it/s]



  Average training loss: 0.03014
  Training epoch took: 0:05:54

Running Validation...


100%|██████████| 188/188 [00:25<00:00,  7.24it/s]


  Validation Loss: 0.03588
  Validation took: 0:00:26

Training...


100%|██████████| 719/719 [05:53<00:00,  2.04it/s]



  Average training loss: 0.02894
  Training epoch took: 0:05:53

Running Validation...


100%|██████████| 188/188 [00:26<00:00,  7.22it/s]


  Validation Loss: 0.03553
  Validation took: 0:00:26

Training...


100%|██████████| 719/719 [05:53<00:00,  2.03it/s]



  Average training loss: 0.02786
  Training epoch took: 0:05:54

Running Validation...


100%|██████████| 188/188 [00:26<00:00,  7.18it/s]


  Validation Loss: 0.03515
  Validation took: 0:00:26

Training...


100%|██████████| 719/719 [05:52<00:00,  2.04it/s]



  Average training loss: 0.02784
  Training epoch took: 0:05:53

Running Validation...


100%|██████████| 188/188 [00:26<00:00,  7.23it/s]

  Validation Loss: 0.03519
  Validation took: 0:00:26

Training complete!
Total training took 0:50:42 (h:mm:ss)





In [None]:
# Tabulate the per-epoch training statistics, with 'epoch' as the row index
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')

# Display the table
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.052424,0.044685,0:05:56,0:00:26
2,0.040147,0.040125,0:05:54,0:00:26
3,0.035699,0.03828,0:05:55,0:00:26
4,0.032497,0.036693,0:05:54,0:00:26
5,0.030141,0.035884,0:05:54,0:00:26
6,0.028935,0.035532,0:05:53,0:00:26
7,0.027857,0.035146,0:05:54,0:00:26
8,0.027838,0.035186,0:05:53,0:00:26


In [None]:
test_dataset = load_dataset("stsb_multi_mt", name="ru", split="test")

# Pull out both sentence columns, then pair them up as [sentence1, sentence2] strings
first_sent = [example['sentence1'] for example in test_dataset]
second_sent = [example['sentence2'] for example in test_dataset]
full_text = [[str(left), str(right)] for left, right in zip(first_sent, second_sent)]

In [None]:
model.eval()

def predict_similarity(sentence_pair):
  """Return the cosine similarity between the embeddings of two sentences.

  Args:
    sentence_pair: Two-element sequence of strings, tokenized together as a
      batch of two and padded/truncated to 128 tokens.

  Returns:
    The cosine similarity of the two embeddings as a Python float.

  Uses the notebook-level `tokenizer`, `model` and `device`; switch the model
  to evaluation mode with `model.eval()` before calling.
  """
  test_input = tokenizer(sentence_pair, padding='max_length', max_length = 128, truncation=True, return_tensors="pt").to(device)
  # The model is fed only ids and mask during training, so drop the segment ids
  # if the tokenizer produced them (no error when they are absent).
  test_input.pop('token_type_ids', None)
  # Inference only: skip autograd bookkeeping, which matters when this is called per row.
  with torch.no_grad():
    output = model(test_input)
  sim = torch.nn.functional.cosine_similarity(output[0], output[1], dim=0).item()

  return sim

In [None]:
example_1 = full_text[100]
print(f"Sentence 1: {example_1[0]}")
print(f"Sentence 2: {example_1[1]}")
print(f"Predicted similarity score: {round(predict_similarity(example_1), 2)}")

Sentence 1: Кошка гуляет по дому.
Sentence 2: Женщина чистит картошку.
Predicted similarity score: 0.22


In [None]:
example_2 = full_text[130]
print(f"Sentence 1: {example_2[0]}")
print(f"Sentence 2: {example_2[1]}")
print(f"Predicted similarity score: {round(predict_similarity(example_2), 2)}")

Sentence 1: Двое мужчин играют в футбол.
Sentence 2: Двое мужчин занимаются футболом.
Predicted similarity score: 0.78


In [None]:
example_3 = full_text[812]
print(f"Sentence 1: {example_3[0]}")
print(f"Sentence 2: {example_3[1]}")
print(f"Predicted similarity score: {round(predict_similarity(example_3), 2)}")

Sentence 1: Это зависит от ситуации.
Sentence 2: Это варьируется в зависимости от учреждения.
Predicted similarity score: 0.49


### Last but not least, save your model!

In [None]:
from google.colab import drive
drive.mount('/content/drive')
home = "/content/drive/My Drive/Colab Notebooks/hackaton/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
PATH = home + 'bert-sts.pt'
torch.save(model.state_dict(), PATH)

In [None]:
# In order to load the model
# First, you have to create an instance of the model's class
# And use the saving path for the loading
# Don't forget to set the model to the evaluation state using .eval()
model = BertForSTS()
# map_location remaps the stored tensors onto the current device, so a checkpoint
# saved from GPU tensors also loads on a CPU-only runtime.
model.load_state_dict(torch.load(PATH, map_location=device))
model.to(device)
model.eval()

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForSTS(
  (bert): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (pooling_layer): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (sts_bert): SentenceTransformer(
    (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  )
)

In [None]:
import pandas as pd

df_copy = pd.read_csv(home+'data_cleaned4_combined.csv')
df_copy['combined'] = df_copy['combined'].fillna('')

In [None]:
df_copy['combined']

0        владимир кунин интер девочка иванов рабинович ...
1        хотеть болото спросить аспирант лаборант-иссле...
2                                                         
3        пытаться научиться расставлять приоритет любим...
4                                                         
                               ...                        
62624                                                     
62625                                                     
62626    целеустремлённый ответственнаявыносливать кому...
62627                                                     
62628                                                     
Name: combined, Length: 62629, dtype: object

In [None]:
top_n = 10
country = 'Россия'
city = 'Санкт-Петербург'
about = 'программист'
activities = 'люблю программировать'
books = 'грокаем алгоритмы'
games = 'fifa, counter-strike'
interests = 'программирование, сноуборд, шахматы'
education_form = 'Очное отделение'
education_status = 'Студент'

In [None]:
!pip install pymorphy2

In [None]:
import re

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

import pymorphy2

nltk.download('punkt')
nltk.download('stopwords')

stopwords_ru = stopwords.words('russian')
morph = pymorphy2.MorphAnalyzer()

def clean_text(text: str) -> str:
    """Lowercase `text`, strip unwanted characters and collapse whitespace.

    Removed: tokens starting with 'http' or 'ftp' up to the next whitespace,
    every character other than a-z, а-я, ё, whitespace, '/' and '-', and runs
    of two or more hyphens.
    """
    lowered = text.lower()
    filtered = re.sub(r'((http|ftp)\S+)|[^a-zа-яё\s/-]|(-{2,})', '', lowered)
    collapsed = re.sub(r'\s+', ' ', filtered)
    return collapsed.strip()

def remove_stopwords(tokens: list, stopwords=None) -> list:
    """Drop stopwords and tokens of two characters or fewer.

    When `stopwords` is empty or None the input list is returned untouched,
    so short tokens are kept in that case as well.
    """
    if stopwords:
        blocked = set(stopwords)
        return [tok for tok in tokens if not (tok in blocked or len(tok) <= 2)]

    return tokens

def text_preparation(text: str) -> str:
    """Clean, tokenize, filter and lemmatize `text`; return the lemmas joined by spaces.

    Steps: `clean_text`, NLTK `word_tokenize` for Russian, `remove_stopwords`
    with `stopwords_ru`, then the normal form of the first `morph.parse` result
    for each remaining token.
    """
    cleaned = clean_text(text)
    words = remove_stopwords(word_tokenize(cleaned, language='russian'), stopwords_ru)
    lemmas = (morph.parse(word)[0].normal_form for word in words)

    return ' '.join(lemmas)

In [None]:
# Join the free-text profile fields with single spaces, then run the text-preparation pipeline on the result
user_input = text_preparation(' '.join([about, activities, books, games, interests]))

In [None]:
def calc_similarity(row):
    """Score one row's 'combined' text against the notebook-level `user_input`."""
    profile_text = row['combined']
    return predict_similarity([profile_text, user_input])

In [None]:
# Score every profile against the query text. This makes one model call per row,
# so it is slow on the full frame.
df_copy['similarity'] = df_copy.apply(calc_similarity, axis=1)
df_copy['similarity'].sort_values(ascending=False)

48831    0.685634
23103    0.667006
45718    0.658237
35785    0.655627
45185    0.651596
           ...   
46276   -0.017179
11894   -0.027391
38985   -0.029244
31405   -0.029662
14352   -0.038552
Name: similarity, Length: 62629, dtype: float64

In [None]:
user_input

'программист любить программировать грокать алгоритм fifa counter-strike программирование сноуборд шахматы'

In [None]:
# Show the `top_n` most similar profiles (top_n is set in the query-parameters cell)
df_copy.sort_values(by=['similarity'], ascending=False).head(top_n)

Unnamed: 0,country,city,about,activities,books,games,interests,education_form,education_status,university_name,faculty_name,graduation,same_universities_idx,combined,similarity
48831,Россия,Череповец,ничего личного спрашивайте в личку если что,,S.T.A.L.K.E.R по лазейкам в игре самой,Counter-Strike 1.6;Warcraft S.T.A.L.K.E.R дота...,"любля гулять, играть и очень сильно интересуют...",Очное отделение,Студент (специалист),ЧГУ,Металлургический,2009,132,личный спрашивать личка stalker лазейка игра с...,0.685634
23103,Россия,Видяево,ПрОфИсИоНаЛьНыЙ ФуДбОлИсТ,Учусь и хожу на фудбол,,Counter-Srike 1.6,онлайн игры,Очное отделение,Студент (специалист),РХТУ им. Д. И. Менделеева,Инженерный химико-технологический,2016,98,профисиональный фудболист учиться ходить фудбо...,0.667006
45718,Россия,Москва,,,Каэнпе и капе,Frontline Tactics,Имею цель отлично играть в шахматы,Очное отделение,Выпускник,НГУЭУ,Машиностроительный факультет,0,236,каэнп капа frontline tactics иметь цель отличн...,0.658237
35785,Россия,Воронеж,"я русый,глаза зелёные,люблю играть в комп!","школьник, занимаюсь стрельбой",сталкер,"сталкер, nfs, barnaut, flatout, splinter cell","компы, стрельба из базуки, сталкер",Очное отделение,Абитуриент,ВГУ,Юридический,0,35,русыйглаз зелёныелюбить играть комп школьник з...,0.655627
45185,Россия,Москва,,,"Дачные правила, двадцать шесть (Выписки из дне...",Sharf,Очень хочу отлично играть в шашки,Очное отделение,Выпускник,"СЗГМУ им. Мечникова (бывш. СПбГМА, СПбМАПО, ЛС...",Социально-культурной деятельности,0,165,дачный правило двадцать шесть выписка дневник ...,0.651596
8631,Россия,Москва,,,"Последний черт, потерянный день, правая рука",Football Manager 2016,Очень хочу обучиться игре в компьютерные игры,Очное отделение,Выпускник,МГУ,Специальной педагогики и психологии,0,0,последний черта потерянный день правый рука fo...,0.646509
45206,Россия,Воронеж,,,Алмазный язык,Football Superstars,Очень хочу научиться играть компьютерные игры,Очное отделение,Аспирант,"СЗГМУ им. Мечникова (бывш. СПбГМА, СПбМАПО, ЛС...",Институт управления и предпринемательства,0,165,алмазный язык football superstars очень хотеть...,0.644241
21042,Россия,Москва,"Эволюционируем потихоньку (непонятно, в каком ...",Инженер систем и программист программ,"Лукьяненко, Стругацкие, Азимов","Игры Nintendo, игры серии DOOM, игры серии Ace...","Программирование, видеоигры, сборка компьютеров",Очное отделение,Выпускник (специалист),НИЯУ МИФИ,Факультет автоматики и электроники (А),2020,50,эволюционировать потихоньку непонятно какой на...,0.644181
46453,Россия,Москва,,,Бриз,Endorlight,Имею цель отлично играть в шашки и шахматы,Очно-заочное отделение,Студент (специалист),СибГМУ (ТМИ),Юридический,0,568,бриз endorlight иметь цель отлично играть шашк...,0.644051
48741,Россия,Талдом,О себе или хорошо или ничего.,Деятельность и участие,Читаю всё подряд,"Шахматы, TF2, Сапёр","Астрофизика, Detection of the Dark Matter",Заочное отделение,Выпускник (специалист),МИРЭА (до 2015),Заочной подготовки,2012,76,деятельность участие читать всё подряд шахматы...,0.642466
