# Машинное обучение для текстов. Проект с BERT

In [None]:
import numpy as np
import pandas as pd
import torch
import transformers
from tqdm import notebook
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [None]:
# Load the labelled comment corpus from a CSV file in the working directory.
df_tweets = pd.read_csv('toxic_comments.csv')
# Development shortcut: uncomment to work on a 2000-row random sample instead of the full file.
#df_tweets = df_tweets.sample(2000).reset_index(drop=True) 
# Print the column names, dtypes and non-null counts of the loaded frame.
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [None]:
# Class balance check: number of rows labelled 0 divided by the number labelled 1.
# A value far from 1 means the two classes are imbalanced.
label_counts = df_tweets['toxic'].value_counts()
class_ratio = label_counts[0] / label_counts[1]
class_ratio

8.834884437596301

In [None]:
# Tokenizer built from the local vocabulary file.
tokenizer = transformers.BertTokenizer(vocab_file='vocab.txt')

# Encode every comment into a list of token ids, with special tokens added
# and the length capped at 512 ids.
tokenized = df_tweets['text'].apply(
    lambda text: tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding=True,
    )
)

# Length of the longest encoded comment (0 when there are no rows).
max_len = max((len(ids) for ids in tokenized.values), default=0)

# Right-pad every id list with zeros up to max_len so the rows form a rectangular array.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

# 1 wherever the id is non-zero, 0 elsewhere (0 is the fill value used for padding above).
attention_mask = np.where(padded != 0, 1, 0)

In [None]:
# Read the BERT architecture settings from the local JSON config file.
config = transformers.BertConfig.from_json_file(
    'config.json')
# Instantiate a bare BertModel with that config and load weights from the local
# checkpoint file. The loader reports that the checkpoint's `cls.*` weights are
# not used by BertModel (see the warning printed below this cell).
model = transformers.BertModel.from_pretrained(
    'pytorch_model.bin', config=config)

Some weights of the model checkpoint at pytorch_model.bin were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Release cached GPU memory held by PyTorch's allocator before moving the model.
torch.cuda.empty_cache()

# Move the BERT model's parameters onto the selected device.
model = model.to(device)

In [None]:
# Run the BERT model over the padded token ids in batches and collect, for each
# comment, the output vector at sequence position 0 (the first special token
# added by the tokenizer).
batch_size = 1
embeddings = []
# Iterate over batch start offsets rather than `n_rows // batch_size` full
# batches, so a trailing partial batch is still processed and the number of
# embeddings always equals the number of rows in `padded`.
for start in notebook.tqdm(range(0, padded.shape[0], batch_size)):
    stop = start + batch_size
    # Create the tensors directly on `device` (selected in an earlier cell)
    # instead of the CUDA-only `torch.cuda.LongTensor`, so this cell also
    # runs on a CPU-only machine.
    batch = torch.tensor(padded[start:stop], dtype=torch.long, device=device)
    attention_mask_batch = torch.tensor(
        attention_mask[start:stop], dtype=torch.long, device=device)

    # Inference only: no gradients are needed, which saves memory.
    with torch.no_grad():
        batch_embeddings = model(batch, attention_mask=attention_mask_batch)

    # First model output, position 0 of every sequence, moved to host memory as NumPy.
    embeddings.append(batch_embeddings[0][:, 0, :].cpu().numpy())
        

  0%|          | 0/159571 [00:00<?, ?it/s]

In [None]:
# Stack the per-batch embedding arrays row-wise into one feature matrix.
features = np.concatenate(embeddings, axis=0)
# Labels come straight from the `toxic` column of the loaded frame.
target = df_tweets['toxic']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    random_state=12345,
    test_size=0.2,
)

In [None]:
# Baseline classifier: logistic regression with default solver and penalty.
# NOTE: this rebinds `model`, which held the BERT encoder in earlier cells.
model = LogisticRegression(max_iter=5000, random_state=12345).fit(
    features_train, target_train)
predictions = model.predict(features_test)
# F1 on the held-out test split.
score = f1_score(y_true=target_test, y_pred=predictions)
print(score)

0.7192429022082019


In [None]:
# Variant: logistic regression without a penalty term, fitted with the newton-cg solver.
# NOTE(review): newer scikit-learn releases may expect `penalty=None` instead of the
# string 'none' - confirm against the installed version before changing it.
model = LogisticRegression(
    max_iter=5000,
    penalty='none',
    random_state=12345,
    solver='newton-cg',
).fit(features_train, target_train)
predictions = model.predict(features_test)
# F1 on the held-out test split.
score = f1_score(y_true=target_test, y_pred=predictions)
print(score)



0.7220090687129403
