In [1]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np
from keras.datasets import imdb
from gensim.downloader import load
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')




In [1]:
"""
Будем решать задачу классификации отзывов на фильмы позитивные/негативные.
"""

'\nБудем решать задачу классификации отзывов на фильмы позитивные/негативные.\n'

In [3]:
(X_train_indices, y_train), (X_test_indices, y_test) = imdb.load_data(num_words=10000)

In [4]:
X_train_indices

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1153, 194, 8255, 78, 228,

In [5]:
# Получение словаря, сопоставляющего индексы со словами
word_index = imdb.get_word_index()
index_to_word = {index: word for word, index in word_index.items()}

# Функция для преобразования последовательности индексов в последовательность слов
def indices_to_words(indices):
    return ' '.join(index_to_word.get(index, "") for index in indices)

# Преобразование последовательностей индексов в последовательности слов
X_train = [indices_to_words(indices) for indices in X_train_indices]
X_test = [indices_to_words(indices) for indices in X_test_indices]

In [9]:
# сокращения в отзывах
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laughter",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "LOL": "Laughing out loud",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don’t care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "IDC": "I don’t care",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "LMAO": "Laughing my a** off",
    "BFF": "Best friends forever",
    "CSL": "Can’t stop laughing",
}

In [10]:
chat_words.keys()

dict_keys(['AFAIK', 'AFK', 'ASAP', 'ATK', 'ATM', 'A3', 'BAK', 'BBL', 'BBS', 'BFN', 'B4N', 'BRB', 'BRT', 'BTW', 'B4', 'CU', 'CUL8R', 'CYA', 'FAQ', 'FC', 'FWIW', 'FYI', 'GAL', 'GG', 'GN', 'GMTA', 'GR8', 'G9', 'IC', 'ICQ', 'ILU', 'IMHO', 'IMO', 'IOW', 'IRL', 'LDR', 'LMAO', 'LOL', 'LTNS', 'L8R', 'MTE', 'M8', 'NRN', 'OIC', 'PITA', 'PRT', 'PRW', 'QPSA', 'ROFL', 'ROFLOL', 'ROTFLMAO', 'SK8', 'STATS', 'ASL', 'THX', 'TTFN', 'TTYL', 'U2', 'U4E', 'WB', 'WTF', 'WTG', 'WUF', 'W8', '7K', 'TFW', 'MFW', 'MRW', 'IFYP', 'TNTL', 'JK', 'IDC', 'ILY', 'IMU', 'ADIH', 'ZZZ', 'WYWH', 'BAE', 'FIMH', 'BSAAW', 'BWL', 'BFF', 'CSL'])

In [11]:
i = 0
for text in X_train:
    for word in text.split():
        if word.upper() in list(chat_words.keys()):
            i+=1

i

475

In [12]:
X_train[0].split()

['the',
 'as',
 'you',
 'with',
 'out',
 'themselves',
 'powerful',
 'lets',
 'loves',
 'their',
 'becomes',
 'reaching',
 'had',
 'journalist',
 'of',
 'lot',
 'from',
 'anyone',
 'to',
 'have',
 'after',
 'out',
 'atmosphere',
 'never',
 'more',
 'room',
 'and',
 'it',
 'so',
 'heart',
 'shows',
 'to',
 'years',
 'of',
 'every',
 'never',
 'going',
 'and',
 'help',
 'moments',
 'or',
 'of',
 'every',
 'chest',
 'visual',
 'movie',
 'except',
 'her',
 'was',
 'several',
 'of',
 'enough',
 'more',
 'with',
 'is',
 'now',
 'current',
 'film',
 'as',
 'you',
 'of',
 'mine',
 'potentially',
 'unfortunately',
 'of',
 'you',
 'than',
 'him',
 'that',
 'with',
 'out',
 'themselves',
 'her',
 'get',
 'for',
 'was',
 'camp',
 'of',
 'you',
 'movie',
 'sometimes',
 'movie',
 'that',
 'with',
 'scary',
 'but',
 'and',
 'to',
 'story',
 'wonderful',
 'that',
 'in',
 'seeing',
 'in',
 'character',
 'to',
 'of',
 '70s',
 'musicians',
 'with',
 'heart',
 'had',
 'shadows',
 'they',
 'of',
 'here',
 

1. Попробовать в конце FastText
2. Этапы предобработки:
    - Замена сокращений
    - Удаление знаков препинания *
    - Удаление чисел *
    - Удаление пробелов *
    - Токенизация
    - Лемматизация
    
Все, что сделаю на train надо и на test, по крайней мере - сами замены сокращений

In [15]:
text = '12 ,   CSL starting'
text

'12 ,   CSL starting'

In [16]:
'12'.isdigit()

True

In [24]:
import string


In [25]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
import re
from tqdm import tqdm
import string
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')

tokenizer = WordPunctTokenizer()
lemmatizer = WordNetLemmatizer()


def remove_digit(text: str) -> str:
    return ' '.join([word for word in text.split() if not word.isdigit()])

def remove_notalpha(text: str) -> str:
    return ' '.join([word for word in text.split() if word.isalpha()])

def remove_punct(text: str) -> str:
    return ' '.join([word for word in text.split() if word not in string.punctuation])
    
def remove_space(text: str) -> str:
    return re.sub('\s+', ' ', text)

def replace_chat_words(text: str) -> str:
    return " ".join([chat_words.get(word.upper(), word).lower() for word in text.split()])

def tokenize(text: str) -> list:
    return tokenizer.tokenize(text)

def lemmatize(tokenize_words: list) -> str:
    return " ".join([lemmatizer.lemmatize(word) for word in tokenize_words])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alex\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alex\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
class DatasetImdb(Dataset):
    def __init__(self, texts, labels, tokenizer, lemmatizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.lemmatizer = lemmatizer
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        text = remove_digit(text)
        text = remove_notalpha(text)
        text = remove_punct(text)
        text = replace_chat_words(text)
        text = remove_space(text)
        words = tokenize(text)
        text = lemmatize(words)
        return text, label
        
        

In [19]:
train_dataset = DatasetImdb(X_train, y_train, tokenizer, lemmatizer)
test_dataset = DatasetImdb(X_test, y_test, tokenizer, lemmatizer)

In [20]:
X_train[0]

"the as you with out themselves powerful lets loves their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room and it so heart shows to years of every never going and help moments or of every chest visual movie except her was several of enough more with is now current film as you of mine potentially unfortunately of you than him that with out themselves her get for was camp of you movie sometimes movie that with scary but and to story wonderful that in seeing in character to of 70s musicians with heart had shadows they of here that with her serious to have does when from why what have critics they is you that isn't one will very to as itself with other and in of seen over landed for anyone of and br show's to whether from than out themselves history he name half some br of and odd was two most of mean for 1 any an boat she he should is thought frog but of script you not while history he heart to real at barrel but when from one bit then have t

In [21]:
X_train_prep = []
for idx in tqdm(range(len(train_dataset))):
    text, _ = train_dataset[idx]  # Получаем текст без метки
    X_train_prep.append(text)



100%|███████████████████████████████████████████████████████████████████████████| 25000/25000 [00:29<00:00, 838.46it/s]


In [22]:
# Обработка данных в test_dataset
X_test_prep = []
for idx in tqdm(range(len(test_dataset))):
    text, _ = test_dataset[idx]  # Получаем текст без метки
    X_test_prep.append(text)

100%|███████████████████████████████████████████████████████████████████████████| 25000/25000 [00:27<00:00, 900.13it/s]


In [22]:
from sklearn.feature_extraction.text import CountVectorizer

Count_vectorizer = CountVectorizer(max_features=10000)

In [24]:
vectorizer = CountVectorizer(max_features=10000)
X_train_bow = vectorizer.fit_transform(X_train_prep).toarray()
X_test_bow = vectorizer.transform(X_test_prep).toarray()

In [25]:
X_train_bow.shape

(25000, 8447)

In [37]:
X_test_bow.shape

(25000, 8447)

In [None]:
"""
Обучим небольшую нейронную сеть на задачу классификации токсичных отзывов
"""

In [27]:
from sklearn.metrics import accuracy_score
import torch.nn as nn
import torch

In [112]:


class SimpleNeuralNet(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNeuralNet, self).__init__()
        self.fc = nn.Linear(input_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc(x)
        out = self.sigmoid(out)
        return out.squeeze()

model = SimpleNeuralNet(input_dim=9774)


def calculate_accuracy(outputs, labels):
    threshold = 0.5
    predicted = (outputs >= threshold)
    correct = (predicted == labels)
    accuracy = correct.sum() / len(labels)
    return accuracy.item()
    
    
epochs = 10
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
def train_neural_net(model,
                     X_train, y_train,
                     X_test, y_test,
                     input_dim=9000, epochs=10,
                     criterion=criterion,
                     optimizer=optimizer):
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.float32)
    y_test = torch.tensor(y_test, dtype=torch.float32)
   


    for epoch in tqdm(range(epochs)):
        loss_epoch = 0
        model.train()
        optimizer.zero_grad()
        pred = model(X_train)
        loss = criterion(pred, y_train)
        loss.backward()
        optimizer.step()
        train_accuracy = calculate_accuracy(pred, y_train)
        
        print(f"Epoch: {epoch}/{epochs}, Loss: {loss.item():.4f}, accuracy_train: {train_accuracy}")
        
        if epoch % 2 == 0:
            model.eval()
            with torch.no_grad():
                test_pred = model(X_test)
                test_loss = criterion(test_pred, y_test)
                test_accuracy = calculate_accuracy(test_pred, y_test)
                print(f'Test Loss: {test_loss.item():.4f}, accuracy_test: {test_accuracy}')
                
        

train_neural_net(model,
                 X_train_bow, y_train,
                 X_test_bow, y_test,
                 input_dim=9000, epochs=10,
                 criterion=criterion,
                 optimizer=optimizer)

 10%|████████▎                                                                          | 1/10 [00:00<00:02,  3.24it/s]

Epoch: 0/10, Loss: 0.7026, accuracy_train: 0.483240008354187
Test Loss: 0.6888, accuracy_test: 0.5388799905776978


 20%|████████████████▌                                                                  | 2/10 [00:00<00:01,  4.04it/s]

Epoch: 1/10, Loss: 0.6883, accuracy_train: 0.5424799919128418
Epoch: 2/10, Loss: 0.6775, accuracy_train: 0.58815997838974


 30%|████████████████████████▉                                                          | 3/10 [00:00<00:01,  3.63it/s]

Test Loss: 0.6705, accuracy_test: 0.626479983329773


 40%|█████████████████████████████████▏                                                 | 4/10 [00:01<00:01,  3.96it/s]

Epoch: 3/10, Loss: 0.6666, accuracy_train: 0.6412400007247925


 50%|█████████████████████████████████████████▌                                         | 5/10 [00:01<00:01,  3.44it/s]

Epoch: 4/10, Loss: 0.6556, accuracy_train: 0.7102800011634827
Test Loss: 0.6521, accuracy_test: 0.7296800017356873


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [00:01<00:01,  3.70it/s]

Epoch: 5/10, Loss: 0.6453, accuracy_train: 0.7557600140571594
Epoch: 6/10, Loss: 0.6359, accuracy_train: 0.7712399959564209


 70%|██████████████████████████████████████████████████████████                         | 7/10 [00:01<00:00,  3.69it/s]

Test Loss: 0.6368, accuracy_test: 0.7448400259017944


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [00:02<00:00,  3.72it/s]

Epoch: 7/10, Loss: 0.6270, accuracy_train: 0.7730799913406372
Epoch: 8/10, Loss: 0.6185, accuracy_train: 0.7757200002670288


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:02<00:00,  3.94it/s]

Test Loss: 0.6225, accuracy_test: 0.7549600005149841
Epoch: 9/10, Loss: 0.6101, accuracy_train: 0.7833999991416931





In [113]:
torch.save(model.state_dict(), 'model.pt')

In [114]:
""" Теперь возьмем предобученный Word2Vec, получим из него эмбеддинги слов """

' Теперь возьмем предобученный Word2Vec, получим из него эмбеддинги слов '

In [28]:
word2vec_model = load('word2vec-google-news-300')

In [29]:
word2vec_model.similar_by_vector(word2vec_model['Google'])

[('Google', 1.0000001192092896),
 ('Google_Nasdaq_GOOG', 0.7819362878799438),
 ('Google_GOOG', 0.7756521105766296),
 ('Google_NASDAQ_GOOG', 0.7557772397994995),
 ('Google_NSDQ_GOOG', 0.7538511753082275),
 ('Yahoo', 0.7491979598999023),
 ('GoogleGoogle', 0.7281472086906433),
 ('search_engine', 0.7255110740661621),
 ('Google_nasdaq_GOOG', 0.701485276222229),
 ('Baidu', 0.6993466019630432)]

In [40]:
word2vec_model.similar_by_vector(word2vec_model['Google'] - word2vec_model['Apple'])

[('Google', 0.5193392038345337),
 ('search_engine', 0.5044638514518738),
 ('search_engines', 0.44424811005592346),
 ('Picsearch', 0.404263973236084),
 ('Webcrawler', 0.39562055468559265),
 ('Search_Engine', 0.3874809443950653),
 ('google.com', 0.3792419731616974),
 ('AskJeeves', 0.3790159225463867),
 ('HotBot', 0.3785134255886078),
 ('MSN', 0.3783949911594391)]

In [41]:
word2vec_model.similar_by_vector(word2vec_model["Paris"] - word2vec_model["France"] + word2vec_model["Germany"])

[('Berlin', 0.7628204822540283),
 ('Frankfurt', 0.7316751480102539),
 ('Dusseldorf', 0.6983391046524048),
 ('Paris', 0.6756227612495422),
 ('Munich', 0.6736832857131958),
 ('Germany', 0.6483182907104492),
 ('Cologne', 0.6413757801055908),
 ('Düsseldorf', 0.6358515024185181),
 ('Stuttgart', 0.6339588165283203),
 ('Budapest', 0.6204262971878052)]

In [42]:
word2vec_model.get_vector

<bound method KeyedVectors.get_vector of <gensim.models.keyedvectors.KeyedVectors object at 0x000001FD14E4CA50>>

In [30]:
def embedded(text: str, word2vec_model) -> list[list]:
    embeddings_for_text = []
    for word in text.split():
        if word2vec_model.has_index_for(word):
            embeddings_for_text.append(word2vec_model.get_vector(word))
        else:
            embeddings_for_text.append(np.zeros(word2vec_model.vector_size))
    return np.mean(embeddings_for_text, axis=0)
            
    
    

In [31]:
X_train_prep[0]

'the a you with out themselves powerful let love their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room and it so heart show to year of every never going and help moment or of every chest visual movie except her wa several of enough more with is now current film a you of mine potentially unfortunately of you than him that with out themselves her get for wa camp of you movie sometimes movie that with scary but and to story wonderful that in seeing in character to of musician with heart had shadow they of here that with her serious to have doe when from why what have critic they is you that one will very to a itself with other and in of seen over landed for anyone of and br to whether from than out themselves history he name half some br of and odd wa two most of mean for any an boat she he should is thought frog but of script you not while history he heart to real at barrel but when from one bit then have two of script their with her nobody

In [32]:
X_train_emb = np.array([embedded(words, word2vec_model) for words in tqdm(X_train_prep)])
X_test_emb = np.array([embedded(words, word2vec_model) for words in tqdm(X_test_prep)])

100%|██████████████████████████████████████████████████████████████████████████| 25000/25000 [00:16<00:00, 1502.21it/s]
100%|██████████████████████████████████████████████████████████████████████████| 25000/25000 [00:16<00:00, 1538.28it/s]


In [33]:
X_train_emb.shape

(25000, 300)

In [None]:
"""

Теперь обучим на эмбедингах от word2vec две модели: простую сеть и TextCNN

"""

In [164]:
class SimpleNeuralNet(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNeuralNet, self).__init__()
        # [25000, 300]
        self.fc = nn.Linear(input_dim, 64)
        self.fc1 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = torch.tensor(x, dtype=torch.float32)
        out = self.fc(x)
        out = self.fc1(out)
        out = self.sigmoid(out)
        return out.squeeze()

def calculate_accuracy(predicted, labels):
    threshold = 0.5
    predicted = predicted >= threshold
    correct = (predicted == labels)
    return correct.sum().item() / len(labels)
    
model = SimpleNeuralNet(input_dim=300)

def train_neural_net(X_train, y_train, X_test, y_test):
    
    
    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_test = torch.tensor(y_test, dtype=torch.float32)
    
    epochs = 10
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    for epoch in tqdm(range(epochs)):
        model.train()
        optimizer.zero_grad()
        loss_epoch = 0
        pred = model(X_train)
        loss = criterion(pred, y_train)
        loss.backward()
        optimizer.step()
        accuracy_train = calculate_accuracy(pred, y_train)
        print(f'Epochs {epoch}/{epochs}, Loss_train: {loss.item():.4f}, accuracy_train: {accuracy_train}')
        
        if epoch % 2 == 0:
            model.eval()
            with torch.no_grad():
                pred_test = model(X_test)
                loss_test = criterion(pred_test, y_test)
                accuracy_test = calculate_accuracy(pred_test, y_test)
                print(f'Epochs {epoch}/{epochs}, Loss_test: {loss_test.item():.4f}, accuracy_test: {accuracy_test}')
                
                
        
        
train_neural_net(X_train_emb, y_train, X_test_emb, y_test)
    

  x = torch.tensor(x, dtype=torch.float32)
 30%|████████████████████████▉                                                          | 3/10 [00:00<00:00, 26.49it/s]

Epochs 0/10, Loss_train: 0.6931, accuracy_train: 0.50008
Epochs 0/10, Loss_test: 0.6929, accuracy_test: 0.53196
Epochs 1/10, Loss_train: 0.6929, accuracy_train: 0.5378
Epochs 2/10, Loss_train: 0.6926, accuracy_train: 0.57864
Epochs 2/10, Loss_test: 0.6924, accuracy_test: 0.57624
Epochs 3/10, Loss_train: 0.6924, accuracy_train: 0.58292
Epochs 4/10, Loss_train: 0.6921, accuracy_train: 0.5706
Epochs 4/10, Loss_test: 0.6919, accuracy_test: 0.56108
Epochs 5/10, Loss_train: 0.6918, accuracy_train: 0.5672


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 28.23it/s]

Epochs 6/10, Loss_train: 0.6916, accuracy_train: 0.57108
Epochs 6/10, Loss_test: 0.6914, accuracy_test: 0.57412
Epochs 7/10, Loss_train: 0.6913, accuracy_train: 0.58284
Epochs 8/10, Loss_train: 0.6910, accuracy_train: 0.59272
Epochs 8/10, Loss_test: 0.6909, accuracy_test: 0.59264
Epochs 9/10, Loss_train: 0.6907, accuracy_train: 0.5968





In [165]:
torch.save(model.state_dict(), 'model_3.pt')

In [47]:
sum(list(map(lambda tokens: "</s>" in tokens, X_train_prep[:])))

0

In [49]:
X_train_prep[0].split()

['the',
 'a',
 'you',
 'with',
 'out',
 'themselves',
 'powerful',
 'let',
 'love',
 'their',
 'becomes',
 'reaching',
 'had',
 'journalist',
 'of',
 'lot',
 'from',
 'anyone',
 'to',
 'have',
 'after',
 'out',
 'atmosphere',
 'never',
 'more',
 'room',
 'and',
 'it',
 'so',
 'heart',
 'show',
 'to',
 'year',
 'of',
 'every',
 'never',
 'going',
 'and',
 'help',
 'moment',
 'or',
 'of',
 'every',
 'chest',
 'visual',
 'movie',
 'except',
 'her',
 'wa',
 'several',
 'of',
 'enough',
 'more',
 'with',
 'is',
 'now',
 'current',
 'film',
 'a',
 'you',
 'of',
 'mine',
 'potentially',
 'unfortunately',
 'of',
 'you',
 'than',
 'him',
 'that',
 'with',
 'out',
 'themselves',
 'her',
 'get',
 'for',
 'wa',
 'camp',
 'of',
 'you',
 'movie',
 'sometimes',
 'movie',
 'that',
 'with',
 'scary',
 'but',
 'and',
 'to',
 'story',
 'wonderful',
 'that',
 'in',
 'seeing',
 'in',
 'character',
 'to',
 'of',
 'musician',
 'with',
 'heart',
 'had',
 'shadow',
 'they',
 'of',
 'here',
 'that',
 'with',
 '

In [50]:
assert sum([0 for x in X_train_prep if 'a' in x.split()]) == 0

In [54]:
torch.tensor([0.0]).dim()

1

In [137]:
from gensim.models import FastText


In [23]:
import gensim
import gensim.downloader as api

# Загрузка модели русского языка
# model = api.load("fasttext-wiki-news-subwords-300")  # Проверьте наличие поддерживаемых моделей


In [24]:
fasttext_model = api.load("fasttext-wiki-news-subwords-300")

In [37]:
fasttext_model['']

KeyError: "Key 'прит' not present"

In [38]:
from gensim.models import KeyedVectors

# Укажите путь к вашему файлу .vec
vec_file_path = 'wiki-news-300d-1M.vec'

# Загрузка модели
fasttext_vectors = KeyedVectors.load_word2vec_format(vec_file_path, binary=False)

# Проверка загрузки
print(f"Модель загружена. Количество слов: {len(fasttext_vectors.key_to_index)}")

# Получение вектора для конкретного слова
word = 'пример'
if word in fasttext_vectors:
    vector = fasttext_vectors[word]
    print(f"Вектор для слова '{word}':\n{vector}")
else:
    print(f"Слово '{word}' отсутствует в модели.")

# Поиск наиболее похожих слов
similar_words = fasttext_vectors.most_similar('пример', topn=5)
print(f"Наиболее похожие слова на 'пример':\n{similar_words}")


Модель загружена. Количество слов: 999994
Вектор для слова 'пример':
[-4.210e-02 -8.830e-02  5.390e-02 -1.480e-02 -2.000e-03  1.144e-01
 -4.920e-02 -2.400e-03 -1.113e-01  5.610e-02 -2.000e-04 -3.400e-03
 -5.460e-02  6.290e-02 -1.980e-02  2.050e-02  1.450e-02 -1.950e-02
  5.990e-02  1.690e-02 -7.770e-02  2.080e-02 -3.420e-02 -9.720e-02
  8.260e-02  2.730e-02  1.640e-02 -1.470e-02  3.530e-02  9.220e-02
  1.092e-01  1.210e-02  7.500e-03 -1.390e-02 -7.330e-02  1.520e-02
  2.190e-02 -5.300e-03 -7.080e-02  1.150e-02  1.420e-02  3.370e-02
 -6.940e-02  2.100e-02  4.130e-02 -8.810e-02 -6.100e-03 -1.680e-02
 -2.410e-02 -1.360e-02 -5.100e-03 -7.370e-02 -8.415e-01  2.510e-02
 -4.320e-02  9.350e-02  2.160e-02 -4.900e-02  3.670e-02  9.940e-02
  4.690e-02 -6.170e-02 -4.200e-02  8.680e-02  8.060e-02  1.810e-02
  5.140e-02 -2.210e-02  4.250e-02 -7.600e-03  7.190e-02  2.490e-02
  1.310e-02  3.150e-02  2.910e-02  4.300e-03 -1.140e-02 -7.850e-02
  4.470e-02 -8.350e-02  1.590e-02 -7.200e-02  7.300e-03  2.1

In [39]:
import numpy as np

def trim_or_pad(vectors, pad_length, pad_vector):
    """
    Обрезает или дополняет список векторов до заданной длины.
    """
    assert pad_vector.ndim == 1
    vectors = vectors[:pad_length] + [pad_vector] * max(0, pad_length - len(vectors))
    return np.stack(vectors)

def seq_to_emb(text: str, fasttext_vectors, pad_length=100) -> np.ndarray:
    """
    Преобразует текст в последовательность эмбеддингов FastText.
    
    :param text: Входной текст.
    :param fasttext_vectors: Загруженная модель FastText.
    :param pad_length: Длина последовательности после обрезки/дополнения.
    :return: Массив эмбеддингов.
    """
    embeddings_for_text = []
    pad_vector = fasttext_vectors['</s>'] if '</s>' in fasttext_vectors else np.zeros(fasttext_vectors.vector_size)
    
    for word in text.split():
        if word in fasttext_vectors:
            embeddings_for_text.append(fasttext_vectors[word])
        else:
            # Для неизвестных слов можно использовать паддинг или средний вектор
            embeddings_for_text.append(pad_vector)
    
    embedd = trim_or_pad(embeddings_for_text, pad_length, pad_vector)
    return embedd

# Пример использования
example_text = "Это пример текста для преобразования в эмбеддинги."
embedding = seq_to_emb(example_text, fasttext_vectors, pad_length=100)
print(embedding.shape)  # Должен вывести (100, размер_вектора)


(100, 300)


In [None]:
# а это для w2v оставлю

def trim_or_pad(vectors, pad_length, pad_vector):
    assert pad_vector.ndim == 1
    vectors = vectors[:pad_length] + [pad_vector] * max(0, pad_length - len(vectors))
    return np.stack(vectors)

def seq_to_emb(text: str, word2vec_model, pad_length=100) -> list[list]:
    embeddings_for_text = []
    for word in text.split():
        if word2vec_model.has_index_for(word):
            embeddings_for_text.append(word2vec_model.get_vector(word))
    embedd = trim_or_pad(embeddings_for_text, pad_length, word2vec_model.get_vector("</s>"))
    return embedd
            
    
seq_to_emb(X_train_prep[1], word2vec_model, pad_length=100).shape

In [43]:
class DatasetImdbClf(Dataset):
    def __init__(self, texts, labels, pad_length):
        super(DatasetImdbClf, self).__init__()
        self.texts = texts
        self.labels = labels
        self.pad_length = pad_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        text = seq_to_emb(text, fasttext_vectors, pad_length=self.pad_length)
        text = torch.tensor(text, dtype=torch.float32)
        label = torch.tensor(label, dtype=torch.float32)
        return text, label

In [44]:
train_dataset = DatasetImdbClf(X_train_prep, y_train, pad_length=100)
test_dataset = DatasetImdbClf(X_test_prep, y_test, pad_length=100)

In [45]:
train_dataset[0][0].shape

torch.Size([100, 300])

In [46]:
batch_size = 32
train_dataloader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=True)
test_dataloader = DataLoader(test_dataset,
                              batch_size=batch_size,
                              shuffle=True)

In [47]:
for batch in train_dataloader:
    print(batch[0].shape)
    break

torch.Size([32, 100, 300])


Всего 6 ядер, по 2 ядра размерами 2,3,4 каждое ядро такого размера будет проходить по 2, 3, 4 слова соответственно

In [48]:
kernel_size = [2, 3, 4] # список размеров фильтров (ядер)
num_channels = 2 # размера канала для выхода ядра

class TextCnn(nn.Module):
    def __init__(self):
        super(TextCnn, self).__init__()
        emb_size = 300
        self.text_filters = nn.ModuleList(
            [nn.Sequential(nn.Conv1d(in_channels=emb_size,
                                    out_channels=num_channels,
                                    kernel_size=kernel_size[i]),
                          nn.ReLU()) for i in range(len(kernel_size))]
        )
        
        self.fc = nn.Linear(in_features=6,
                           out_features=1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        # результаты свертки с разными фильтрами
        rv = []
        for f in self.text_filters:
            result = f(x.permute((0, 2, 1)))
            # maxpooling для 3 индекса
            result = torch.max(result, dim=2)[0]
            rv.append(result)

        x = torch.concat(rv, dim=1)
        x = self.fc(x)
        x = self.sigmoid(x)
        return x
        


In [49]:
model = TextCnn()

In [50]:
model = TextCnn()
batch = next(iter(test_dataloader))

# Разделяем входные данные и метки
inputs, labels = batch

# Передаём только входные данные в модель
output = model(inputs)

In [51]:
output.shape

torch.Size([32, 1])

In [52]:
model

TextCnn(
  (text_filters): ModuleList(
    (0): Sequential(
      (0): Conv1d(300, 2, kernel_size=(2,), stride=(1,))
      (1): ReLU()
    )
    (1): Sequential(
      (0): Conv1d(300, 2, kernel_size=(3,), stride=(1,))
      (1): ReLU()
    )
    (2): Sequential(
      (0): Conv1d(300, 2, kernel_size=(4,), stride=(1,))
      (1): ReLU()
    )
  )
  (fc): Linear(in_features=6, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [53]:
for train_data, train_label in train_dataloader:
    print(train_data.shape)
    print(train_label.shape)
    break

torch.Size([32, 100, 300])
torch.Size([32])


In [54]:
import numpy as np

model = TextCnn()
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

epochs = 10

def accuracy(predicted, labels):
    threshold = 0.5
    predicted = predicted > 0.5
    correct = (predicted == labels).sum().item()
    return correct / labels.size(0)

def train_loop(model, train_dataloader, test_dataloader):
    for epoch in tqdm(range(epochs)):
        model.train()
        epoch_loss = []
        accuracy_epoch = []
        loss_test = []
        accuracy_test = []
        for train_data, train_label in train_dataloader:
            optimizer.zero_grad()
            pred = model(train_data).squeeze(1)
            loss = criterion(pred, train_label)
            accuracy_epoch.append(accuracy(pred, train_label))
            epoch_loss.append(loss.item())
            loss.backward()
            optimizer.step()
        
        
        print(f"Epoch {epoch+1}/{epochs}, loss_train: {np.mean(epoch_loss):.4f}, accuracy_train: {np.mean(accuracy_epoch):.4f}")
        
        if epoch % 2 == 0:
            model.eval()
            with torch.no_grad():
                for test_data, test_label in test_dataloader:
                    pred = model(test_data).squeeze(1)
                    loss = criterion(pred, test_label)
                    accuracy_test.append(accuracy(pred, test_label))
                    loss_test.append(loss.item())    
        
            print(f"Epoch {epoch+1}/{epochs}, loss_test: {np.mean(loss_test):.4f}, accuracy_test: {np.mean(accuracy_test):.4f}")
                
        
train_loop(model, train_dataloader, test_dataloader)

  _torch_pytree._register_pytree_node(
  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

Epoch 1/10, loss_train: 0.6921, accuracy_train: 0.5323


 10%|████████▎                                                                          | 1/10 [00:36<05:26, 36.24s/it]

Epoch 1/10, loss_test: 0.6904, accuracy_test: 0.5639


 20%|████████████████▌                                                                  | 2/10 [00:56<03:33, 26.66s/it]

Epoch 2/10, loss_train: 0.6881, accuracy_train: 0.5673
Epoch 3/10, loss_train: 0.6842, accuracy_train: 0.5767


 30%|████████████████████████▉                                                          | 3/10 [01:32<03:36, 30.95s/it]

Epoch 3/10, loss_test: 0.6840, accuracy_test: 0.5727


 40%|█████████████████████████████████▏                                                 | 4/10 [01:52<02:39, 26.62s/it]

Epoch 4/10, loss_train: 0.6801, accuracy_train: 0.5847
Epoch 5/10, loss_train: 0.6760, accuracy_train: 0.5960


 50%|█████████████████████████████████████████▌                                         | 5/10 [02:28<02:30, 30.08s/it]

Epoch 5/10, loss_test: 0.6777, accuracy_test: 0.5705


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [02:48<01:46, 26.67s/it]

Epoch 6/10, loss_train: 0.6717, accuracy_train: 0.5987
Epoch 7/10, loss_train: 0.6675, accuracy_train: 0.6063


 70%|██████████████████████████████████████████████████████████                         | 7/10 [03:24<01:29, 29.78s/it]

Epoch 7/10, loss_test: 0.6703, accuracy_test: 0.5979


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [03:44<00:53, 26.70s/it]

Epoch 8/10, loss_train: 0.6633, accuracy_train: 0.6175
Epoch 9/10, loss_train: 0.6590, accuracy_train: 0.6243


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [04:21<00:29, 29.78s/it]

Epoch 9/10, loss_test: 0.6632, accuracy_test: 0.6084


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [04:42<00:00, 28.23s/it]

Epoch 10/10, loss_train: 0.6543, accuracy_train: 0.6309





In [136]:
torch.save(model.state_dict(), 'model_text.pt')

In [31]:
#Сохранить  эмбединги можно вот так
torch.save(X_test_emb, "X_test_emb.pt")
# torch.save(y_test, "y_test_emb.pt")