# Analiza danych z pliku tekstowego Cell_Phones_&_Accessories.txt
***
# 1 Cel zadania
Celem w zadaniu jest analiza danych z pliku tekstowego Cell_Phones_&_Accessories.txt oraz uzyskanie z niego jak największej ilości informacji.
# 2 Propozycje informacji które można uzyskać z tekstu
- Ilu gwiazdkom odpowiada dana recenzja? Pozowli wyeliminować system oceniania za pomocą gwiazdek
- Generowanie tytułu artykułu na podstawie tekstu w nim zawartego.
- 
# Uwaga
Komentarze we fragmentach kodu pisane są w języku angielskim

In [1]:
**Drugim zadaniem będzie stworzenie sieci neuronowej która będzie w stanie na podstawie tekstu recenzji generować będzie jej krótki opis**

**Przed przystąpieniem do uzyskiwania informacji należy wczytać dane do programu. Utwórzmy w tym celu generator kolejnych recenzji**

In [2]:
from __future__ import annotations
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, Flatten, Dense, Dropout, LSTM, concatenate
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.models import Model
from tensorflow.keras import Input
import numpy as np
import matplotlib.pyplot as plt
import shutil

import numpy as np
import os
import random

phones_file_name: str
phones_file_name = "Cell_Phones_&_Accessories.txt"
# Just the name of our input file. Now we dont need to remember it 

def parse_data(filename: str):
    """This function creates a generator for the data in the document 
    
    Parameters
    ----------
    filename: str
        A file name we want to read from
    """
    file = open(filename, 'r')
    entry: dict
    entry = {}
    for line in file:
        line = line.strip()
        colonPos = line.find(':')
        if colonPos == -1:
            yield entry
            entry = {}
            continue
        elem_name = line[:colonPos]
        elem_val = line[colonPos+2:]
        entry[elem_name] = elem_val
    yield entry

Stworzenie funkcji która zapewni dane treningowe. W sieci neuronowej mamy 2 wejścia:
1 Wejście tekstu recenzji
2 Wejście aktualnego stanu tytułu

Tytuł recenzji we wczytanym przez nas pliku tekstowym ma $N$ słów i ma on postać:
$[słowo1, słowo2,...,słowoN]$.

Możemy zatem wyodrębnić z niego następujące sekwencje:
$[]$

$[słowo1]$

$[słowo1, słowo2]$

        .
        
        .
        
        .
        
$[słowo1, słowo2,...,słowoN]$

Dla każdej recenzji będziemy mieli zatem $N+1$ możliwych kombinacji sekwencji danych w tytule.


In [3]:
def load_n_data(gen: generator, n: int, attr_data: str,
                attr_labels: str) -> tuple(list, list):
    """This function creates data and labels using generator
    
    This function creates data and labels using generator
    
    Parameters
    ----------
    gen: generator
        We will generate values from this generator
    attr_data: str
        A dictionary key defining a value we want to append to the output list
    attr_labels: str
        A dictionary key defining a value we want to append to the output list
    Returns
    ----------
    list
        A list containing data we were asking for
    list
        A list containing labels we were asking for
    
    """
    out_data: list = []
    out_labels: list = []
    
    for i in range(n):
        next_dict = next(gen)
        if next_dict == {}:
            # Out of data in generator
            return (out_data, out_labels)
        next_value_data: dict = next_dict.get(attr_data)
        # Get the data from our dict
        
        next_value_labels = next_dict.get(attr_labels)    
        while(next_value_data is None or next_value_labels is None):
            # Given attributes not found. Check next
            next_dict = next(gen)
            if next_dict == {}:
                # We are out of data!Just return what we have# Just return what we have
                return (out_data, out_labels)
                
            next_value_data = next_dict.get(attr_data)
            next_value_labels = next_dict.get(attr_labels)-1
        out_data.append(next_value_data)
        out_labels.append(next_value_labels)
    return (out_data, out_labels)

def generate_dataset_titles(gen: generator, attr_data: str, attr_labels: str,
                            train_n: int = 230000, 
                            max_words: int = 20000) -> tuple[tuple[list, list],
                                                  tuple[list, list],
                                                  tuple[list, list]]:   
    """This function creates a dataset for title creation by using a generator
    
    This function creates a dataset for title creation by using a generator
    
    Parameters
    ----------
    gen: generator
        We will generate values from this generator
    attr_data: str
        A dictionary key defining a data we want to append to the output list
    attr_labels: str
        A dictionary key defining a labels we want to append to the output list
    train_n: int
        Defines a size of the data lists
    max_words: int
        Number of unique words in dataset
    Returns
    ----------
    tokenizer:
        A tokenizer used to tokenize reviews
    tuple[list, list, list]
        A tuple of train_review_data, train_title_data, train_labels
    tuple[list, list, list]
        A tuple of test_review_data, test_title_data, test_labels

    """
    # Train data and labels
    str_train_review_data: list = []
    str_train_title_data: list = []    
    str_train_labels: list = []

    out_train_review_data: list = []
    out_train_title_data: list = []   
    out_train_labels: list = []    
    
    # Test data and labels
    str_test_review_data: list = []
    str_test_title_data: list = []
    str_test_title_labels: list = []

    out_test_review_data: list = []
    out_test_title_data: list = []
    out_test_labels: list = []
    
    str_all_data: list = []
    str_all_labels: list = []

    out_all_review_data: list = []
    out_all_title_data: list = []
    out_all_labels: list = [] 
    
    # Load all possible data
    (str_all_data, str_all_labels) = load_n_data(gen, 100000, attr_data, attr_labels)
    
    
    tokenizer = Tokenizer(num_words=max_words)
    # Fit using all the data
    tokenizer.fit_on_texts(str_all_data)
    temp_all_data = tokenizer.texts_to_sequences(str_all_data)
    temp_all_labels = tokenizer.texts_to_sequences(str_all_labels)
    
    # Split data for different inputs and generate correnc output
    for i, review in enumerate(temp_all_data):
        for j, word in enumerate(temp_all_labels[i]):
            out_all_labels.append(word)
            out_all_review_data.append(review)
            
            if j == len(temp_all_labels[i])-1:
                break
            elif j == 0:
                out_all_title_data.append([])
            else:
                new_item: list = out_all_title_data[-1] + [temp_all_labels[i][j-1]]
                out_all_title_data.append(new_item)
            
    # Just to randomize set
    list_all = list(zip(out_all_review_data, out_all_title_data, out_all_labels))
    random.shuffle(list_all)  
    out_all_review_data, out_all_title_data, out_all_labels = zip(*list_all)
    
    
    # Generate train raw data
    out_train_review_data = out_all_review_data[:train_n]
    out_train_title_data = out_all_title_data[:train_n]
    out_train_labels = out_all_labels[:train_n]
    # Generate test raw data
    out_test_review_data = out_all_review_data[train_n:]
    out_test_title_data = out_all_title_data[train_n:] 
    out_test_labels = out_all_labels[train_n:] 
    # Generate validation data. It contains the rest of data
    #out_val_data = out_all_data[train_n+test_n:]
    #out_val_labels = out_all_labels[train_n+test_n:]
        
    return tokenizer, (out_train_review_data, out_train_title_data, out_train_labels), (out_test_review_data, out_test_title_data, out_test_labels)

Przy użyciu funkcji generującej utwórzmy dataset. W zbiorze treningowym znajdzie się 230000 próbek, a w testowym reszta, czyli 24.446

In [4]:
max_words = 20000
max_len_review = 500
max_len_title = 10
gen = parse_data(phones_file_name)
my_tokenizer, (train_review_data, train_title_data, train_labels), (test_review_data, test_title_data, test_labels) = generate_dataset_titles(gen, "review/text", "review/summary", max_words=max_words)

# Just to be sure
print(len(train_review_data))
print(len(train_title_data))
print(len(train_labels))
print(len(test_review_data))
print(len(test_title_data))
print(len(test_labels))

train_review_data = pad_sequences(train_review_data, maxlen = max_len_review)
test_review_data = pad_sequences(test_review_data, maxlen = max_len_review)

train_title_data = pad_sequences(train_title_data, maxlen = max_len_title)
test_title_data = pad_sequences(test_title_data, maxlen = max_len_title)

train_labels = to_categorical(train_labels, num_classes=max_words)
test_labels = to_categorical(test_labels, num_classes=max_words)

230000
230000
230000
24446
24446
24446


Stwórzmy funkcję generującą następne słowo na podstawie rozkładu prawdopodobieństwa wystąpienia go. Większy epsilon to większa losowość działań. Dla 1.0 korzystamy z rozkładu podanego na wejściu

In [5]:
def get_word(predictions:np.array, epsilon:float=0.5):
    predictions = np.log(predictions) / epsilon
    predictions = np.exp(predictions)
    predictions = predictions/ np.sum(predictions)
    pred: float = np.random.multinomial(1, predictions, 1)
    return np.argmax(pred)

Próbujemy przewidzieć następne słowa na podstawie dwóch wejść.

1 Tekst recenzji <br>
2 Słowa które znalazły się już w tytule recenzji i czekają na kompana


In [7]:
try:
    shutil.rmtree('results/generator')
except:
    pass

os.mkdir('results/generator')

callbacks = [TensorBoard(log_dir='results/conv', histogram_freq=1, embeddings_freq=1)]

input_review = Input(shape=(None,), name='review')
input_labels = Input(shape=(None,), name='title')

embedded_review = Embedding(max_words, 128, input_length=max_len_review)(input_review)
embedded_labels = Embedding(max_words, 128, input_length=max_len_title)(input_labels)

lstm_review = LSTM(64, name='lstm_review', dropout=0.2)(embedded_review)
lstm_labels = LSTM(32, name='lstm_labels', dropout=0.2)(embedded_labels)

concat = concatenate([lstm_review, lstm_labels], axis=-1, name='concat')
ans = Dense(max_words, activation='softmax', name='out')(concat)

model_generator = Model([input_review, input_labels], ans)

model_generator.compile(optimizer=RMSprop(lr=0.001), loss='categorical_crossentropy', metrics='accuracy')

plot_model(model_generator, to_file='model_generator.png')



NameError: name 'LSTM' is not defined

In [None]:
history_generator = model_generator.fit({'review': train_review_data,
                                         'title': train_title_data},
                                        train_labels,
                                        validation_split=0.1,
                                        epochs=1,
                                        batch_size=128)

print(train_review_data.shape)
print(model_generator.evaluate({'review': test_review_data, 'title': test_title_data}, test_labels))
                            
acc = history_generator.history['accuracy']
val_acc = history_generator.history['val_accuracy']


plt.plot(range(1, len(acc)+1), acc, 'rx', label='Train acc')
plt.plot(range(1, len(acc)+1), val_acc, 'b', label='Val acc')
plt.title('Train & val accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()                 
model_simple.save_weights("conv.h5")   
