## Adatok letöltése

In [None]:
import pandas as pd
import tensorflow_datasets as tfds
from tqdm import tqdm

# Download the dataset: load the 'train' and 'test' splits of 'imdb_reviews'
# through TensorFlow Datasets, passing shuffle_files=True for both splits.
dataset_train = tfds.load('imdb_reviews', split='train', shuffle_files=True)
dataset_test = tfds.load('imdb_reviews', split='test', shuffle_files=True)

In [None]:
def convert_to_df(dataset):
    """Turn an iterable of examples into a pandas DataFrame.

    Every example must provide 'text' and 'label' entries whose values
    expose ``.numpy()``; the text value is decoded from UTF-8 bytes to str,
    the label value is stored as returned by ``.numpy()``.

    Args:
        dataset: Iterable of examples (wrapped in tqdm for a progress bar).

    Returns:
        A DataFrame with one row per example, holding the 'text' and
        'label' values.
    """
    rows = []
    for example in tqdm(dataset):
        review = example['text'].numpy().decode('utf-8')
        rows.append({'text': review, 'label': example['label'].numpy()})
    return pd.DataFrame(rows)

# Materialize both splits as DataFrames holding the 'text' and 'label' values.
df_train = convert_to_df(dataset_train)
df_test = convert_to_df(dataset_test)

In [None]:
# Plot histograms of the training frame's numeric columns
# (presumably to inspect the label distribution).
df_train.hist()

In [None]:
# Plot histograms of the test frame's numeric columns
# (presumably to inspect the label distribution).
df_test.hist()

In [None]:
def sentiment(value):
    """Translate a numeric label into its sentiment name.

    Args:
        value: Label to translate.

    Returns:
        "positive" when *value* compares equal to 1, otherwise "negative".
    """
    return "positive" if value == 1 else "negative"

# Add a human-readable "sentiment" column derived from the numeric label.
# Series.apply produces the complete column in one step, so no placeholder
# column has to be created before the assignment.
df_train["sentiment"] = df_train["label"].apply(sentiment)
df_test["sentiment"] = df_test["label"].apply(sentiment)

# Preview the first rows of the training frame.
df_train.head()

## Adattisztítás

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK stopword corpus (requested each time this cell runs).
nltk.download('stopwords')

# English stopword list; the word-frequency count further down skips these.
STOPWORDS = stopwords.words("english")

# Preview the first ten stopwords.
STOPWORDS[:10]

In [None]:
from bs4 import BeautifulSoup

def remove_html_tags(text):
    """Strip HTML markup from *text* and return the remaining plain text.

    The text fragments found between tags are joined with a single space so
    that words separated only by markup (for example ``<br />`` line breaks)
    are not glued together into one token.

    Args:
        text: String that may contain HTML tags.

    Returns:
        The text content of the parsed markup, with a space inserted at each
        boundary between text fragments.
    """
    soup = BeautifulSoup(text, 'html.parser')
    # Without a separator, "end.<br />Start" would collapse to "end.Start".
    return soup.get_text(separator=" ")

In [None]:
# Strip HTML markup from every training review, overwriting the 'text' column.
# NOTE(review): only df_train is cleaned in this cell; confirm that df_test
# receives the same cleaning before it is used.
df_train["text"] = df_train["text"].apply(remove_html_tags)

In [None]:
# Fetch the Punkt tokenizer data used by word_tokenize.
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource
# instead -- confirm against the installed NLTK version.
nltk.download('punkt')

# Lower-case every review and split it into tokens. Newlines are replaced by
# a space (not deleted) so the words on either side of a line break remain
# separate tokens instead of being merged into one.
tokenized_reviews = df_train["text"].apply(
    lambda review_text: word_tokenize(review_text.replace("\n", " ").lower())
)

# Preview the token lists of the first reviews.
tokenized_reviews.head()

In [None]:
# Count how often each purely alphabetic, non-stopword token occurs across
# all tokenized reviews.
d = {}

# Build the lookup set once: testing membership against the STOPWORDS list
# would scan the whole list for every single token.
stopword_set = set(STOPWORDS)

for review in tqdm(tokenized_reviews):
    for word in review:
        if word not in stopword_set and word.isalpha():
            d[word] = d.get(word, 0) + 1

In [None]:
# Rank the (word, count) pairs by count, most frequent first.
# Note: this rebinds `d` from a dict to a list of tuples, so re-running this
# cell without first re-running the counting cell fails on `d.items()`.
d = sorted(d.items(), key=lambda item: item[1], reverse=True)
# Preview the ten most frequent words.
d[:10]

In [None]:
# Upper bound on the number of words kept in the vocabulary.
DESIRED_VOCAB_SIZE = 4000

# Keep only the word of each leading (word, count) pair in the ranked list.
VOCAB = [entry[0] for entry in d[:DESIRED_VOCAB_SIZE]]
word_table = pd.DataFrame({"word": VOCAB})
# Preview the first ten vocabulary entries.
word_table.head(10)

## Naive Bayes