In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tensorflow import keras
from tensorflow.keras import layers, Sequential, Input
from tensorflow.keras.utils import to_categorical

import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Rebinds `stopwords`: until this line the name is the NLTK corpus reader
# imported just above; from here on it is a plain set of English stop words
# (used for `in` checks when cleaning reviews).
stopwords = set(stopwords.words('english'))
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
import re

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the IMDB review CSV (first row is the header), then report its size
# and the class balance before previewing the first rows.
df = pd.read_csv(
    '../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv',
    header=0,
)
print(df.shape)
print(df['sentiment'].value_counts())
df.head()

In [None]:
# Keep a reproducible 20% random subsample and renumber the rows from 0.
# NOTE: `df` is overwritten with its own subsample, so re-running this cell
# without re-running the load cell shrinks the data again.
df = (
    df
    .sample(frac=0.2, random_state=1)
    .reset_index(drop=True)
)
print(df.shape)
df.head()

In [None]:
# Class balance of the sentiment column in the current frame.
df['sentiment'].value_counts()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
def update_sentiment(x):
    """Convert a sentiment string to a binary label.

    Args:
        x: Sentiment text; compared case-insensitively against 'positive'.

    Returns:
        1 when the sentiment is 'positive' (in any letter case), otherwise 0.
    """
    is_positive = x.lower() == 'positive'
    return 1 if is_positive else 0

def clean_review(text):
    """Normalise a raw review into a space-joined string of lemmatised tokens.

    Steps: strip HTML markup, tokenise with NLTK, lower-case, drop
    punctuation-only tokens and English stop words, then lemmatise.

    Args:
        text: Raw review text, possibly containing HTML tags.

    Returns:
        The cleaned review as a single space-separated string.
    """
    plain_text = BeautifulSoup(text, "html.parser").get_text()

    lemmatizer = WordNetLemmatizer()
    punctuation_chars = set(string.punctuation)

    cleaned_tokens = []
    for raw_token in nltk.word_tokenize(plain_text):
        token = raw_token.lower()
        # `token in string.punctuation` would be a *substring* test against the
        # punctuation string, which lets multi-character punctuation tokens
        # such as "...", "--", "``" and "''" through. Check every character.
        if all(ch in punctuation_chars for ch in token):
            continue
        if token in stopwords:
            continue
        cleaned_tokens.append(lemmatizer.lemmatize(token))

    return " ".join(cleaned_tokens)

In [None]:
# Step-by-step walkthrough of the cleaning pipeline on one sample review,
# printing the text after each stage.
text = df.iloc[1, 0]
print(text)

# 1) Strip HTML markup.
soup = BeautifulSoup(text, "html.parser")
text = soup.get_text()
print('-'*100)
print(text)
print('-'*100)

# 2) Tokenise, lower-case, drop punctuation tokens and English stop words.
tokens = [token.lower() for token in nltk.word_tokenize(text) if not token.lower() in string.punctuation]
tokens = [token for token in tokens if not token in stopwords]
text = " ".join(tokens)
print(text)
print('-'*100)

# 3) Remove anything enclosed in square brackets.
# Raw string: '\[' and '\]' are not valid string escapes, so the non-raw
# literal triggers an invalid-escape-sequence warning on recent Python
# versions. The compiled pattern is unchanged.
text = re.sub(r'\[[^]]*\]', '', text)
print(text)
print('-'*100)

# 4) Lemmatise each remaining token.
lemmatizer = WordNetLemmatizer()
tokens = nltk.word_tokenize(text)
tokens = [lemmatizer.lemmatize(token) for token in tokens]
text = " ".join(tokens)
print(text)

In [None]:
# Add the numeric target (`label`) and the normalised text (`clean_review`)
# columns; the helper functions are passed directly instead of via a lambda.
df['label'] = df['sentiment'].apply(update_sentiment)
df['clean_review'] = df['review'].apply(clean_review)
df.head()

In [None]:
# Copy the label column into a standalone NumPy array and show how many labels there are.
y = df['label'].to_numpy(copy=True)
print(y.shape[0])

In [None]:
# Cleaned review texts as an array, plus a quick look at the contents and count.
reviews = df['clean_review'].values
print(reviews)
print(len(reviews))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the reviews for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    y,
    test_size=.1,
    random_state=20,
)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

In [None]:
# One-hot encode the binary labels for the 2-unit softmax output trained with
# categorical cross-entropy.
# - `num_classes=2` pins the width so both splits always get two columns,
#   even if a split happened to contain a single class.
# - The `ndim` guard makes the cell safe to re-run: encoding an already
#   one-hot array again would append another axis.
if y_train.ndim == 1:
    y_train = to_categorical(y_train, num_classes=2)
if y_test.ndim == 1:
    y_test = to_categorical(y_test, num_classes=2)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the TF-IDF vocabulary and weights on the training reviews only, then
# project the held-out reviews with the same fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train, X_test = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)
print(X_train.shape, X_test.shape)

# Number of TF-IDF columns; read later as the model's input width.
_, cant_features = X_train.shape
print(cant_features)

In [None]:
# Convert the sparse TF-IDF matrices to dense arrays for the Dense layers.
# - Casting to float32 *before* densifying halves the dense matrix's memory
#   footprint; Keras' default float type is float32, so the model is expected
#   to see the same values either way.
# - The `hasattr` guard keeps the cell re-runnable: after the first run the
#   variables are ndarrays, which have no `.toarray()`.
if hasattr(X_train, "toarray"):
    X_train = X_train.astype(np.float32).toarray()
if hasattr(X_test, "toarray"):
    X_test = X_test.astype(np.float32).toarray()

In [None]:
# Bare expression: the notebook shows the dense training matrix as the cell output.
X_train

In [None]:
def define_model():
    """Build and compile the dense classifier fed with the TF-IDF features.

    Architecture: input of width ``cant_features`` (module-level variable)
    -> Dense(1000, relu) -> Dropout(0.2) -> Dense(64, relu)
    -> Dense(2, softmax).

    Returns:
        A compiled Keras ``Sequential`` model using the Adam optimizer,
        categorical cross-entropy loss and the accuracy metric.
    """
    network = Sequential([
        Input(shape=(cant_features,)),
        layers.Dense(1000, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(64, activation='relu'),
        layers.Dense(2, activation='softmax'),
    ])
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

In [None]:
# Instantiate the compiled network and print its layer-by-layer summary.
model = define_model()
model.summary()

In [None]:
# Train for 3 epochs in mini-batches of 32; the held-out split is passed as
# validation data, so its loss/accuracy are reported after every epoch.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=3,
    validation_data=(X_test, y_test),
    verbose=1,
)