In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [91]:
# Read the IMDB reviews CSV into a DataFrame
csv_path = 'IMDB Dataset.csv'
df = pd.read_csv(csv_path)

# Preview the leading rows (head() returns the first 5 by default)
preview = df.head()
print(preview)

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [92]:
# Define the function to clean the text
def clean_text(text):
    """Strip markup, URLs, digits and punctuation from a review string.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML tags such as ``<br />``.

    Returns
    -------
    str
        The text with only word characters left, separated by single spaces
        and with no leading or trailing whitespace. Case is not changed.
    """
    # Replace each HTML tag with a space rather than nothing, so that
    # "great<br />movie" becomes "great movie" instead of "greatmovie"
    text = re.sub(r'<.*?>', ' ', text)

    # Remove URLs; 'http\S+' already matches https links
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove punctuation; '\W' alone would keep underscores, so list '_' too
    text = re.sub(r'[\W_]', ' ', text)

    # Collapse runs of whitespace and trim both ends
    return re.sub(r'\s+', ' ', text).strip()


# Run every raw review through clean_text and store the result in a new column
cleaned_reviews = df['review'].map(clean_text)
df['cleaned_review'] = cleaned_reviews

# Show the leading rows to check the new column
print(df.head())


                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. <br /><br />The...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   

                                      cleaned_review  
0  One of the other reviewers has mentioned that ...  
1  A wonderful little production The filming tech...  
2  I thought this was a wonderful way to spend ti...  
3  Basically there s a family where a little boy ...  
4  Petter Mattei s Love in the Time of Money is a...  


In [93]:
# Lowercase the cleaned text, replacing the column with the lowercased version
lowercased_reviews = df['cleaned_review'].str.lower()
df['cleaned_review'] = lowercased_reviews

# Show the leading rows to check the result
print(df.head())


                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. <br /><br />The...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   

                                      cleaned_review  
0  one of the other reviewers has mentioned that ...  
1  a wonderful little production the filming tech...  
2  i thought this was a wonderful way to spend ti...  
3  basically there s a family where a little boy ...  
4  petter mattei s love in the time of money is a...  


In [94]:
import nltk

# word_tokenize needs the Punkt tokenizer data. Older NLTK releases load the
# 'punkt' package, while recent releases look for 'punkt_tab' instead, so
# request both. A package that is already present is reported as up to date
# rather than downloaded again.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Define the function to tokenize the text
def tokenize_text(text):
    """Return the word tokens that nltk.word_tokenize produces for ``text``."""
    tokens = nltk.word_tokenize(text)
    return tokens

# Split each cleaned, lowercased review into a list of tokens
tokenized_reviews = df['cleaned_review'].map(tokenize_text)
df['tokenized_review'] = tokenized_reviews

# Show the leading rows to check the new column
print(df.head())


[nltk_data] Downloading package punkt to /Users/samir/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Fetch the stop-word corpus before importing the reader that uses it
nltk.download('stopwords')
from nltk.corpus import stopwords

# Collect the English stop words into a set for fast membership checks
stop_words = {*stopwords.words('english')}

# Define the function to remove stop words
def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level ``stop_words`` set.

    Order and duplicates of the remaining tokens are preserved.
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Drop stop words from every token list; this overwrites 'tokenized_review'
filtered_reviews = df['tokenized_review'].map(remove_stopwords)
df['tokenized_review'] = filtered_reviews

# Show the leading rows to check the result
print(df.head())


[nltk_data] Downloading package stopwords to /Users/samir/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. <br /><br />The...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   

                                      cleaned_review  \
0  one of the other reviewers has mentioned that ...   
1  a wonderful little production the filming tech...   
2  i thought this was a wonderful way to spend ti...   
3  basically there s a family where a little boy ...   
4  petter mattei s love in the time of money is a...   

                                    tokenized_review  
0  [one, reviewers, mentioned, watching, oz, epis...  
1  [wonderful, little, production, filming, techn...  
2  [thought, wonderful, way, spend, time, hot, su...  
3  [basically, family, little, boy, jake, thin

In [None]:
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet data used by WordNetLemmatizer. This only needs to happen
# once per machine; later runs report the package as already up to date.
nltk.download('wordnet')

# Create a single lemmatizer instance; lemmatize_words below reuses it for
# every token instead of building a new one per call
lemmatizer = WordNetLemmatizer()

# Define the function to lemmatize words
def lemmatize_words(tokens):
    """Lemmatize each token with the module-level ``lemmatizer``.

    Each token is passed to ``lemmatizer.lemmatize`` on its own, with no
    part-of-speech argument, so the lemmatizer's default applies
    (NOTE(review): presumably noun - confirm against the NLTK docs).
    Returns a new list in the same order as the input.
    """
    return list(map(lemmatizer.lemmatize, tokens))

# Lemmatize every token list and store the result in a new column
lemmatized_reviews = df['tokenized_review'].map(lemmatize_words)
df['lemmatized_review'] = lemmatized_reviews

# Show the leading rows to check the new column
print(df.head())



[nltk_data] Downloading package wordnet to /Users/samir/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. <br /><br />The...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   

                                      cleaned_review  \
0  one of the other reviewers has mentioned that ...   
1  a wonderful little production the filming tech...   
2  i thought this was a wonderful way to spend ti...   
3  basically there s a family where a little boy ...   
4  petter mattei s love in the time of money is a...   

                                    tokenized_review  \
0  [one, reviewers, mentioned, watching, oz, epis...   
1  [wonderful, little, production, filming, techn...   
2  [thought, wonderful, way, spend, time, hot, su...   
3  [basically, family, little, boy, jake, 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

# Hyperparameters shared by the tokenizer, the padding step and the embedding
VOCAB_SIZE = 5000   # number of most frequent words the tokenizer keeps
MAX_LEN = 200       # every review is padded / truncated to this many tokens

# Step 1: Encoding the Labels
# Keep df['sentiment'] untouched: later cells compare it against the strings
# 'positive' / 'negative' and map those strings to integers themselves, which
# breaks if the column has already been replaced by integers here.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(df['sentiment'])

# Step 2: Splitting the Dataset
# Split row positions first so the tokenizer vocabulary below can be learned
# from the training rows only (no information from the held-out rows).
row_positions = np.arange(len(df))
train_idx, test_idx = train_test_split(row_positions, test_size=0.2, random_state=42)

# Step 3: Vectorizing the Text
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(df['lemmatized_review'].iloc[train_idx])
sequences = tokenizer.texts_to_sequences(df['lemmatized_review'])
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN, truncating='post', padding='post')

X_train, X_test = padded_sequences[train_idx], padded_sequences[test_idx]
y_train, y_test = encoded_labels[train_idx], encoded_labels[test_idx]

# Step 4: Building the Neural Network
model = Sequential()
model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=64, input_length=MAX_LEN))
model.add(LSTM(64, dropout=0.1))
model.add(Dense(1, activation='sigmoid'))

# Step 5: Compiling and Training the Model
# NOTE(review): the test split doubles as the validation set here, so the
# evaluation in step 6 is not measured on data the training loop never saw.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=32)

# Step 6: Evaluating the Model
# Model's performance on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("Loss: ", loss)
print("Accuracy: ", accuracy)


In [None]:
# Review Distribution:
# Count the reviews per sentiment once and plot that result; a bare
# value_counts() expression that is not the last line of a cell displays
# nothing, so it is not repeated here.
sentiment_counts = df['sentiment'].value_counts()
ax = sentiment_counts.plot(kind='bar', title='Distribution of Reviews')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Number of reviews')
plt.show()


In [None]:
# Review Lengths
# Length of each cleaned review, measured in characters
df['review_length'] = df['cleaned_review'].map(len)

# Histogram of those lengths on an explicitly created 8x6 figure
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['review_length'], ax=ax)
ax.set_title('Distribution of Review Lengths')
plt.show()


In [None]:
# Word Clouds
from wordcloud import WordCloud

# The sentiment column holds the strings 'positive' / 'negative' as loaded,
# but holds the integers 1 / 0 once a label-encoding step has replaced it.
# Accept both encodings; comparing only against the strings would select no
# rows in the second case and leave the word clouds with no text to draw.
is_positive = df['sentiment'].isin(['positive', 1])
is_negative = df['sentiment'].isin(['negative', 0])
positive_reviews = df[is_positive]
negative_reviews = df[is_negative]

# Join each token list into a sentence, then all sentences into one string
positive_text = ' '.join(positive_reviews['lemmatized_review'].apply(' '.join))
negative_text = ' '.join(negative_reviews['lemmatized_review'].apply(' '.join))

wordcloud_positive = WordCloud(background_color='white').generate(positive_text)
wordcloud_negative = WordCloud(background_color='white').generate(negative_text)

# Draw the two clouds side by side, without axes
fig, (ax_positive, ax_negative) = plt.subplots(1, 2, figsize=(10, 5))

ax_positive.imshow(wordcloud_positive, interpolation='bilinear')
ax_positive.set_title('Positive Reviews')
ax_positive.axis('off')

ax_negative.imshow(wordcloud_negative, interpolation='bilinear')
ax_negative.set_title('Negative Reviews')
ax_negative.axis('off')

plt.show()


In [None]:
# Convert list of lemmatized words back to sentences
df['processed_review'] = df['lemmatized_review'].apply(' '.join)

# Convert the sentiment labels into integers. The mapping also sends 1 and 0
# to themselves, so the cell still works when the column was already encoded
# (by an earlier cell, or by a previous run of this one) instead of turning
# every label into NaN.
sentiment_to_int = {'positive': 1, 'negative': 0, 1: 1, 0: 0}
encoded_sentiment = df['sentiment'].map(sentiment_to_int)

# Values outside the mapping come back as NaN; stop here rather than hand
# NaN labels to a model
if encoded_sentiment.isna().any():
    raise ValueError("df['sentiment'] contains values other than 'positive'/'negative' or 1/0")
df['sentiment'] = encoded_sentiment.astype('int64')

# Split reviews and labels into variables
reviews = df['processed_review'].values
labels = df['sentiment'].values


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from sklearn.model_selection import train_test_split

# Hyperparameters shared by the tokenizer, the padding step and the embedding
VOCAB_SIZE = 5000   # number of most frequent words the tokenizer keeps
MAX_LEN = 200       # every review is padded / truncated to this many tokens

# Split row positions first so the tokenizer vocabulary below can be learned
# from the training rows only (no information from the validation rows)
row_positions = np.arange(len(reviews))
train_idx, val_idx = train_test_split(row_positions, test_size=0.2, random_state=42)

# Tokenize the text
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(reviews[train_idx])
sequences = tokenizer.texts_to_sequences(reviews)
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN, padding="post", truncating="post")

# Split data into training and testing
train_sequences, val_sequences = padded_sequences[train_idx], padded_sequences[val_idx]
train_labels, val_labels = labels[train_idx], labels[val_idx]

# Define the model: embedding -> 1D convolution -> global max pool -> dense head
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=16, input_length=MAX_LEN),
    Conv1D(64, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(
    train_sequences,
    train_labels,
    epochs=3,
    validation_data=(val_sequences, val_labels),
    batch_size=32,
)
