In [None]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re, nltk
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from gensim.models import Doc2Vec
from yellowbrick.cluster import KElbowVisualizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
%matplotlib inline

Only the `review` column of the dataset is kept, because the goal is to build an unsupervised clustering model from the review text alone; the `sentiment` column is dropped and not used.

In [None]:
# Read the IMDB reviews CSV and discard the `sentiment` column in a single
# chained expression, leaving only the review text for clustering.
df = (
    pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
    .drop(columns=['sentiment'])
)
df.head(5)

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts) as a sanity
# check after dropping the `sentiment` column.
df.info()

In [None]:
# Removing html tags from documents
def cleanText(text):
    """Return ``text`` with HTML markup stripped and separators/links normalised.

    Steps, in order:
      1. Parse with BeautifulSoup (lxml parser) and keep only the text content.
      2. Replace every literal ``|||`` with a single space.
      3. Replace every ``http...`` run (up to the next whitespace) with the
         placeholder ``<URL>``.

    Args:
        text: Raw review string, possibly containing HTML tags.

    Returns:
        The cleaned string.
    """
    plain = BeautifulSoup(text, "lxml").get_text()
    without_pipes = re.sub(r'\|\|\|', r' ', plain)
    return re.sub(r'http\S+', r'<URL>', without_pipes)

# Run the HTML/URL clean-up over every review, element by element.
df['review'] = df['review'].map(cleanText)

In [None]:
# Converting text to lowercase (vectorised string accessor; every value is a
# str at this point because cleanText returns a string for each row).
df['review'] = df['review'].str.lower()

In [None]:
# Removing stopwords
# `english_stopwords` stays a list (as returned by NLTK) so any other cell that
# reads it keeps working; membership tests go through a frozenset so each
# lookup is O(1) instead of scanning the whole list for every token.
english_stopwords = stopwords.words("english")
stopword_lookup = frozenset(english_stopwords)
df['review'] = df['review'].apply(
    lambda review: " ".join(
        token for token in review.split() if token not in stopword_lookup
    )
)

In [None]:
# Removing non-English words and words shorter than 3 characters.
# A token survives only if it is longer than two characters and appears in the
# NLTK `words` corpus vocabulary.
words = set(nltk.corpus.words.words())
df['review'] = df['review'].apply(
    lambda doc: " ".join(
        tok
        for tok in nltk.wordpunct_tokenize(doc)
        if len(tok) > 2 and tok in words
    )
)

In [None]:
# Transforming each word to its lemma using WordNet, with the lemmatizer's
# default part-of-speech setting.
lm = WordNetLemmatizer()
df['review'] = df['review'].apply(
    lambda doc: ' '.join(map(lm.lemmatize, doc.split()))
)

In [None]:
# Replacing everything other than word characters and whitespace with a space.
# `regex=True` must be passed explicitly: from pandas 2.0 onwards
# `Series.str.replace` treats the pattern as a literal string by default, which
# would make this step a silent no-op. The raw string avoids invalid-escape
# warnings for `\w` and `\s`.
df['review'] = df['review'].str.replace(r'[^\w\s]', ' ', regex=True)

In [None]:
# Preparing data for doc2vec training
# Each review becomes a TaggedDocument whose `words` is a list of tokens and
# whose single tag is the review's positional index. The review must be split
# into tokens first: passing the raw string would make gensim iterate over it
# character by character and learn a vocabulary of single letters.
LabeledSentence = gensim.models.doc2vec.TaggedDocument
all_content_train = [
    LabeledSentence(review.split(), [doc_id])
    for doc_id, review in enumerate(df['review'].values)
]
# `j` is kept as the processed-document count, as before.
j = len(all_content_train)
print('Number of texts processed: ', j)

In [None]:
# Training doc2vec model
# The vocabulary is built and the model trained in explicit, separate steps so
# the corpus is trained exactly once, with the learning rate decaying from
# `alpha` to `min_alpha`. Passing the corpus to the constructor would already
# run a full training pass, and a follow-up `train()` call with a negative
# `end_alpha` would drive the learning rate below zero.
d2v_model = Doc2Vec(
    vector_size=100,
    window=10,
    min_count=500,   # ignore words seen fewer than 500 times in the corpus
    workers=7,
    dm=1,            # distributed-memory (PV-DM) training
    alpha=0.025,
    min_alpha=0.001,
    epochs=10,
)
d2v_model.build_vocab(all_content_train)
d2v_model.train(
    all_content_train,
    total_examples=d2v_model.corpus_count,
    epochs=d2v_model.epochs,
)

In [None]:
# Determining the number of clusters with the elbow method:
# fit KMeans for k = 1..10 on the document vectors and record the
# within-cluster sum of squares (inertia) of each fit.
doc_vectors = d2v_model.docvecs.vectors_docs
candidate_ks = range(1, 11)
wcss = []
for k in candidate_ks:
    kmeans = KMeans(
        n_clusters=k,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=0,
    ).fit(doc_vectors)
    wcss.append(kmeans.inertia_)

plt.plot(candidate_ks, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
# Training the final KMeans model with 3 clusters on the document vectors.
kmeans_model = KMeans(
    n_clusters=3,
    init='k-means++',
    max_iter=100,
    n_init=10,
    random_state=0,
)
# `fit` returns the estimator itself, so `X` refers to the same fitted model
# as `kmeans_model`.
X = kmeans_model.fit(d2v_model.docvecs.vectors_docs)

In [None]:
# Plotting the clusters
# The document vectors are projected to 2-D with PCA and coloured by the
# cluster id assigned by the already-fitted `kmeans_model`; centroids are
# projected with the same PCA and drawn as black triangles.
doc_vectors = d2v_model.docvecs.vectors_docs
labels = kmeans_model.labels_.tolist()
# Reuse the labels from the existing fit instead of refitting the model with
# `fit_predict`. `l` is kept in case other cells read it.
l = kmeans_model.labels_
pca = PCA(n_components=2).fit(doc_vectors)
datapoint = pca.transform(doc_vectors)

plt.figure()  # must be called; a bare `plt.figure` creates no figure
label1 = ['#FFFF00', '#008000', '#0000FF', '#800080']
color = [label1[i] for i in labels]
plt.scatter(datapoint[:, 0], datapoint[:, 1], c=color)
centroids = kmeans_model.cluster_centers_
centroidpoint = pca.transform(centroids)
plt.scatter(centroidpoint[:, 0], centroidpoint[:, 1], marker='^', s=150, c='#000000')
plt.title('KMeans clusters of document vectors (PCA projection)')
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.show()

In [None]:
# Saving the model and loading
# Context managers ensure both file handles are flushed and closed.
with open('model_v1', 'wb') as model_file:
    pickle.dump(kmeans_model, model_file)

# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook or another trusted source.
with open('model_v1', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Attach the cluster id of each review from the reloaded KMeans model.
# Relies on `df` still having one row per training document, in the same order
# in which the documents were tagged for Doc2Vec.
df['label'] = loaded_model.labels_

In [None]:
# Plotting the distribution: bar chart of the number of reviews per cluster.
df['label'].value_counts().plot.bar(figsize=(15, 8))

In [None]:
# Plotting one cluster: word cloud of all reviews assigned to cluster 0.
label = df[df['label'] == 0]
print(label.shape)
# Concatenate every review of the cluster, each preceded by a single space.
text = ''.join(' ' + review for review in label['review'])

# Create and generate a word cloud image:
wordcloud = WordCloud(
    max_font_size=50,
    max_words=100,
    background_color="white",
).generate(text)

# Display the generated image:
plt.figure(figsize=(15, 15))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()