### Question 1

In [None]:
import nltk
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import string
import pandas as pd

In [None]:
# Fetch the NLTK resources used below: the movie_reviews corpus, the English
# stop-word list, and the Punkt models that back word_tokenize.
nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' rather than the
# pickled 'punkt' models; fetch both so word_tokenize works on old and new
# versions alike.
nltk.download('punkt_tab')


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Build one (raw_text, category) pair per corpus file, grouped by category
# in the order movie_reviews.categories() reports them.
documents = [
    (movie_reviews.raw(file_id), sentiment)
    for sentiment in movie_reviews.categories()
    for file_id in movie_reviews.fileids(sentiment)
]

In [None]:
# Split the pairs into parallel lists: raw texts and binary targets
# (1 for the 'pos' category, 0 for anything else).
reviews = [text for text, _category in documents]
labels = [int(category == 'pos') for _text, category in documents]

In [None]:
# Set-up for preprocessing; the cleaning itself (lowercasing, punctuation and
# stop-word removal) happens in the loop cell below.
# Stop words are held in a set so the per-token membership test is cheap.
stop_words = set(stopwords.words('english'))
# Holds one cleaned string per entry of `reviews`, in the same order.
preprocessed_reviews = []

In [None]:
# Clean every review: lowercase -> strip punctuation -> tokenize -> drop stop
# words -> re-join with single spaces.
#
# The output list is reset here, inside the same cell as the loop, so that
# re-running this cell rebuilds it instead of appending a second copy of the
# corpus (which would silently misalign it with `labels`).
#
# NOTE(review): punctuation is removed before stop-word filtering, so
# contractions lose their apostrophe first (e.g. "wasnt" shows up in the
# bigram vocabulary). Confirm whether such tokens should be filtered too.

# Translation table that deletes every ASCII punctuation character in one pass.
punctuation_table = str.maketrans('', '', string.punctuation)

preprocessed_reviews = []
for review in reviews:
    # Lowercase, then remove punctuation
    review = review.lower().translate(punctuation_table)

    # Tokenize and remove stop words
    tokens = word_tokenize(review)
    review = ' '.join([word for word in tokens if word not in stop_words])

    preprocessed_reviews.append(review)


In [None]:
# Bag-of-words counts over single tokens (CountVectorizer with its default
# settings). fit_transform learns the vocabulary and returns the
# document-term matrix in one step.
vectorizer_unigrams = CountVectorizer()
X_unigrams = vectorizer_unigrams.fit_transform(preprocessed_reviews)


In [None]:
# Bag-of-words counts over adjacent token pairs only: ngram_range=(2, 2)
# excludes unigrams from the vocabulary.
vectorizer_bigrams = CountVectorizer(ngram_range=(2, 2))
X_bigrams = vectorizer_bigrams.fit_transform(preprocessed_reviews)

In [None]:
# vocabulary_ maps each learned n-gram to its column index in the fitted
# document-term matrix.
vocab_unigrams = vectorizer_unigrams.vocabulary_
vocab_bigrams = vectorizer_bigrams.vocabulary_

In [None]:
# Order the vocabulary terms by their column index so that position i in each
# list names column i of the corresponding matrix.
feature_names_unigrams = sorted(vocab_unigrams, key=vocab_unigrams.get)
feature_names_bigrams = sorted(vocab_bigrams, key=vocab_bigrams.get)


In [None]:
# Wrap the count matrices in DataFrames for inspection.
# The sparse accessor keeps the data sparse; calling .toarray() instead would
# materialise every zero, and the 2000 x 534933 bigram matrix alone needs
# roughly 8.5 GB as dense int64.
unigrams_df = pd.DataFrame.sparse.from_spmatrix(
    X_unigrams, columns=vectorizer_unigrams.get_feature_names_out())

bigrams_df = pd.DataFrame.sparse.from_spmatrix(
    X_bigrams, columns=vectorizer_bigrams.get_feature_names_out())


In [None]:
# A notebook only echoes the last bare expression of a cell, so a bare
# `unigrams_df.shape` in the middle would be silently dropped. Print both
# shapes explicitly so each heading is followed by its value.
print("Unigrams DataFrame:")
print(unigrams_df.shape)

print("\nBigrams DataFrame:")
print(bigrams_df.shape)

Unigrams DataFrame:

Bigrams DataFrame:


(2000, 534933)

In [None]:
# Preview the first rows of the unigram count frame
unigrams_df.head()

Unnamed: 0,00,000,0009f,000acre,000aweek,000foot,000paltry,007,007esque,00s,...,zuko,zukovsky,zulu,zundel,zurgs,zweibel,zwick,zwicks,zwigoffs,zycie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Preview the first rows of the bigram count frame
bigrams_df.head()

Unnamed: 0,00 cable,00 feet,00 like,00 showing,00 strong,00 sunday,00 task,00 wasnt,000 000,000 bail,...,zwick thinks,zwicks 1994,zwicks courage,zwicks favorite,zwicks latest,zwicks siege,zwigoffs brilliant,zwigoffs superb,zycie masterfully,zycie za
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Question 2

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF weights over single tokens, fitted on the same cleaned reviews as the
# count vectorizers above.
tfidf_vectorizer_unigrams = TfidfVectorizer()
X_tfidf_unigrams = tfidf_vectorizer_unigrams.fit_transform(preprocessed_reviews)


In [None]:
# TF-IDF weights over adjacent token pairs only (ngram_range=(2, 2)).
tfidf_vectorizer_bigrams = TfidfVectorizer(ngram_range=(2, 2))
X_tfidf_bigrams = tfidf_vectorizer_bigrams.fit_transform(preprocessed_reviews)


In [None]:
# Term -> column-index maps of the TF-IDF vectorizers.
# NOTE: this rebinds vocab_unigrams / vocab_bigrams, which previously pointed
# at the count vectorizers' vocabularies.
vocab_unigrams = tfidf_vectorizer_unigrams.vocabulary_
vocab_bigrams = tfidf_vectorizer_bigrams.vocabulary_

In [None]:
# Terms ordered by column index, so list position i names matrix column i.
feature_names_unigrams = sorted(vocab_unigrams, key=vocab_unigrams.get)
feature_names_bigrams = sorted(vocab_bigrams, key=vocab_bigrams.get)


In [None]:
# Sparse-backed frame of the unigram TF-IDF weights. Building it straight from
# the sparse matrix avoids allocating a dense copy with .toarray().
# NOTE: rebinds `unigrams_df`, which held the raw counts until this cell runs.
unigrams_df = pd.DataFrame.sparse.from_spmatrix(
    X_tfidf_unigrams, columns=tfidf_vectorizer_unigrams.get_feature_names_out())

In [None]:
# Sparse-backed frame of the bigram TF-IDF weights. The bigram vocabulary has
# hundreds of thousands of columns, so a dense .toarray() copy would need
# several GB of float64; keep the data sparse instead.
# NOTE: rebinds `bigrams_df`, which held the raw counts until this cell runs.
bigrams_df = pd.DataFrame.sparse.from_spmatrix(
    X_tfidf_bigrams, columns=tfidf_vectorizer_bigrams.get_feature_names_out())

In [None]:
# Print a heading, then let the notebook render the first rows of the
# TF-IDF unigram frame as the cell's result.
print("Unigrams DataFrame:")
unigrams_df.head()

Unigrams DataFrame:


Unnamed: 0,00,000,0009f,000acre,000aweek,000foot,000paltry,007,007esque,00s,...,zuko,zukovsky,zulu,zundel,zurgs,zweibel,zwick,zwicks,zwigoffs,zycie
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Print a heading, then let the notebook render the first rows of the
# TF-IDF bigram frame as the cell's result.
print("\nBigrams DataFrame:")
bigrams_df.head()


Bigrams DataFrame:


Unnamed: 0,00 cable,00 feet,00 like,00 showing,00 strong,00 sunday,00 task,00 wasnt,000 000,000 bail,...,zwick thinks,zwicks 1994,zwicks courage,zwicks favorite,zwicks latest,zwicks siege,zwigoffs brilliant,zwigoffs superb,zycie masterfully,zycie za
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Question 3

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Hold out 20% of the TF-IDF unigram rows (and their labels) for testing.
# random_state=42 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_unigrams, labels, test_size=0.2, random_state=42)


In [None]:
# Fit a Multinomial Naive Bayes model (default hyper-parameters) on the
# training split created in the previous cell.
nb_classifier_unigrams = MultinomialNB()
nb_classifier_unigrams.fit(X_train, y_train)


In [None]:
# Predict labels for the held-out rows with the TF-IDF unigram model
y_pred_unigrams = nb_classifier_unigrams.predict(X_test)


In [None]:
# Report overall accuracy followed by the per-class precision/recall/F1 table
# for the TF-IDF unigram model.
accuracy_unigrams = accuracy_score(y_test, y_pred_unigrams)
print("Accuracy (Unigrams - TFIDF):", accuracy_unigrams)
print("Classification Report (Unigrams - TFIDF):",
      classification_report(y_test, y_pred_unigrams),
      sep="\n")


Accuracy (Unigrams - TFIDF): 0.815
Classification Report (Unigrams - TFIDF):
              precision    recall  f1-score   support

           0       0.79      0.85      0.82       199
           1       0.84      0.78      0.81       201

    accuracy                           0.81       400
   macro avg       0.82      0.82      0.81       400
weighted avg       0.82      0.81      0.81       400



In [None]:
# Same 80/20 split with the same seed, now over the TF-IDF bigram matrix.
# NOTE: this rebinds X_train / X_test / y_train / y_test. After this cell has
# run, the unigram fit/predict cells above must not be re-run without first
# re-running the unigram split, or they will see bigram features.
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_bigrams, labels, test_size=0.2, random_state=42)


In [None]:
# Fit a Multinomial Naive Bayes model (default hyper-parameters) on the
# training split created in the previous cell.
nb_classifier_bigrams = MultinomialNB()
nb_classifier_bigrams.fit(X_train, y_train)


In [None]:
# Predict labels for the held-out rows with the TF-IDF bigram model
y_pred_bigrams = nb_classifier_bigrams.predict(X_test)


In [None]:
# Report overall accuracy followed by the per-class precision/recall/F1 table
# for the TF-IDF bigram model.
accuracy_bigrams = accuracy_score(y_test, y_pred_bigrams)
print("Accuracy (Bigrams - TFIDF):", accuracy_bigrams)
print("Classification Report (Bigrams - TFIDF):",
      classification_report(y_test, y_pred_bigrams),
      sep="\n")

Accuracy (Bigrams - TFIDF): 0.805
Classification Report (Bigrams - TFIDF):
              precision    recall  f1-score   support

           0       0.76      0.89      0.82       199
           1       0.87      0.72      0.79       201

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.80       400
weighted avg       0.81      0.81      0.80       400



In [None]:
# 80/20 splits of the raw count (bag-of-words) matrices. Each split gets its
# own variable names, and both calls use the same seed and the same labels.
X_train_unigrams, X_test_unigrams, y_train_unigrams, y_test_unigrams = train_test_split(X_unigrams, labels, test_size=0.2, random_state=42)
X_train_bigrams, X_test_bigrams, y_train_bigrams, y_test_bigrams = train_test_split(X_bigrams, labels, test_size=0.2, random_state=42)


In [None]:
# Fit a Multinomial Naive Bayes model (default hyper-parameters) on the
# unigram count features.
nb_classifier_unigrams_bow = MultinomialNB()
nb_classifier_unigrams_bow.fit(X_train_unigrams, y_train_unigrams)


In [None]:
# Predict labels for the held-out unigram count rows
y_pred_unigrams_bow = nb_classifier_unigrams_bow.predict(X_test_unigrams)


In [None]:
# Report overall accuracy followed by the per-class precision/recall/F1 table
# for the unigram bag-of-words model.
accuracy_unigrams_bow = accuracy_score(y_test_unigrams, y_pred_unigrams_bow)
print("Accuracy (Unigrams - Bag of Words):", accuracy_unigrams_bow)
print("Classification Report (Unigrams - Bag of Words):",
      classification_report(y_test_unigrams, y_pred_unigrams_bow),
      sep="\n")


Accuracy (Unigrams - Bag of Words): 0.8175
Classification Report (Unigrams - Bag of Words):
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       199
           1       0.84      0.79      0.81       201

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400



In [None]:
# Fit a Multinomial Naive Bayes model (default hyper-parameters) on the
# bigram count features.
nb_classifier_bigrams_bow = MultinomialNB()
nb_classifier_bigrams_bow.fit(X_train_bigrams, y_train_bigrams)


In [None]:
# Predict labels for the held-out bigram count rows
y_pred_bigrams_bow = nb_classifier_bigrams_bow.predict(X_test_bigrams)


In [None]:
# Report overall accuracy followed by the per-class precision/recall/F1 table
# for the bigram bag-of-words model.
accuracy_bigrams_bow = accuracy_score(y_test_bigrams, y_pred_bigrams_bow)
print("\nAccuracy (Bigrams - Bag of Words):", accuracy_bigrams_bow)
print("Classification Report (Bigrams - Bag of Words):",
      classification_report(y_test_bigrams, y_pred_bigrams_bow),
      sep="\n")


Accuracy (Bigrams - Bag of Words): 0.74
Classification Report (Bigrams - Bag of Words):
              precision    recall  f1-score   support

           0       0.67      0.92      0.78       199
           1       0.88      0.56      0.68       201

    accuracy                           0.74       400
   macro avg       0.78      0.74      0.73       400
weighted avg       0.78      0.74      0.73       400



### Question 4

In [None]:
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

In [None]:
# Turn each cleaned review string back into a list of word tokens
tokenized_reviews = list(map(word_tokenize, preprocessed_reviews))


In [None]:
# Build the collocation finder from the per-review token lists
# (one document per review).
bigram_collocations = BigramCollocationFinder.from_documents(tokenized_reviews)


In [None]:
# Score every bigram with pointwise mutual information and clamp negative
# values to zero, which is what makes it *positive* PMI: PPMI = max(PMI, 0).
# score_ngrams yields (bigram, score) pairs; the clamp keeps their order.
#
# NOTE(review): no frequency filter is applied, and the recorded top-10 is a
# ten-way tie at the same score, which suggests very rare pairs dominate.
# Consider bigram_collocations.apply_freq_filter(...) if that is not intended.
ppmi_scores = [
    (bigram, max(score, 0.0))
    for bigram, score in bigram_collocations.score_ngrams(BigramAssocMeasures.pmi)
]


In [None]:
# Keep the ten highest-scoring (bigram, score) pairs. The sort is stable, so
# tied scores stay in the order they had in ppmi_scores.
top_10_bigrams_ppmi = sorted(ppmi_scores, key=lambda scored: scored[1], reverse=True)[:10]


In [None]:
# Print a heading, then one line per bigram: its two words followed by the
# score rounded to five decimals.
print("Top 10 Bigram Collocations based on PPMI:")
for words, score in top_10_bigrams_ppmi:
    phrase = ' '.join(words)
    print(phrase, f'(PPMI Score: {score:.5f})')

Top 10 Bigram Collocations based on PPMI:
05425 keywords (PPMI Score: 19.43783)
122 braxtons (PPMI Score: 19.43783)
1272 1305 (PPMI Score: 19.43783)
165 mph (PPMI Score: 19.43783)
1812 overture (PPMI Score: 19.43783)
18foothigh 43footlong (PPMI Score: 19.43783)
1926 novella (PPMI Score: 19.43783)
1972s shaftsbigscore (PPMI Score: 19.43783)
1973s shaftinafrica (PPMI Score: 19.43783)
1hr 40mins (PPMI Score: 19.43783)


### Question 5

In [None]:
from __future__ import print_function
import pyLDAvis


In [None]:
# pyLDAvis adapter for scikit-learn LDA models
import pyLDAvis.lda_model
# Turn on inline notebook rendering so prepared visualisations display in-cell
pyLDAvis.enable_notebook()
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Load the 20 newsgroups posts for three selected categories, with headers,
# footers and quoted replies stripped from each post.
categories = ['sci.med', 'sci.space', 'talk.politics.guns']
newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'), categories=categories)
docs_raw = newsgroups.data
# Number of documents loaded
print(len(docs_raw))

1733


In [None]:
# Show the category names as reported by the loaded dataset
from pprint import pprint
pprint(list(newsgroups.target_names))

['sci.med', 'sci.space', 'talk.politics.guns']


In [None]:
# Build the term-frequency document-term matrix for LDA.
# - token_pattern keeps only purely alphabetic tokens of three or more letters
# - max_df=0.5 drops terms that occur in more than half of the documents
# - min_df=10 drops terms that occur in fewer than ten documents
# - English stop words are removed, and accents are stripped after lowercasing
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                stop_words='english',
                                lowercase=True,
                                token_pattern=r'\b[a-zA-Z]{3,}\b',
                                max_df=0.5,
                                min_df=10)
dtm_tf = tf_vectorizer.fit_transform(docs_raw)
# (documents, vocabulary size)
print(dtm_tf.shape)

(1733, 2589)


In [None]:
# Build a TF-IDF vectorizer with exactly the same settings as tf_vectorizer by
# forwarding its parameters, then fit it on the same documents.
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)
# (documents, vocabulary size)
print(dtm_tfidf.shape)

(1733, 2589)




In [None]:
# Fit a 20-topic LDA model on the raw term-frequency matrix using online
# variational Bayes; random_state pins the result.
lda_tf = LatentDirichletAllocation(
    n_components=20,
    doc_topic_prior=None,
    topic_word_prior=None,
    learning_method='online',
    learning_decay=0.7,
    learning_offset=10.0,
    batch_size=128,
    total_samples=1000000.0,
    max_iter=10,
    max_doc_update_iter=100,
    mean_change_tol=0.001,
    evaluate_every=-1,
    perp_tol=0.1,
    n_jobs=1,
    verbose=0,
    random_state=0,
)
lda_tf.fit(dtm_tf)


In [None]:
# Fit a second 20-topic LDA model with the same hyper-parameters, this time
# on the TF-IDF matrix.
lda_tfidf = LatentDirichletAllocation(
    n_components=20,
    doc_topic_prior=None,
    topic_word_prior=None,
    learning_method='online',
    learning_decay=0.7,
    learning_offset=10.0,
    batch_size=128,
    total_samples=1000000.0,
    max_iter=10,
    max_doc_update_iter=100,
    mean_change_tol=0.001,
    evaluate_every=-1,
    perp_tol=0.1,
    n_jobs=1,
    verbose=0,
    random_state=0,
)
lda_tfidf.fit(dtm_tfidf)

In [None]:
# Prepare the pyLDAvis view of the term-frequency LDA model. The result is
# the cell's last expression, so the notebook displays it.
pyLDAvis.lda_model.prepare(lda_tf, dtm_tf, tf_vectorizer)

In [None]:
# Prepare the pyLDAvis view of the TF-IDF LDA model. The result is the
# cell's last expression, so the notebook displays it.
pyLDAvis.lda_model.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)