In [261]:
import gzip
import math
import numpy
import random
import sklearn
import string
from collections import defaultdict
from nltk.stem.porter import *
from sklearn import linear_model
from gensim.models import Word2Vec
import dateutil
from scipy.sparse import lil_matrix # To build sparse feature matrices, if you like

In [262]:
answers = {}

In [263]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

# Steam Category

### Question 1
We’ll start by building features to represent common words. Start by removing
punctuation and capitalization, and finding the 1,000 most common words across all reviews (‘text’ field)
in the training set. See the ‘text mining’ lectures for code for this process. Report the 10 most common
words, along with their frequencies, as a list of (frequency, word) tuples.

In [264]:
dataset = []

f = gzip.open("steam_category.json.gz")
for l in f:
    d = eval(l)
    dataset.append(d)
    if len(dataset) >= 20000:
        break
        
f.close()

In [265]:
Ntrain = 10000
Ntest = 10000

dataTrain = dataset[:Ntrain]
dataTest = dataset[Ntrain:Ntrain + Ntest]

In [266]:
sp = set(string.punctuation)

In [267]:
# Count word frequencies
word_counts = Counter()
for d in dataTrain:
    if 'text' in d:
        # Remove punctuation, lowercase, and split into words
        text = ''.join(c for c in d['text'] if c not in sp).lower()
        word_counts.update(text.split())

# Identify the 1,000 most common words and take the top 10
most_common_words = word_counts.most_common(1000)
top_10_words = [(float(freq), word) for word, freq in most_common_words[:10]]


In [268]:
answers['Q1'] = top_10_words

In [269]:
assertFloatList([x[0] for x in answers['Q1']], 10)

In [270]:
answers['Q1']

[(34211.0, 'the'),
 (19392.0, 'and'),
 (18791.0, 'a'),
 (18077.0, 'to'),
 (15043.0, 'game'),
 (14095.0, 'of'),
 (13000.0, 'is'),
 (12735.0, 'you'),
 (12204.0, 'i'),
 (11824.0, 'it')]

### Question 2
Build bag-of-words feature vectors by counting the instances of these 1,000 words in each review. Set the
labels (y) to be the ‘genreID’ column for the training instances. You may use these labels directly with
sklearn’s LogisticRegression model, which will automatically perform multiclass classification. Report
performance (accuracy) on your test set.

In [271]:
NW = 1000 # dictionary size

In [272]:
words = [word for word, _ in most_common_words[:NW]]

In [273]:
from sklearn.feature_extraction.text import CountVectorizer

def preprocess_text(text):
	return ''.join(c for c in text if c not in sp).lower()

# Initialize CountVectorizer with the chosen dictionary and build X
vectorizer = CountVectorizer(vocabulary=words)
X = vectorizer.fit_transform(preprocess_text(entry['text']) for entry in dataset if 'text' in entry)
y = [entry['genreID'] for entry in dataset if 'genreID' in entry]

In [274]:
# y = 

In [275]:
Xtrain = X[:Ntrain]
ytrain = y[:Ntrain]
Xtest = X[Ntrain:]
ytest = y[Ntrain:]

In [276]:
mod = linear_model.LogisticRegression(C=1)
mod.fit(Xtrain, ytrain)  # Train the model

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [277]:
# Predict on the test set
y_pred = mod.predict(Xtest)

# Calculate the accuracy
correct = (y_pred == ytest)

In [278]:
answers['Q2'] = sum(correct) / len(correct)

In [279]:
assertFloat(answers['Q2'])

In [280]:
answers['Q2']

0.6397

### Question 3
What is the inverse document frequency of the words ‘character’, ‘game’, ‘length’, ‘a’, and ‘it’? What
are their tf-idf scores in the first (training) review (using log base 10, unigrams only, following the first
definition of tf-idf given in the slides)? All frequencies etc. should be calculated using the training data
only. Your answer should be a list of five (idf, tfidf) pairs

In [281]:
import numpy as np

# Define the terms for analysis
terms = ['character', 'game', 'length', 'a', 'it']

# Calculate the IDF for each term
def calculate_idf(term, documents):
    doc_count = sum(1 for doc in documents if term in doc)
    return np.log10(len(documents) / (1 + doc_count))

# Prepare training data for IDF calculations
training_documents = [preprocess_text(entry['text']).split() for entry in dataTrain if 'text' in entry]

# Compute IDF for each term
idf_values = {term: calculate_idf(term, training_documents) for term in terms}

# Compute TF-IDF for the first training review
first_review = training_documents[0]
tf_values = Counter(first_review)
tfidf_scores = {term: tf_values[term] * idf_values.get(term, 0) for term in terms}

# Generate the result as (idf, tfidf) pairs
results = [(idf_values[term], tfidf_scores[term]) for term in terms]

In [282]:
answers['Q3'] = results

In [283]:
assertFloatList([x[0] for x in answers['Q3']], 5)
assertFloatList([x[1] for x in answers['Q3']], 5)

In [284]:
results

[(1.4522252946121774, 1.4522252946121774),
 (0.22944252514900496, 0.4588850502980099),
 (2.2365720064370627, 4.473144012874125),
 (0.30469347756819737, 2.437547820545579),
 (0.3765439519300662, 1.1296318557901985)]

### Question 4
Adapt your unigram model to use the tfidf scores of words, rather than a bag-of-words representation.
That is, rather than your features containing the word counts for the 1000 most common unigrams, it
should contain tfidf scores for the 1000 most common unigrams. Report the accuracy of this new model.

In [285]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model
from sklearn.metrics import accuracy_score

In [286]:
# Build X (TF-IDF features) and y (labels)
tfidf_vectorizer = TfidfVectorizer(vocabulary=words, use_idf=True, smooth_idf=True, norm=None)

In [287]:
X = tfidf_vectorizer.fit_transform(preprocess_text(entry['text']) for entry in dataset if 'text' in entry)
y = [entry['genreID'] for entry in dataset if 'genreID' in entry]  # Insert code here

In [288]:
Xtrain = X[:Ntrain]
ytrain = y[:Ntrain]
Xtest = X[Ntrain:]
ytest = y[Ntrain:]

In [289]:
# Train the logistic regression model
mod = linear_model.LogisticRegression(C=1)
mod.fit(Xtrain, ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [290]:
# Predict on the test set
y_pred = mod.predict(Xtest)

# Calculate the accuracy
correct = (y_pred == ytest)

In [291]:
answers['Q4'] = sum(correct) / len(correct)

In [292]:
assertFloat(answers['Q4'])

In [293]:
answers['Q4']

0.6143

### Question 5
Which review in the test set the highest cosine similarity compared to the first review in the training set,
in terms of their tf-idf representations (considering unigrams only). Provide the cosine similarity score
and the reviewID.

In [294]:
# Use the given function and follow its constraints
def Cosine(x1, x2):
    # Compute cosine similarity
    return cosine_similarity(x1, x2)[0][0]

similarities = []  # Initialize list to store similarities

for i in range(Xtest.shape[0]):
    sim = Cosine(Xtrain[0], Xtest[i])  # Calculate similarity with the first training review
    similarities.append((sim, i))  # Store similarity score and index

# Sort similarities in descending order
similarities.sort(reverse=True)  # Insert code: No changes allowed here

# Extract the highest similarity score and the corresponding review ID
highest_similarity_score, highest_similarity_index = similarities[0]
highest_similarity_review_id = dataTest[highest_similarity_index].get('reviewID', None)


In [295]:
answers['Q5'] = (highest_similarity_score, highest_similarity_review_id)

In [296]:
assertFloat(answers['Q5'][0])

In [297]:
answers['Q5']

(0.5221153295731461, 'r85353102')

### Question 6
Try to improve upon the performance of the above classifiers from Questions 2 and 4 by using different
dictionary sizes, or changing the regularization constant C passed to the logistic regression model. Report
the performance of your solution.
Use the first half (10,000) of the corpus for training and the rest for testing (code to read the data is
provided in the stub). Process reviews without capitalization or punctuation (and without using
stemming or removing stopwords).

In [298]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

# Define the terms for analysis
def preprocess_text(text):
    return ''.join(c for c in text if c not in sp).lower()

In [299]:
# Function to evaluate logistic regression model with given parameters
def evaluate_model(dictionary_size, regularization_constant):
    # Define dictionary size and extract top `dictionary_size` words
    words = [word for word, _ in most_common_words[:dictionary_size]]

    # Build TF-IDF features
    tfidf_vectorizer = TfidfVectorizer(vocabulary=words, use_idf=True, smooth_idf=True, norm=None)
    X = tfidf_vectorizer.fit_transform(preprocess_text(entry['text']) for entry in dataset if 'text' in entry)
    y = [entry['genreID'] for entry in dataset if 'genreID' in entry]

    # Split the data into training and testing sets
    Xtrain = X[:Ntrain]
    ytrain = y[:Ntrain]
    Xtest = X[Ntrain:]
    ytest = y[Ntrain:]

    # Train the logistic regression model
    model = LogisticRegression(C=regularization_constant, max_iter=500)
    model.fit(Xtrain, ytrain)

    # Evaluate accuracy
    y_pred = model.predict(Xtest)
    accuracy = accuracy_score(ytest, y_pred)
    return accuracy


In [300]:
# Define hyperparameter ranges
dictionary_sizes = [500, 1000, 1500, 2000]  # Varying dictionary sizes
regularization_constants = [0.1, 1, 10]     # Varying C values

# Collect results
results = []
for size in dictionary_sizes:
    for c in regularization_constants:
        acc = evaluate_model(dictionary_size=size, regularization_constant=c)
        results.append((size, c, acc))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [301]:
# Organize results into a DataFrame
results_df = pd.DataFrame(results, columns=["Dictionary Size", "C (Regularization)", "Accuracy"])

# Print the best result for reference
best_result = results_df.loc[results_df["Accuracy"].idxmax()]

In [302]:
answers['Q6'] = best_result['Accuracy']

In [303]:
assertFloat(answers['Q6'])

In [304]:
answers['Q6']

0.6246

# GoodReads Young Adult Reviews 

### Question 7
Using the word2vec library in gensim, fit an item2vec model, treating each ‘sentence’ as a temporallyordered2
list of items per user. Use parameters min count=1, size=5, window=3, sg=1.
3 Report the 5 most similar items to the book from the first review along with their similarity scores (your answer can
be the output of the similar by word function).

In [305]:
import gzip
import dateutil.parser
from gensim.models import Word2Vec

In [306]:
dataset = []

f = gzip.open("young_adult_20000.json.gz")
for l in f:
    d = eval(l)
    d['datetime'] = dateutil.parser.parse(d['date_added'])
    dataset.append(d)
    if len(dataset) >= 20000:
        break

In [307]:
# Process dataset into review lists
user_reviews = {}
for entry in dataset:
    user_id = entry['user_id']
    book_id = entry['book_id']
    datetime = entry['datetime']

    if user_id not in user_reviews:
        user_reviews[user_id] = []
    user_reviews[user_id].append((book_id, datetime))

# Create temporally sorted lists of books per user
reviewLists = []
for user_id, books in user_reviews.items():
    sorted_books = sorted(books, key=lambda x: x[1])  # Sort books by datetime
    reviewLists.append([book_id for book_id, _ in sorted_books])

In [308]:
model5 = Word2Vec(reviewLists,
                  min_count=1, # Words/items with fewer instances are discarded
                  vector_size=5, # Model dimensionality
                  window=3, # Window size
                  sg=1) # Skip-gram model

In [309]:
# Find the 5 most similar items to the first book in the dataset
first_book_id = dataset[0]['book_id']
res = model5.wv.most_similar(first_book_id, topn=5)

In [310]:
answers['Q7'] = res[:5]

In [311]:
assertFloatList([x[1] for x in answers['Q7']], 5)

In [312]:
answers['Q7']

[('841237', 0.9962388873100281),
 ('34658929', 0.992466390132904),
 ('10555316', 0.9904285669326782),
 ('16002011', 0.9890600442886353),
 ('13449407', 0.988563597202301)]

In [313]:
f = open("answers_hw4.txt", 'w')
f.write(str(answers) + '\n')
f.close()