In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
import numpy
import string
import random
from sklearn import linear_model
import numpy as np
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, mean_squared_error

from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler


In [2]:
def readGz(path):
  """Lazily yield one evaluated record per line of a gzip-compressed text file.

  Args:
    path: Path of the gzip file; it is opened in text mode ('rt').

  Yields:
    The result of evaluating each line as a Python expression, in file order.

  SECURITY: every line is passed to ``eval``, which executes arbitrary code.
  Only read files you trust. If the lines are plain literals (TODO confirm for
  the data files used here), ``ast.literal_eval`` is a safer replacement.
  """
  # The context manager closes the handle as soon as the generator is
  # exhausted or closed, instead of leaving it to garbage collection.
  with gzip.open(path, 'rt') as f:
    for l in f:
      yield eval(l)

def readCSV(path):
  """Lazily yield the data rows of a gzip-compressed, comma-separated file.

  The first line is treated as a header and skipped. Every following line is
  stripped of surrounding whitespace and split on ',' (plain string split:
  quoted fields containing commas are NOT handled).

  Args:
    path: Path of the gzip file; it is opened in text mode ('rt').

  Yields:
    A list of string fields for each data line, in file order.
  """
  # The context manager guarantees the handle is closed once the generator is
  # exhausted or closed, instead of leaving it to garbage collection.
  with gzip.open(path, 'rt') as f:
    f.readline()  # discard the header line
    for l in f:
      yield l.strip().split(',')

*Part 2: Category Prediction*

In [3]:
# Genre names in category-id order: position i in this list is category id i.
genres = [
  "children",
  "comics_graphic",
  "fantasy_paranormal",
  "mystery_thriller_crime",
  "young_adult",
]

# Genre name -> integer category id, derived from the ordering above.
catDict = {name: idx for idx, name in enumerate(genres)}

In [4]:
# Materialise every record of the training file in memory; the trailing
# expression displays how many records were read.
train = [record for record in readGz("train_Category.json.gz")]
len(train)

100000

In [5]:
# Split before computing any statistics: 80% of the records are used for
# fitting, 20% are held out for evaluation. The seed is fixed so the split is
# reproducible.
train_data, test_data = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Character length of the longest review text, measured on the training split
# ONLY so that nothing from the held-out split leaks into the normalisation.
max_len = max(map(len, (d['review_text'] for d in train_data)))

In [16]:
# Step 1: extract features
def _as_column(values):
    """Return `values` as an (n, 1) NumPy column vector."""
    return np.array(values).reshape(-1, 1)


def _raw_features(reviews):
    """Collect text, numeric, interaction and label features for `reviews`.

    Review length is the character count of 'review_text' divided by the
    module-level `max_len`.

    Returns an 8-tuple:
        (texts, ratings, votes, length,
         rating*votes, votes*length, length*rating, genres)
    where texts/genres are lists and the rest are (n, 1) arrays.
    """
    texts = [r['review_text'] for r in reviews]
    ratings = _as_column([r['rating'] for r in reviews])
    votes = _as_column([r['n_votes'] for r in reviews])
    length = _as_column([len(r['review_text']) / max_len for r in reviews])
    genres = [r['genre'] for r in reviews]
    return (
        texts, ratings, votes, length,
        ratings * votes,     # pairwise interaction terms
        votes * length,
        length * ratings,
        genres,
    )


(train_texts, train_ratings, train_votes, train_length,
 train_rating_votes, train_votes_length, train_length_rating,
 train_genres) = _raw_features(train_data)

(test_texts, test_ratings, test_votes, test_length,
 test_rating_votes, test_votes_length, test_length_rating,
 test_genres) = _raw_features(test_data)

# Step 2: build text features (TF-IDF); the vectorizer is fitted on the
# training texts only and then re-used, unchanged, on the held-out texts.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),       # unigrams only
    max_df=0.5,               # drop terms present in more than half of the documents
    min_df=1,                 # 1 keeps every term, so no rare-term filtering happens
    max_features=100000,      # upper bound on vocabulary size
)

train_X_text = tfidf.fit_transform(train_texts)
test_X_text = tfidf.transform(test_texts)

In [17]:
# No feature-selection step is applied: the "selected" matrices are simply the
# TF-IDF matrices under another name.
X_train_sel, X_test_sel = train_X_text, test_X_text

In [9]:
# Stack the six numeric columns side by side, one row per review. Column order
# (rating, votes, length, rating*votes, votes*length, length*rating) must be
# identical for both splits.
train_num = np.concatenate(
    (
        train_ratings,
        train_votes,
        train_length,
        train_rating_votes,
        train_votes_length,
        train_length_rating,
    ),
    axis=1,
)

# Same layout for the held-out split.
test_num = np.concatenate(
    (
        test_ratings,
        test_votes,
        test_length,
        test_rating_votes,
        test_votes_length,
        test_length_rating,
    ),
    axis=1,
)


In [10]:
# Standardise the numeric columns. The scaler is fitted on the training rows
# only and then applied, unchanged, to the held-out rows.
scaler = StandardScaler().fit(train_num)

train_num_scaled = scaler.transform(train_num)
test_num_scaled = scaler.transform(test_num)

In [11]:
# Wrap the scaled numeric blocks as CSR matrices so they can be stacked next to
# the sparse TF-IDF features.
train_num_sparse, test_num_sparse = (
    csr_matrix(block) for block in (train_num_scaled, test_num_scaled)
)

In [18]:
# Targets are the raw genre strings of each split.
y_train, y_test = train_genres, test_genres

# Final design matrices: text features followed by the numeric columns.
X_train = hstack((X_train_sel, train_num_sparse))
X_test = hstack((X_test_sel, test_num_sparse))

In [13]:
# Map genre strings to integer class ids; the mapping is learned from the
# training labels and re-used for the held-out labels.
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)


In [36]:
# Logistic regression over the stacked text + numeric features (C=2).
# `multi_class` is deliberately not passed: the argument is deprecated in
# recent scikit-learn releases, and with the lbfgs solver a multinomial loss is
# already what is used when there are more than two classes.
clf = linear_model.LogisticRegression(
    C=2,
    solver='lbfgs',
    max_iter=1000,
)
clf.fit(X_train, y_train_enc)

# Per-class probabilities -> most likely encoded class -> genre string.
test_preds = clf.predict_proba(X_test)
test_pred_labels = le.inverse_transform(np.argmax(test_preds, axis=1))

# Accuracy on the held-out split, compared against the raw genre strings.
test_acc = accuracy_score(y_test, test_pred_labels)




In [37]:
# Held-out accuracy of the C=2 model fitted in the previous cell.
print(test_acc)

0.7714


In [38]:
# Same model with a weaker regularisation setting (C=2.5).
# `multi_class` is deliberately not passed: the argument is deprecated in
# recent scikit-learn releases, and with the lbfgs solver a multinomial loss is
# already what is used when there are more than two classes.
clf2 = linear_model.LogisticRegression(
    C=2.5,
    solver='lbfgs',
    max_iter=1000,
)
clf2.fit(X_train, y_train_enc)

# Per-class probabilities -> most likely encoded class -> genre string.
test_preds2 = clf2.predict_proba(X_test)
test_pred_labels2 = le.inverse_transform(np.argmax(test_preds2, axis=1))

# Accuracy on the held-out split.
test_acc2 = accuracy_score(y_test, test_pred_labels2)
print(test_acc2)



0.77245


In [39]:
# Same model with C=2.8.
# `multi_class` is deliberately not passed: the argument is deprecated in
# recent scikit-learn releases, and with the lbfgs solver a multinomial loss is
# already what is used when there are more than two classes.
clf3 = linear_model.LogisticRegression(
    C=2.8,
    solver='lbfgs',
    max_iter=1000,
)
clf3.fit(X_train, y_train_enc)

# Per-class probabilities -> most likely encoded class -> genre string.
test_preds3 = clf3.predict_proba(X_test)
test_pred_labels3 = le.inverse_transform(np.argmax(test_preds3, axis=1))

# Accuracy on the held-out split.
test_acc3 = accuracy_score(y_test, test_pred_labels3)
print(test_acc3)



0.77265


In [40]:
# Same model with C=3.
# `multi_class` is deliberately not passed: the argument is deprecated in
# recent scikit-learn releases, and with the lbfgs solver a multinomial loss is
# already what is used when there are more than two classes.
clf4 = linear_model.LogisticRegression(
    C=3,
    solver='lbfgs',
    max_iter=1000,
)
clf4.fit(X_train, y_train_enc)

# Per-class probabilities -> most likely encoded class -> genre string.
test_preds4 = clf4.predict_proba(X_test)
test_pred_labels4 = le.inverse_transform(np.argmax(test_preds4, axis=1))

# Accuracy on the held-out split.
test_acc4 = accuracy_score(y_test, test_pred_labels4)
print(test_acc4)



0.77305


In [19]:
# Same model with C=3.3.
# `multi_class` is deliberately not passed: the argument is deprecated in
# recent scikit-learn releases, and with the lbfgs solver a multinomial loss is
# already what is used when there are more than two classes.
# NOTE(review): C is being compared across cells on the same held-out split
# that reports the accuracy, so the printed score is an optimistic estimate.
clf5 = linear_model.LogisticRegression(
    C=3.3,
    solver='lbfgs',
    max_iter=1000,
)
clf5.fit(X_train, y_train_enc)

# Per-class probabilities -> most likely encoded class -> genre string.
test_preds5 = clf5.predict_proba(X_test)
test_pred_labels5 = le.inverse_transform(np.argmax(test_preds5, axis=1))

# Accuracy on the held-out split.
test_acc5 = accuracy_score(y_test, test_pred_labels5)
print(test_acc5)



0.7733


In [20]:
# ---------------------------------------------------
# 1. Read PREDICTION dataset
# ---------------------------------------------------
prediction_data = [record for record in readGz("test_Category.json.gz")]

# ---------------------------------------------------
# 2. Extract features (re-using TRAIN parameters such as max_len)
# ---------------------------------------------------
pred_texts = [review['review_text'] for review in prediction_data]
pred_ratings = np.array([review['rating'] for review in prediction_data]).reshape(-1, 1)
pred_votes = np.array([review['n_votes'] for review in prediction_data]).reshape(-1, 1)

# review length in characters, normalised by the TRAIN max_len
pred_length = np.array([len(text) / max_len for text in pred_texts]).reshape(-1, 1)

# pairwise interaction features
pred_rating_votes = pred_ratings * pred_votes
pred_votes_length = pred_votes * pred_length
pred_length_rating = pred_length * pred_ratings

# ---------------------------------------------------
# 3. TF-IDF transform with the already-fitted vectorizer (DO NOT FIT!)
# ---------------------------------------------------
pred_X_text = tfidf.transform(pred_texts)

# ---------------------------------------------------
# 4. No feature selection is applied: the TF-IDF matrix is used as is
# ---------------------------------------------------
X_pred_sel = pred_X_text

# ---------------------------------------------------
# 5. Numeric feature scaling with the already-fitted scaler (DO NOT FIT!)
# ---------------------------------------------------
pred_num = np.hstack((
    pred_ratings,
    pred_votes,
    pred_length,
    pred_rating_votes,
    pred_votes_length,
    pred_length_rating,
))

pred_num_scaled = scaler.transform(pred_num)
pred_num_sparse = csr_matrix(pred_num_scaled)

# ---------------------------------------------------
# 6. Final prediction feature matrix (text features, then numeric columns)
# ---------------------------------------------------
X_pred_final = hstack((X_pred_sel, pred_num_sparse))

# ---------------------------------------------------
# 7. Predict
# ---------------------------------------------------
pred_preds = clf5.predict_proba(X_pred_final)

# most probable class index for every row
pred_labels_enc = np.argmax(pred_preds, axis=1)

# convert index → genre string
pred_labels = le.inverse_transform(pred_labels_enc)

# ---------------------------------------------------
# 8. Write output file: one "userID,reviewID,prediction" row per review
# ---------------------------------------------------
with open("predictions_Category.csv", "w", encoding="utf-8") as out_file:
    out_file.write("userID,reviewID,prediction\n")
    # genre string -> integer category id, looked up lazily row by row
    category_ids = (catDict[genre] for genre in pred_labels)
    for d, cat_id in zip(prediction_data, category_ids):
        out_file.write(f"{d['user_id']},{d['review_id']},{cat_id}\n")