In [2]:
!pip install scikit-learn

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/b2/3b/47b5eaee01ef2b5a80ba3f7f6ecf79587cb458690857d4777bfd77371c6f/scikit_learn-1.7.1-cp311-cp311-win_amd64.whl.metadata
  Downloading scikit_learn-1.7.1-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Obtaining dependency information for joblib>=1.2.0 from https://files.pythonhosted.org/packages/7d/4f/1195bbac8e0c2acc5f740661631d8d750dc38d4a32b23ee5df3cde6f4e0d/joblib-1.5.1-py3-none-any.whl.metadata
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Obtaining dependency information for threadpoolctl>=3.1.0 from https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl.metadata
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.7.1-c


[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: C:\Users\raghu\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd
import gzip
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [11]:
import pandas as pd

# --- Parameters ---
file_path = 'data/Electronics_5.json'
chunk_size = 100000  # Process 100,000 lines at a time
sample_size_per_chunk = 260 # Sample this many reviews from each chunk

list_of_sampled_chunks = []

print("Loading data in chunks to conserve memory...")
# Create an iterator that reads the file in chunks instead of all at once
try:
    with pd.read_json(file_path, lines=True, chunksize=chunk_size) as json_reader:
        for chunk in json_reader:
            # Keep only the columns we need
            chunk_filtered = chunk[['reviewerID', 'asin', 'overall', 'reviewText']]

            # Take a random sample from this chunk to build our final dataset
            list_of_sampled_chunks.append(chunk_filtered.sample(n=sample_size_per_chunk, random_state=42))

except ValueError:
    # This handles if the last chunk is smaller than the sample size
    print("Reached end of file.")

print("Concatenating sampled chunks into a final DataFrame...")
# Combine all the small, sampled chunks into one
df_sample = pd.concat(list_of_sampled_chunks, ignore_index=True)

# Rename columns for consistency with the rest of the project
df_sample.rename(columns={'reviewerID': 'user_id', 'asin': 'item_id', 'overall': 'rating'}, inplace=True)

print(f"Data loaded successfully. Final sample size: {len(df_sample)}")
df_sample.head()
# Add this line at the end of Cell 2
df_sample.to_pickle('saved_models/df_sample.pkl')

Loading data in chunks to conserve memory...
Concatenating sampled chunks into a final DataFrame...
Data loaded successfully. Final sample size: 17680


In [12]:
# Create a TF-IDF matrix directly from the raw review text
# It will automatically handle tokenization and remove common English stop words.
print("Creating TF-IDF matrix from review text...")
tfidf = TfidfVectorizer(
    max_features=5000, 
    stop_words='english',
    ngram_range=(1, 2) # Also consider two-word phrases like "battery life"
)

# Handle potential empty reviews by filling them with an empty string
df_sample['reviewText'].fillna('', inplace=True)
item_features = tfidf.fit_transform(df_sample['reviewText'])

# Save the TF-IDF vectorizer and the item features matrix
pickle.dump(tfidf, open('saved_models/tfidf_vectorizer.pkl', 'wb'))
pickle.dump(item_features, open('saved_models/item_features.pkl', 'wb'))

# Also save the mapping from dataframe index to item_id
item_id_map = df_sample['item_id']
item_id_map.to_pickle('saved_models/item_id_map.pkl')

print("TF-IDF models created and saved successfully.")

Creating TF-IDF matrix from review text...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_sample['reviewText'].fillna('', inplace=True)


TF-IDF models created and saved successfully.


In [13]:
from scipy.sparse import csr_matrix
import numpy as np
import pickle

print("Creating a sparse user-item matrix to conserve memory...")

# Step 1: Create integer mappings for users and items for the sparse matrix
# This is essential because sparse matrices require integer indices.
user_c = pd.Categorical(df_sample['user_id'])
item_c = pd.Categorical(df_sample['item_id'])

# The 'codes' attribute gives us the integer index for each user/item
user_codes = user_c.codes
item_codes = item_c.codes

# Step 2: Create the sparse matrix
# csr_matrix((data, (row_indices, col_indices)), shape)
user_item_sparse_matrix = csr_matrix(
    (df_sample['rating'], (user_codes, item_codes)),
    shape=(len(user_c.categories), len(item_c.categories))
)

# The SVD model can now run on this memory-efficient sparse matrix
print("Training TruncatedSVD model...")
svd = TruncatedSVD(n_components=20, random_state=42)
matrix_decomposed = svd.fit_transform(user_item_sparse_matrix)

# Get the correlation matrix of the decomposed user-feature matrix
corr_matrix = np.corrcoef(matrix_decomposed)

# We need to save the maps from integer codes back to original IDs
user_id_map = {code: user for code, user in enumerate(user_c.categories)}

# Save the necessary objects for the API
pickle.dump(corr_matrix, open('saved_models/corr_matrix.pkl', 'wb'))
pickle.dump(user_id_map, open('saved_models/user_id_map_cf.pkl', 'wb'))

print("Scikit-learn based collaborative filtering models saved.")

Creating a sparse user-item matrix to conserve memory...
Training TruncatedSVD model...
Scikit-learn based collaborative filtering models saved.
