In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
from sklearn import linear_model
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
import warnings
# Install a blanket "ignore" filter: every warning raised after this cell runs is hidden.
# NOTE(review): this also hides deprecation and convergence warnings from the
# libraries used below; consider restricting the filter to specific categories.
warnings.filterwarnings("ignore")

In [3]:
def readGz(path):
    """Lazily yield one Python object per line of a gzip-compressed text file.

    Args:
        path: Location of the gzip file; it is opened in text mode.

    Yields:
        The result of evaluating each line as a Python expression.

    The file handle is closed as soon as the generator is exhausted or closed.
    """
    with gzip.open(path, 'rt') as f:
        for l in f:
            # SECURITY: eval() executes whatever expression the line contains.
            # Only use this on trusted files; for plain literals,
            # ast.literal_eval would be the safe replacement.
            yield eval(l)

def readCSV(path):
    """Lazily yield (user, book, rating) tuples from a gzip-compressed CSV file.

    The first line is treated as a header and skipped. Every following
    non-blank line must hold exactly three comma-separated fields; the third
    is converted to int.

    Args:
        path: Location of the gzip file; it is opened in text mode.

    Yields:
        Tuples ``(user, book, rating)`` with ``rating`` as an int.

    Raises:
        ValueError: If a line does not have exactly three fields or the
            rating is not an integer.
    """
    with gzip.open(path, 'rt') as f:
        f.readline()  # discard the header row
        for l in f:
            l = l.strip()
            if not l:
                # Tolerate blank/trailing lines instead of failing on unpacking.
                continue
            u, b, r = l.split(',')
            yield u, b, int(r)

In [4]:
# Materialise every (user, book, rating) row of the interactions file in file order.
allRatings = list(readCSV("train_Interactions.csv.gz"))

In [5]:
# First 190000 interactions are used for training, the remainder for validation.
ratingsTrain, ratingsValid = allRatings[:190000], allRatings[190000:]

# Lookup tables built from the training split only.
ratingsPerUser = defaultdict(list)  # user -> [(book, rating), ...]
bookPerUser = defaultdict(list)     # user -> [book, ...]
ratingsPerItem = defaultdict(list)  # book -> [(user, rating), ...]
userPerBook = defaultdict(list)     # book -> [user, ...]

for user, book, rating in ratingsTrain:
    ratingsPerUser[user].append((book, rating))
    bookPerUser[user].append(book)
    ratingsPerItem[book].append((user, rating))
    userPerBook[book].append(user)



In [9]:
# Distinct book ids seen anywhere in the data, as a list so random.choice can index it.
allbooks = list({book for _, book, _ in allRatings})

In [22]:
# Positive examples: every observed (user, book) interaction is labelled 1.
ratingsValidBinary = [(user, book, 1) for user, book, _ in ratingsValid]
ratingsTrainBinary = [(user, book, 1) for user, book, _ in ratingsTrain]

# Per-user book sets give O(1) membership tests while sampling (the previous
# version rebuilt a list on every retry) and leave ratingsPerUser untouched.
trainBooksByUser = defaultdict(set)
for user, book, _ in ratingsTrain:
    trainBooksByUser[user].add(book)

allBooksByUser = defaultdict(set)
for user, book, _ in allRatings:
    allBooksByUser[user].add(book)


def sampleNegatives(interactions, seenByUser):
    """Draw one random unseen book (label 0) for each interaction.

    Args:
        interactions: Iterable of (user, book, rating) tuples; one negative is
            drawn per element, for that element's user.
        seenByUser: Mapping user -> set of books that must not be sampled.

    Returns:
        List of (user, book, 0) tuples.
    """
    negatives = []
    for user, _, _ in interactions:
        seen = seenByUser.get(user, ())
        if len(seen) >= len(allbooks):
            # Every candidate book is already excluded for this user, so
            # rejection sampling would never terminate; skip the user.
            continue
        newBook = random.choice(allbooks)
        while newBook in seen:
            newBook = random.choice(allbooks)
        negatives.append((user, newBook, 0))
    return negatives


# Validation negatives exclude EVERY known interaction of the user; checking
# only the training books could relabel a held-out positive pair as 0.
ratingsValidBinary.extend(sampleNegatives(ratingsValid, allBooksByUser))
# Training negatives are checked against training interactions only, so no
# information from the validation split influences the training examples.
ratingsTrainBinary.extend(sampleNegatives(ratingsTrain, trainBooksByUser))

In [23]:
# Shuffle both example lists in place (validation first, then training).
for examples in (ratingsValidBinary, ratingsTrainBinary):
    random.shuffle(examples)

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold

def compute_jaccard(set1, set2):
    """Return the Jaccard similarity |set1 ∩ set2| / |set1 ∪ set2|.

    When both sets are empty the union is empty and 0 is returned.
    """
    union_size = len(set1.union(set2))
    if union_size == 0:
        return 0
    shared = len(set1.intersection(set2))
    return shared / union_size

def create_features(ratingsBinary, ratingsPerItem, ratingsPerUser):
    """Build one feature row per (user, book, label) example.

    Features for the candidate pair are computed as if that pair had not been
    observed: the user is removed from the candidate book's readers and the
    candidate book is removed from the user's history. Without this, positive
    training pairs (which are present in the lookup tables) get inflated
    similarity/popularity compared with validation pairs (which are not).

    Args:
        ratingsBinary: Iterable of (user, book, label) tuples.
        ratingsPerItem: Mapping book -> list of (user, rating) tuples.
        ratingsPerUser: Mapping user -> list of (book, rating) tuples.

    Returns:
        pandas.DataFrame with columns 'user', 'book', 'max_jaccard',
        'book_popularity', 'user_books_count' and 'label'. Unknown users or
        books produce zero-valued features; the lookup tables are not mutated.
    """
    columns = ['user', 'book', 'max_jaccard', 'book_popularity',
               'user_books_count', 'label']
    readers_cache = {}

    def readers_of(book_id):
        # Each book's reader set is needed for many examples; build it once.
        readers = readers_cache.get(book_id)
        if readers is None:
            readers = {u for u, _ in ratingsPerItem.get(book_id, ())}
            readers_cache[book_id] = readers
        return readers

    data = []
    for user, book, label in ratingsBinary:
        # .get() instead of [] so defaultdict inputs do not grow as a side effect.
        user_history = ratingsPerUser.get(user, ())
        book_history = ratingsPerItem.get(book, ())

        # Readers of the candidate book other than the user being scored.
        other_readers = readers_of(book) - {user}
        # Books in the user's history other than the candidate itself.
        user_books = {b for b, _ in user_history if b != book}

        # Highest Jaccard similarity between the candidate book and any book
        # the user already has; 0 when the user has no other books.
        maxjaccard = max(
            (compute_jaccard(readers_of(b), other_readers) for b in user_books),
            default=0,
        )

        # Number of interactions with the candidate book by other users.
        popularity = sum(1 for u, _ in book_history if u != user)

        # Number of interactions recorded for the user. The full count is kept
        # (candidate included) so that positive and negative examples of the
        # same user share the same value.
        user_books_count = len(user_history)

        data.append({
            'user': user,
            'book': book,
            'max_jaccard': maxjaccard,
            'book_popularity': popularity,
            'user_books_count': user_books_count,
            'label': label
        })

    # Passing the column list keeps the schema stable for empty input.
    return pd.DataFrame(data, columns=columns)



In [26]:
# Build the feature tables: training examples first, then validation examples.
df_train, df_valid = (
    create_features(examples, ratingsPerItem, ratingsPerUser)
    for examples in (ratingsTrainBinary, ratingsValidBinary)
)

# Columns fed to the model; 'label' is the prediction target.
feature_cols = ['max_jaccard', 'book_popularity', 'user_books_count']

X_train, y_train = df_train[feature_cols], df_train['label']
X_valid, y_valid = df_valid[feature_cols], df_valid['label']

# Standardise with statistics fitted on the training split, then apply the
# same transform to the validation split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)


In [None]:
def train_and_evaluate(df_train, df_valid, feature_cols=None, param_grid=None):
    """Grid-search a random forest on df_train and report metrics on df_valid.

    Args:
        df_train: DataFrame holding the feature columns and a 'label' column.
        df_valid: DataFrame with the same columns, used only for evaluation.
        feature_cols: Names of the feature columns. Defaults to
            ['max_jaccard', 'book_popularity', 'user_books_count'].
        param_grid: Hyper-parameter grid handed to GridSearchCV. Defaults to
            the 24-candidate grid over n_estimators, max_depth,
            min_samples_split and min_samples_leaf.

    Returns:
        Tuple (best_clf, scaler): the estimator refit with the best
        parameters, and the StandardScaler fitted on the training features.

    Prints accuracy, ROC AUC and a classification report for df_valid.
    """
    # None sentinels avoid sharing mutable default arguments between calls.
    if feature_cols is None:
        feature_cols = ['max_jaccard', 'book_popularity', 'user_books_count']
    if param_grid is None:
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2]
        }
    # A list (not a tuple) is required for pandas column selection.
    feature_cols = list(feature_cols)

    X_train = df_train[feature_cols]
    y_train = df_train['label']
    X_valid = df_valid[feature_cols]
    y_valid = df_valid['label']

    # Scaling: fit on training data only and reuse the statistics for validation.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_valid_scaled = scaler.transform(X_valid)

    # Random forest tuned with 5-fold stratified cross-validation, selecting on ROC AUC.
    clf = RandomForestClassifier(random_state=42)
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    grid_search = GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        cv=kfold,
        n_jobs=-1,
        scoring='roc_auc',
        verbose=2
    )

    grid_search.fit(X_train_scaled, y_train)
    best_clf = grid_search.best_estimator_

    # Hard predictions for accuracy/report, positive-class probabilities for AUC.
    y_pred = best_clf.predict(X_valid_scaled)
    y_proba = best_clf.predict_proba(X_valid_scaled)[:,1]

    accuracy = accuracy_score(y_valid, y_pred)
    auc = roc_auc_score(y_valid, y_proba)
    report = classification_report(y_valid, y_pred)

    print(f'Best Model Accuracy: {accuracy:.4f}')
    print(f'Best Model AUC: {auc:.4f}')
    print('Best Model Classification Report:')
    print(report)

    return best_clf, scaler


In [None]:
# Requires df_train and df_valid (the feature tables returned by create_features) to be defined.

# Tune the classifier on df_train and print its metrics on df_valid; keep the
# best estimator and the scaler that train_and_evaluate fitted (this rebinds `scaler`).
model, scaler = train_and_evaluate(df_train, df_valid)


Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [None]:
# Pair each feature name with the fitted model's importance score, largest first.
feature_names = feature_cols
importances = model.feature_importances_
feature_importances = pd.Series(data=importances, index=feature_names)
feature_importances = feature_importances.sort_values(ascending=False)
print(feature_importances)