# Predict Read

In [1]:
import numpy as np
from collections import defaultdict
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [2]:
def Acc(pred, y):
    '''
    Calculate accuracy
    
    Arges: pred(list): model prediction
           y(list): ground truth
    
    Returns: Acc(float): Accuracy
    '''
    TP_ = np.logical_and(pred, y)
    FP_ = np.logical_and(pred, np.logical_not(y))
    TN_ = np.logical_and(np.logical_not(pred), np.logical_not(y))
    FN_ = np.logical_and(np.logical_not(pred), y)

    TP = sum(TP_)
    FP = sum(FP_)
    TN = sum(TN_)
    FN = sum(FN_)

    # accuracy
    Acc = (TP + TN) / (TP + FP + TN + FN)
    
    return Acc

In [18]:
class Data_Processor():
    def __init__(self, filename, mode ='val'):
        '''
        Initialization
        '''
        self.data = pd.read_csv(filename)
        self.mode = mode
        self.usersPerBook = defaultdict(set)
        self.booksPerUser = defaultdict(set)
        self.user_Map = defaultdict(int)
        self.book_Map = defaultdict(int)
        self.bookCount = defaultdict(int)
        self.read = set()
        self.totalRead = 0
        
        if mode ==  'val':
            self.train_set = self.data[:190000]
            self.val_set = self.data[190000:]
            self.usersPerBook_train = defaultdict(set)
            self.booksPerUser_train = defaultdict(set)
            
        elif mode == 'test':
            self.train_set = self.data
            
    def preprocessing(self):
        '''
        Creater Dictionarys and Sets for the following tasks
        '''
        u_Count = 0
        b_Count = 0
        count = 0
        for i in self.data.values:
            user, book, r = i
            if self.mode =='val' and count<190000:
                self.usersPerBook_train[book].add(user)
                self.booksPerUser_train[user].add(book)
                
            if user not in self.user_Map:
                self.user_Map[user] = u_Count
                u_Count += 1
            if book not in self.book_Map:
                self.book_Map[book] = b_Count
                b_Count += 1
            self.usersPerBook[book].add(user)
            self.booksPerUser[user].add(book)
            self.read.add((user,book))
            count+=1
            
    def add_negative_samples(self, num, save_filename):
        '''
        Add negative samples into validation set
        
        Args: num(int): how many negative samples should be created
              save_filename(str): the name of save file of validation data
        '''
        non_read= list()
        count = 0

        userID = list(self.booksPerUser_train.keys())
        bookID = list(self.usersPerBook_train.keys())

        while count < num:
            choose_user = random.choice(list(userID))
            choose_book = random.choice(list(bookID))
            pair = (choose_user,choose_book)
            if choose_book not in self.booksPerUser[choose_user]:
                if pair not in non_read:
                    non_read+=[(choose_user,choose_book)]
                    count+=1
                    
        self.save_filename = save_filename        
        with open(self.save_filename,'w+') as f:
            for pos_d in self.val_set.values:
                f.writelines(pos_d[0]+'-'+pos_d[1]+','+'1\n')
            for neg_d in non_read:
                f.writelines(neg_d[0]+'-'+neg_d[1]+','+'0\n')
    
    
    def build_cosine_table(self):
        '''
        Build User-Book interaction matrix
        '''
        table = pd.pivot_table(self.train_set, values='rating', index=['userID'], columns=['bookID'], aggfunc=int)
        table.fillna(0, inplace= True)
        table.replace([1,2,3,4,5],[-1,-1,0,1,1], inplace= True)
        self.users = list(table.index)
        self.books = list(table.columns)

        cosine_sim_user = cosine_similarity(table)
        cosine_sim_book = cosine_similarity(table.T)

        self.c_sim_book = pd.DataFrame(cosine_sim_book, index = self.books, columns = self.books )
        self.c_sim_user = pd.DataFrame(cosine_sim_user, index = self.users, columns = self.users)

  
    def load_gamma(self, user_file_name, book_file_name):
        '''
        Load (bset) Gamma-matrix using one class recommendation algorithm
        '''
        self.gamma_u = np.load(user_file_name)
        self.gamma_b = np.load(book_file_name)
        
    
    def compute_gamma(self, n_epoch=40, latent_factor = 3, threshold = 0.0003, lr = 0.08):
        '''
        Compute gamma matrix using one class recommendation (latent factor)
        '''
    
        self.gamma_u = np.random.normal(scale = 1./latent_factor, size = (len(self.booksPerUser), latent_factor))
        self.gamma_b = np.random.normal(scale = 1./latent_factor, size = (len(self.usersPerBook), latent_factor))
        
        bookID = list(self.usersPerBook.keys())
        dd = self.train_set.values.copy()
        np.random.shuffle(dd)
        old_acc, acc = 0, 0

        for n in range(n_epoch):
            old_acc = acc
            for i in dd:   
                user, book, r = i

                nolook = random.choice(list(bookID))
                while nolook in self.booksPerUser[user]:
                    nolook = random.choice(list(bookID))


                same_factor = np.exp(self.gamma_u[self.user_Map[user],:].dot(self.gamma_b[self.book_Map[nolook],:])\
                                     - self.gamma_u[self.user_Map[user],:].dot(self.gamma_b[self.book_Map[book],:]))
                denom = 1 + same_factor

                tmp =  self.gamma_u[self.user_Map[user],:]
                self.gamma_u[self.user_Map[user],:] -= lr * (self.gamma_b[self.book_Map[nolook],:] - self.gamma_b[self.book_Map[book],:]) * same_factor / denom
                self.gamma_b[self.book_Map[book],:] -= lr * (- tmp) * same_factor / denom
                self.gamma_b[self.book_Map[nolook],:] -= lr * ( tmp) * same_factor / denom


            with open(self.save_filename, 'r') as f:
                count_acc = 0
                for l in f:
                    if l.startswith("userID"):
                        #header
                        continue
                    else:
                        userbook, label = l.strip().split(',')
                        u, b = userbook.split('-')

                        pred = sigmoid(self.gamma_u[self.user_Map[u], :].dot(self.gamma_b[self.book_Map[b], :]))
                        prediction = 1 if pred>=0.5 else 0
                        count_acc += int(prediction == int(label))
                acc = count_acc/ 20000.0
                print('Epoch: %d - Acc: %f' % (n, acc))

            if (old_acc - acc) >= threshold: break
            np.random.shuffle(dd)


        
        
        

In [15]:
filename = "assignment1/train_Interactions.csv.gz"
Read_Predictor = Data_Processor(filename)
Read_Predictor.preprocessing()
Read_Predictor.add_negative_samples(10000, 'assignment1/pairs_readbook_valid.txt')
Read_Predictor.build_cosine_table()
#Read_Predictor.load_gamma('gamma_u.npy','gamma_b.npy')

In [19]:
Read_Predictor.compute_gamma()

Epoch: 0 - Acc: 0.499900
Epoch: 1 - Acc: 0.500150
Epoch: 2 - Acc: 0.504000
Epoch: 3 - Acc: 0.509950
Epoch: 4 - Acc: 0.531900
Epoch: 5 - Acc: 0.565550
Epoch: 6 - Acc: 0.600950
Epoch: 7 - Acc: 0.622250
Epoch: 8 - Acc: 0.639850
Epoch: 9 - Acc: 0.647600
Epoch: 10 - Acc: 0.654100
Epoch: 11 - Acc: 0.659200
Epoch: 12 - Acc: 0.663750
Epoch: 13 - Acc: 0.664450
Epoch: 14 - Acc: 0.667800
Epoch: 15 - Acc: 0.669900
Epoch: 16 - Acc: 0.673700
Epoch: 17 - Acc: 0.672950


# Cosine Similarity

In [20]:
# cosine similarity
def cosine_similarity_book(b, b_other):
    '''
    Return item-based cosine similarity
    
    Args: b(str): validated book
          b_other(str): other books
    '''
    
    if b in Read_Predictor.books:
        return Read_Predictor.c_sim_book.loc[b][b_other]
    else:
        return 0
    
def cosine_similarity_user(u, u_other):
    '''
    Return user-based cosine similarity
    
    Args: u(str): validated user
          u_other(str): other users
    '''
    
    if u in Read_Predictor.users:
        return Read_Predictor.c_sim_user.loc[u][u_other]
    else:
        return 0

# item_based
def cosine_mostSimilar_book(u, b, usersPerBook_, booksPerUser_, decide_on = 1):
    '''
    Giver a pair(user, book), compute the Cosine Similarity betwwen b and b'
    
    Arges: u(string): user
           b(string): book
           decide_on(int): the number of maximum similarities to be considered
                           (default 1)
           usersPerBook_(dict): set of users for each book
           booksPerUser_(dict): set of books for each user
    Returns:
           Avg of biggest similarities with respect to the number of decide_on
    
    '''
    similarities = []
    
    candidateItems = set()
    
    for book in booksPerUser_[u]:
        if book == b: continue
        candidateItems.add(book)

    ## candidateItems: b'
    for i2 in candidateItems:
        sim = cosine_similarity_book(b, i2)
        similarities.append(sim)
    
 
    similarities.sort(reverse=True)
    if decide_on == 1:
        return max(similarities) if similarities else 0 
    else:        
        if len(similarities)<=decide_on:
            return sum(similarities)/len(similarities)  if similarities else 0
        else:
            return sum(similarities[:decide_on])/decide_on
        
# user_based
def cosine_mostSimilar_user(u, b, booksPerUser_, usersPerBook_, decide_on=1):
    '''
    Giver a pair(user, book), compute the Cosine Similarity betwwen u and u'
    
    Arges: u(string): user
           b(string): book
           decide_on(int): the number of maximum similarities to be considered
                           (default 1)
           booksPerUser_(dict): set of books for each user
           usersPerBook_(dict): set of users for each book
    Returns:
           Avg of biggest similarities with respect to the number of decide_on
    
    '''
    similarities = []

    candidateUsers = set()
    
    for user in usersPerBook_[b]:
        if user == u: continue
        candidateUsers.add(user)
        
    ## candidateUsers: u'
    for u2 in candidateUsers:
        sim = cosine_similarity_user(u,u2)
        similarities.append(sim)
        
    similarities.sort(reverse=True)
    if decide_on == 1:
        return max(similarities) if similarities else 0 
    else:        
        if len(similarities)<=decide_on:
            return sum(similarities)/len(similarities)  if similarities else 0
        else:
            return sum(similarities[:decide_on])/decide_on

# Jaccard Similarity

In [21]:
def Jaccard(s1, s2):
    '''
    Calculate Jaccard similarity
    Args: s1, s2 (set): two sets to be calculated
    
    Returns: similarity (float)
    
    '''
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer / denom

# item_based
def jaccard_mostSimilar_book(u, b, usersPerBook_, booksPerUser_, decide_on = 1):
    '''
    Giver a pair(user, book), compute the Jaccard Similarity betwwen b and b'
    
    Arges: u(string): user
           b(string): book
           decide_on(int): the number of maximum similarities to be considered
                           (default 1)
           usersPerBook_(dict): set of users for each book
           booksPerUser_(dict): set of books for each user
    Returns:
           Avg of biggest similarities with respect to the number of decide_on
    
    '''
    similarities = []
    users = usersPerBook_[b]
    
    candidateItems = set()
    
    for book in booksPerUser_[u]:
        if book == b: continue
        candidateItems.add(book)

    ## candidateItems: b'
    for i2 in candidateItems:
        if i2 == b: continue
        sim = Jaccard(users, usersPerBook_[i2])
        similarities.append(sim)
    
    similarities.sort(reverse=True)
    if decide_on == 1:
        return max(similarities) if similarities else 0 
    else:        
        if len(similarities)<=decide_on:
            return sum(similarities)/len(similarities)  if similarities else 0
        else:
            return sum(similarities[:decide_on])/decide_on

# user_based
def jaccard_mostSimilar_user(u, b, booksPerUser_, usersPerBook_, decide_on=1):
    '''
    Giver a pair(user, book), compute the Jaccard Similarity betwwen u and u'
    
    Arges: u(string): user
           b(string): book
           decide_on(int): the number of maximum similarities to be considered
                           (default 1)
           booksPerUser_(dict): set of books for each user
           usersPerBook_(dict): set of users for each book
    Returns:
           Avg of biggest similarities with respect to the number of decide_on
    
    '''
    similarities = []
    books = booksPerUser_[u]

    candidateUsers = set()
    for user in usersPerBook_[b]:
        if user == u: continue
        candidateUsers.add(user)
        
    ## candidateUsers: u'
    for u2 in candidateUsers:
        if u2 == u: continue
        sim = Jaccard(books,booksPerUser_[u2])
        similarities.append(sim)
        
    similarities.sort(reverse=True)
    if decide_on == 1:
        return max(similarities) if similarities else 0 
    else:        
        if len(similarities)<=decide_on:
            return sum(similarities)/len(similarities)  if similarities else 0
        else:
            return sum(similarities[:decide_on])/decide_on

In [22]:
import math
def sigmoid(x):
    return 1 / (1 + math.exp(-x))

# Creating each features

In [23]:
with open("assignment1/pairs_readbook_valid.txt", 'r') as f:
    dissimilar = []
    popularity = []
    c_s_book = []
    c_s_user = []
    jar_s_book = []
    jar_s_user = []
    label = []
    for l in f:
        if l.startswith("userID"):
            #header
            continue
        else:
            userbook, lab = l.strip().split(',')
            user, book = userbook.split('-')

            pred = sigmoid(Read_Predictor.gamma_u[Read_Predictor.user_Map[user], :].dot(Read_Predictor.gamma_b[Read_Predictor.book_Map[book], :]))
            dissimilar.append(1 if pred>=0.5 else 0)
        
            # popularity.append(int(book in return_best))  
            count = Read_Predictor.bookCount[book] if Read_Predictor.bookCount[book] else 0
            popularity.append(count)
            
            jar_s_book.append(jaccard_mostSimilar_book(user, book, Read_Predictor.usersPerBook_train, Read_Predictor.booksPerUser_train, 15))
            jar_s_user.append(jaccard_mostSimilar_user(user, book, Read_Predictor.booksPerUser_train, Read_Predictor.usersPerBook_train, 15))
            c_s_book.append(cosine_mostSimilar_book(user, book, Read_Predictor.usersPerBook_train, Read_Predictor.booksPerUser_train, 15))
            c_s_user.append(cosine_mostSimilar_user(user, book, Read_Predictor.booksPerUser_train, Read_Predictor.usersPerBook_train, 15))
            
            label.append(int(lab))

popularity = [np.log(p) if p != 0 else 0 for p in popularity]

# Model Prediction

In [24]:
X = np.column_stack((popularity, jar_s_book, jar_s_user, c_s_book, c_s_user, dissimilar))
Y = np.array((label))

# Shuffle the data
XY = list(zip(X,Y))
random.shuffle(XY)

X = [s[0] for s in XY]
Y = [s[1] for s in XY]

In [25]:
clf1_1 = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
clf1_1.fit(X,Y)
clf1_1_score = cross_val_score(clf1_1, X, Y, cv=5) 

clf1_2 = RandomForestClassifier(n_estimators=150, max_depth=5, random_state=0)
clf1_2.fit(X,Y)
clf1_2_score = cross_val_score(clf1_2, X, Y, cv=5)

clf1_3 = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=0)
clf1_3.fit(X,Y) 
clf1_3_score = cross_val_score(clf1_3, X, Y, cv=5)

print('1-1:',clf1_1_score.mean())
print('1-2:',clf1_2_score.mean())
print('1-3:',clf1_3_score.mean())

1-1: 0.7099
1-2: 0.7094500000000001
1-3: 0.7093999999999999


In [26]:
clf2_1 = LogisticRegression(random_state=0, solver='lbfgs',class_weight ='balanced')
clf2_1.fit(X, Y)
clf2_1_score = cross_val_score(clf2_1, X, Y, cv=5)

clf2_2 = LogisticRegression(random_state=0, solver='lbfgs',class_weight ='balanced', C=50)
clf2_2.fit(X, Y)
clf2_2_score = cross_val_score(clf2_2, X, Y, cv=5)

clf2_3 = LogisticRegression(random_state=0, solver='lbfgs',class_weight ='balanced', C=100)
clf2_3.fit(X, Y)
clf2_3_score = cross_val_score(clf2_3, X, Y, cv=5)
 
print('2-1:',clf2_1_score.mean())
print('2-2:',clf2_2_score.mean())
print('2-3:',clf2_3_score.mean())

2-1: 0.67365
2-2: 0.7031499999999999
2-3: 0.7046499999999999


In [27]:
clf3_1 = AdaBoostClassifier(n_estimators=100)
clf3_1.fit(X, Y)
clf3_1_score = cross_val_score(clf3_1, X, Y, cv=5) 

clf3_2 = AdaBoostClassifier(n_estimators=50)
clf3_2.fit(X,Y)
clf3_2_score = cross_val_score(clf3_2, X, Y, cv=5) 

clf3_3 = AdaBoostClassifier(n_estimators=25)
clf3_3.fit(X,Y)
clf3_3_score = cross_val_score(clf3_3, X, Y, cv=5) 

print('3-1:',clf3_1_score.mean())
print('3-2:',clf3_2_score.mean())
print('3-3:',clf3_3_score.mean())

3-1: 0.7077000000000001
3-2: 0.70735
3-3: 0.7072499999999999


In [28]:
clf4_1 = GradientBoostingClassifier(n_estimators=100)
clf4_1.fit(X, Y)
clf4_1_score = cross_val_score(clf4_1, X, Y, cv=5) 

clf4_2 = GradientBoostingClassifier(n_estimators=80)
clf4_2.fit(X, Y)
clf4_2_score = cross_val_score(clf4_2, X, Y, cv=5) 

clf4_3 = GradientBoostingClassifier(n_estimators=60)
clf4_3.fit(X, Y)
clf4_3_score = cross_val_score(clf4_3, X, Y, cv=5) 

print('4-1:',clf4_1_score.mean())
print('4-2:',clf4_1_score.mean())
print('4-3:',clf4_1_score.mean())

4-1: 0.70795
4-2: 0.70795
4-3: 0.70795


# Testing

In [30]:
filename = "assignment1/train_Interactions.csv.gz"
Read_Predictor = Data_Processor(filename, mode='test')
Read_Predictor.preprocessing()
Read_Predictor.build_cosine_table()
Read_Predictor.load_gamma('gamma_u.npy','gamma_b.npy')

In [31]:
with open("assignment1/pairs_Read.txt") as pair_read:
    dissimilar_test = []
    popularity_test = []
    jar_s_book_test = []
    jar_s_user_test = []
    c_s_book_test = []
    c_s_user_test = []
    
    for l in pair_read:
        if l.startswith("userID"):
            continue
        else:
            user, book = l.strip().split('-')
            try:
                pred = sigmoid(Read_Predictor.gamma_u[Read_Predictor.user_Map[user], :]\
                               .dot(Read_Predictor.gamma_b[Read_Predictor.book_Map[book], :]))
                dissimilar_test.append(1 if pred>=0.5 else 0)
            except:
                dissimilar_test.append(0)
            
            count = Read_Predictor.bookCount[book] if Read_Predictor.bookCount[book] else 0
            popularity_test.append(count)
            jar_s_book_test.append(jaccard_mostSimilar_book(user, book, Read_Predictor.usersPerBook, Read_Predictor.booksPerUser, 15))
            jar_s_user_test.append(jaccard_mostSimilar_user(user, book, Read_Predictor.booksPerUser, Read_Predictor.usersPerBook, 15))
            c_s_book_test.append(cosine_mostSimilar_book(user, book, Read_Predictor.usersPerBook, Read_Predictor.booksPerUser, 15))
            c_s_user_test.append(cosine_mostSimilar_user(user, book, Read_Predictor.booksPerUser, Read_Predictor.usersPerBook, 15))
                        

In [32]:
popularity_test = [np.log(p) if p != 0 else 0 for p in popularity_test]
X_test = np.column_stack((popularity_test, jar_s_book_test, jar_s_user_test, c_s_book_test, c_s_user_test, dissimilar_test))

In [33]:
check = []
with open("assignment1/predictions_Read.txt", 'w') as predictions:
    idx = 0
    for l in open("assignment1/pairs_Read.txt"):
        if l.startswith("userID"):
            #header
            predictions.write(l)
            continue
        else:
            user, book = l.strip().split('-')
            test_features = X_test[idx,:].reshape(-1,6)
            
            if clf4_3.predict(test_features)[0] == 1:
                check.append(1)
                predictions.write(user + '-' + book + ",1\n")
            else:
                check.append(0)
                predictions.write(user + '-' + book + ",0\n")
            
            idx +=1

In [34]:
Counter(check)

Counter({0: 9211, 1: 10789})