In [1]:
import os
import pickle
import random
import warnings

import numpy as np
import numpy.random as nprnd
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# NOTE(review): this silences every warning for the whole kernel, including
# deprecation notices from pandas / NumPy / scikit-learn and divide-by-zero
# RuntimeWarnings raised by the min-max scaling code in this notebook.
warnings.filterwarnings("ignore")


def new_rand_user(dim=9364):
    '''Build a synthetic rating vector for a random user, scaled to 0..10.

    10000 positions are drawn (with replacement) from the first half of the
    vector. Each drawn position ``i`` receives the raw value ``i + i*0.005``;
    the whole vector is then min-max scaled to the range 0..10. Positions
    that were never drawn keep the value 0.

    Parameters
    ----------
    dim : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``dim``. When there is nothing to draw
        from (``dim < 2``) or every drawn value equals the minimum, the
        all-zero vector is returned instead of NaNs.
    '''
    user_new = np.zeros(dim)

    # randint expects an integer bound; dim*0.5 was a float.
    half = dim // 2
    if half < 1:
        return user_new

    picks = nprnd.randint(half, size=10000)
    # Duplicate picks all write the same value, so one vectorised assignment
    # is equivalent to assigning them one at a time.
    user_new[picks] = picks + picks * 0.005

    lowest = np.min(user_new)
    span = np.max(user_new) - lowest
    if span == 0:
        # Degenerate case (e.g. only position 0 was drawn): avoid 0/0.
        return user_new
    return ((user_new - lowest) / span) * 10

def get_deviation(tmp_df,item_index):
    '''Baseline score for one item: grand mean + user offset + item offset.

    Parameters
    ----------
    tmp_df : pandas.DataFrame
        Rating matrix that must contain a ``new_user`` column.
    item_index : label
        Row label of the item; every row whose label equals it is averaged.

    Returns
    -------
    float
        ``grand_mean + (mean(new_user) - grand_mean)
        + (mean(item rows) - grand_mean)``.
    '''
    # NOTE(review): the nested np.mean relies on how np.mean reduces a
    # DataFrame, which has differed between pandas versions -- confirm it
    # matches the intended grand mean when the frame contains NaNs.
    grand_mean = np.mean(np.mean(tmp_df))

    user_offset = np.mean(tmp_df.new_user.values) - grand_mean

    item_rows = tmp_df.loc[tmp_df.index == item_index]
    item_offset = np.mean(item_rows.values) - grand_mean

    return grand_mean + user_offset + item_offset

def get_score(top_k,tmp_df,item_index):
    '''Predict the new user's rating for ``item_index`` from similar items.

    The prediction is the item's baseline (``get_deviation``) plus the
    similarity-weighted average of the neighbours' residuals, where a
    residual is the neighbour's ``new_user`` rating minus that neighbour's
    own baseline.

    Parameters
    ----------
    top_k : pandas.Series
        Similarity weights indexed by the neighbour items' row labels.
    tmp_df : pandas.DataFrame
        Rating matrix containing a ``new_user`` column.
    item_index : label
        Row label of the item being scored.

    Returns
    -------
    float
        The predicted rating. Falls back to the plain baseline when there
        are no neighbours or their weights sum to zero, instead of
        returning NaN/inf from a division by zero.
    '''
    base_score = get_deviation(tmp_df, item_index)

    weights = top_k.values
    weight_sum = np.sum(weights)
    if len(weights) == 0 or weight_sum == 0:
        return base_score

    residuals = np.array([
        tmp_df.loc[tmp_df.index == indx].new_user.values[0] - get_deviation(tmp_df, indx)
        for indx in top_k.index
    ])
    return base_score + (weights @ residuals) / weight_sum


def create_train_file(train_file_names):
    '''Load pickled prediction matrices and return their element-wise mean.

    Parameters
    ----------
    train_file_names : iterable of str
        Paths of the pickle files to average. Any number of files (>= 1)
        is accepted; the objects are added with ``+`` in the given order
        and divided by the number of files.

    Returns
    -------
    Same type as the unpickled objects (whatever ``+`` and ``/`` produce).

    Raises
    ------
    ValueError
        If no file names are given.
    '''
    names = list(train_file_names)
    if not names:
        raise ValueError("train_file_names must contain at least one file")

    total = None
    for name in names:
        # NOTE(review): pickle.load can execute arbitrary code; only use
        # this on files produced by a trusted pipeline.
        with open(name, 'rb') as handle:  # closed even if unpickling fails
            fold = pickle.load(handle)
        total = fold if total is None else total + fold

    return total / len(names)


def _min_max_0_10(data):
    '''Min-max scale a Series (or each column of a DataFrame) to 0..10.'''
    # .min()/.max() are used instead of np.min/np.max so that a DataFrame is
    # always reduced column by column, independent of the pandas version.
    lowest = data.min()
    return ((data - lowest) / (data.max() - lowest)) * 10


def get_online_recommendations(file_names,new_user_text):
    '''Score the items a new user has not rated and return them ranked.

    Parameters
    ----------
    file_names : iterable of str
        Pickle files passed to ``create_train_file``; the averaged result
        must be a DataFrame whose rows are the items.
    new_user_text : str
        Tab-separated file without a header. Column 0 becomes the index of
        the result, column 1 holds the user's ratings (blank = not rated)
        and column 2 fills the ``books`` column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per line of ``new_user_text``, sorted by descending score,
        with columns ``new_user`` (score rescaled to 0..10; items the user
        had rated are forced to the minimum) and ``books``.

    Notes
    -----
    * NOTE(review): lines of the user file are matched to the first rows of
      the train matrix by position, not by identifier -- confirm the file
      is written in the same item order.
    * A rating that scales to exactly 0 is treated as "not rated".
    * Scores predicted earlier in the loop count as ratings for later items,
      because the set of rated rows is re-evaluated on every iteration.
    '''
    # Scale every item (a column after the transpose) to 0..10, then flip
    # back so that rows are items again.
    items_by_users = _min_max_0_10(create_train_file(file_names).T).T

    user_file = pd.read_csv(new_user_text, sep='\t', header=None)
    raw_ratings = user_file.iloc[:, 1]
    # Remember which lines carried a real rating before blanks become 0.
    rated_positions = np.flatnonzero(raw_ratings.notnull().values)
    # fillna returns a new Series; the chained inplace call on
    # iloc[:, 1] is not guaranteed to write through to the frame.
    ratings = _min_max_0_10(raw_ratings.fillna(0))

    new_user = pd.DataFrame(
        {'new_user': ratings.values},
        index=items_by_users.index[:len(ratings)],
    )

    tmp_df = pd.concat([items_by_users, new_user], axis=1)
    # Only the rows covered by the user file; copy so the predicted scores
    # are written to an independent frame rather than an implicit slice.
    scored = tmp_df.loc[tmp_df['new_user'].notnull()].copy()

    to_predict = scored.index[(scored['new_user'] == 0).values]
    for item_index in to_predict:
        rated_rows = scored.loc[scored['new_user'] != 0]
        # cosine_similarity needs 2-D input: one row holding the target item.
        target_row = scored.loc[item_index, :].values.reshape(1, -1)
        similarity = cosine_similarity(rated_rows, target_row)
        item_scores = pd.Series(similarity[:, 0], index=rated_rows.index, name='score')
        # Series.order() no longer exists; sort_values is its replacement.
        top_5 = item_scores.sort_values(ascending=False).iloc[:5]
        scored.loc[item_index, 'new_user'] = get_score(top_5, scored, item_index)

    tst = scored.iloc[:, [-1]].reindex(new_user.index)
    tst.index = user_file.iloc[:, 0].values
    tst['books'] = user_file.iloc[:, 2].values
    # Items the user already rated must not be recommended again.
    tst.iloc[rated_positions, 0] = -1

    top_rec = tst.sort_values('new_user', ascending=False)
    top_rec.iloc[:, 0] = _min_max_0_10(top_rec.iloc[:, 0])
    return top_rec


def read_Books(file_name):
    '''Read the Initial Book Data

    Parameters
    ----------
    file_name : str
        Path of a ';'-separated, UTF-8 encoded CSV file. A relative path is
        resolved against the current working directory; an absolute path is
        used as given.

    Returns
    -------
    pandas.DataFrame
        The parsed file; malformed lines are dropped rather than raising.
    '''
    path = os.path.join(os.getcwd(), file_name)
    try:
        # Current pandas spelling of "drop malformed lines, with a warning".
        return pd.read_csv(path, sep=';', encoding='utf-8', on_bad_lines='warn')
    except TypeError:
        # Older pandas does not know on_bad_lines; use the legacy flag.
        return pd.read_csv(path, sep=';', encoding='utf-8', error_bad_lines=False)

def take_input(file_name):
    '''Read From the File, ISBN Per Line (For Online Version only)

    Each non-blank line holds an ISBN followed by an integer rating. The
    two are normally separated by whitespace (``"<isbn> <rating>"``); a
    line without whitespace is read positionally instead: the last
    character is the rating and everything before the separator character
    is the ISBN.

    Parameters
    ----------
    file_name : str
        Path of the UTF-8 text file to read.

    Returns
    -------
    (list of str, list of int)
        ISBNs and ratings, in file order.

    Raises
    ------
    ValueError
        If a rating is not an integer.
    '''
    isbn = []
    rat = []
    with open(file_name, mode='r', encoding='utf8') as target:
        for raw_line in target:
            # Strip only the line ending, so a final line without a
            # trailing newline keeps its last digit.
            line = raw_line.rstrip('\r\n')
            if not line.strip():
                continue  # blank line
            parts = line.rsplit(None, 1)
            if len(parts) == 2:
                code, rating = parts  # also handles multi-digit ratings
            else:
                code, rating = line[:-2], line[-1]
            isbn.append(str(code))
            rat.append(int(rating))
    return isbn,rat

def book_data():
    '''Save the Book Data to print Results

    Reads 'BX-Books.csv' through ``read_Books``, keeps its first three
    columns under the names ISBN / Title / Author and writes them, indexed
    by ISBN, to 'Books.csv' (UTF-8). Returns nothing.
    '''
    catalogue = read_Books('BX-Books.csv').iloc[:, 0:3]
    catalogue.columns = ['ISBN', 'Title', 'Author']
    catalogue.set_index('ISBN').to_csv('Books.csv', encoding='utf8')
    
    
def create_inputfile(train_file, n_ratings=70, pool_size=504, out_path='input.txt'):
    '''Write a file of random ratings for randomly chosen items.

    ``n_ratings`` distinct rows are sampled from the first ``pool_size``
    rows of ``train_file`` and each gets a random integer rating in 0..9.
    One line ``"<row label> <rating>"`` is written per sampled row.

    Parameters
    ----------
    train_file : pandas.DataFrame
        Frame whose index supplies the item labels.
    n_ratings : int
        Number of items to rate; capped at the size of the pool.
    pool_size : int
        Only the first ``pool_size`` rows are eligible; capped at the
        number of rows in ``train_file``.
    out_path : str
        File to write (UTF-8, overwritten if present).

    Notes
    -----
    Draws from both ``random`` and ``np.random``; seed both for a
    reproducible file.
    '''
    # Cap both sizes so frames shorter than the defaults do not raise.
    pool = min(pool_size, len(train_file.index))
    count = min(n_ratings, pool)

    random_indices = random.sample(range(pool), count)
    indices = train_file.index[random_indices]
    values = np.random.randint(10, size=(len(indices)))

    with open(out_path, mode='w', encoding='utf8') as target:
        for label, value in zip(indices, values):
            target.write(str(label)+' '+str(value))
            target.write('\n')


def create_new_user_df(train_file):
    '''Attach the ratings from 'input.txt' to ``train_file`` as ``new_user``.

    Parameters
    ----------
    train_file : pandas.DataFrame
        Rating matrix indexed by item label. It is modified in place: a
        ``new_user`` column is added (or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same ``train_file`` object. ``new_user`` holds the rating for
        every row label listed in 'input.txt' and NaN elsewhere. If a label
        is listed more than once, the last rating wins.

    Notes
    -----
    ``take_input`` returns the labels as strings, so they only match an
    index that also stores its labels as strings.
    '''
    isbn, ratings = take_input('input.txt')

    new_user = pd.Series(np.nan, index=train_file.index, dtype=float)
    for code, rating in zip(isbn, ratings):
        # Labels that are not rows of train_file are skipped; they would
        # not survive the index alignment of the assignment below anyway.
        if code in new_user.index:
            new_user.loc[code] = rating

    train_file['new_user'] = new_user
    return train_file

def get_online_recommendations_1(Train0, max_items=100, top_n=100):
    '''Recommend items for the user described by 'input.txt'.

    Parameters
    ----------
    Train0 : pandas.DataFrame
        Rating matrix indexed by item label. It is modified in place by
        ``create_new_user_df`` (a ``new_user`` column is added) and its
        NaNs are replaced by 0.
    max_items : int or None
        How many unrated items (in index order) get a predicted score;
        ``None`` scores all of them. Items that are not scored keep 0.
    top_n : int
        Number of rows in the returned table.

    Returns
    -------
    pandas.DataFrame
        Columns ``isbns``, ``ratings``, ``title``, ``author``: the
        ``top_n`` highest-scoring items, best first. Items the user had
        already rated are given the score -1 so they rank last. Title and
        author are looked up in 'Books.csv'.

    Raises
    ------
    KeyError
        If a recommended item label is missing from 'Books.csv'.
    '''
    Train0_new = create_new_user_df(Train0)

    # Record which items were rated BEFORE the NaNs are zero-filled:
    # Train0 and Train0_new are the same object, so a null test made
    # after fillna would report every row as rated.
    rated_mask = Train0_new['new_user'].notnull().values
    rated_positions = np.flatnonzero(rated_mask)
    unrated_index = Train0_new.index[~rated_mask]
    Train0_new.fillna(0, inplace=True)

    # Ratings of rated rows never change in the loop, so slice them once.
    rated_rows = Train0_new.loc[rated_mask]
    for index in unrated_index[:max_items]:
        # Score the item of this iteration (not a fixed row); 2-D shape is
        # required by cosine_similarity.
        new_item = Train0_new.loc[index, :].values.reshape(1, -1)
        similarity = cosine_similarity(rated_rows, new_item)
        item_scores = pd.Series(similarity[:, 0], index=rated_rows.index, name='score')
        # Series.order() no longer exists; sort_values is its replacement.
        top_5 = item_scores.sort_values(ascending=False).iloc[:5]
        Train0_new.loc[index, 'new_user'] = get_score(top_5, Train0_new, index)

    print('Values calculated...preparing dataframe')
    books = pd.read_csv('Books.csv', encoding='utf8').set_index('ISBN')

    # Work on a copy so the -1 markers do not leak into the caller's frame.
    out_series = Train0_new['new_user'].copy()
    out_series.iloc[rated_positions] = -1

    ranked = out_series.sort_values(ascending=False).iloc[:top_n]
    isbns = list(ranked.index)
    title = [books.loc[label, 'Title'] for label in isbns]
    author = [books.loc[label, 'Author'] for label in isbns]

    out_df = pd.DataFrame([isbns, list(ranked.values), title, author]).T
    out_df.columns = ['isbns', 'ratings', 'title', 'author']

    return out_df



# Pickle files Train0 .. Train4 that create_train_file averages.
file_names=['Train%dFinal_Predictions.pkl' % fold for fold in range(5)]

In [None]:
# Average the pickled prediction matrices, then run the recommender that
# takes the user's ratings from 'input.txt'. It also needs 'Books.csv' in
# the working directory and adds a 'new_user' column to train_file in place.
train_file=create_train_file(file_names)
get_online_recommendations_1(train_file)#expensive process !! 10 minutes for 1000 calculations !


In [3]:
# Recommend from the tab-separated ratings file 'new_user.txt'; the returned
# frame is sorted by descending score and displayed below.
c=get_online_recommendations(file_names,'new_user.txt')
c

Unnamed: 0,new_user,books
312963009,10.0,Neanderthal: A Novel
312104243,9.291426,"Behold, a Mystery!: A Regency Story"
312995423,9.282026,Digital Fortress : A Thriller
380815923,8.453411,Blackberry Wine : A Novel
385416342,8.202088,The Firm
425098109,8.087594,Thieves of Light (Photon : the Ultimate Game o...
380704587,7.97922,The Ladies of Missalonghi
449202631,7.930718,Danger
517577402,7.926499,Mostly Harmless
590481371,7.895766,I Spy Spooky Night: A Book of Picture Riddles ...
