In [1]:
import numpy as np
import pandas as pd

In [2]:
options=5
items=5
types=2
ppl_cnt=10

In [3]:
par=np.random.randint(items, size=(options, types))

In [4]:
s = np.random.dirichlet(par[:,0], 1).transpose()
for i in range(1,types):
    s = np.column_stack((s,np.random.dirichlet(par[:,i], 1).transpose()))   

In [5]:
R=np.random.choice(range(1,6),items, p=s[:,0],replace=True)
for i in range(ppl_cnt):
    j=np.random.choice(types,1)
    R=np.column_stack((R,np.random.choice(range(1,6),items, p=s[:,j[0]],replace=True)))
R=R.T

In [6]:
S=R

In [7]:
sparsity=.75
index = np.random.choice(ppl_cnt*items,int(ppl_cnt*items*sparsity),replace=False)

In [8]:
l=np.zeros(len(index))
for x in index:
    i=x//items;j=x%items
    S[i,j]=0

In [9]:
S

array([[0, 0, 5, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 5, 1],
       [1, 1, 1, 0, 0],
       [1, 0, 1, 0, 0],
       [0, 4, 4, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 3, 0],
       [0, 4, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [1, 3, 1, 2, 5]])

In [10]:
mu=np.sum(S)/np.count_nonzero(S)
def fnctn(u,i):
    ru=np.mean(S[u,np.nonzero(S[u,:])]-mu)
    ci=np.mean(S[np.nonzero(S[:,i]),i]-mu-ru)
    return(mu-ru-ci)

In [11]:
fnctn(0,1)

1.8888888888888893

where $S[i,j]=0$ means that the user $i$ has not rated the item $j$. To predict $S[i,j]$, we proceed with an ANOVA style analysis, i.e we consider a mean effect, the effect of being in row $i$ and the effect of being in column $j$. Mathematically we can write,
$$b_{u,i} = \mu + {\bf R}_u + {\bf C}_i,$$
where 
* $\mu$ is the mean rate $$\mu= \frac{\text{np.sum}(S)}{\text{np.count_nonzero}(S)}.$$
* ${\bf R}_u$ is mean effect of being in row $u$, which is $mean(S[u,]-\mu)$, for non-zero elements. To compute ${\bf R}_u$, we first find the index of non-zero elements in $S[u,]$ as follows:
$$\text{nz_ind}=\text{np.nonzero}(S[u,:]).$$
Then the non-zero elements are 
$$\text{nz_el}=S[u,\text{nz_ind}].$$
Then the mean effect of the row $u$ is 
$${\bf R}_u=\text{np.mean}(\text{nz_el}-\mu).$$
* ${\bf C}_i$ is the effect of bein in column $i$, after the previous 2 effects removed
    $${\bf C}_i=np.mean(S[np.nonzero(S[:,i]),i]-\mu-{\bf R}_u)$$

# Content Based Recommender 

In [12]:
#use either np.nan or None for missing values
#None is a python internal type, np.nan is numpy type.
usr = pd.DataFrame(
    {
        'Movie':["Love at least","Romance forever","Cute puppies of love",
                            "Nonstope car chases","Swords vs. karate"], 
        "Alice":[5,5,None,0,0],
        "Bob":[5,None,4,0,0],
        "Carol":[0,None,0,5,5],
        "Dave":[0,0,None,4,None]
    },
    columns=["Movie","Alice","Bob","Carol","Dave"])
usr

Unnamed: 0,Movie,Alice,Bob,Carol,Dave
0,Love at least,5.0,5.0,0.0,0.0
1,Romance forever,5.0,,,0.0
2,Cute puppies of love,,4.0,0.0,
3,Nonstope car chases,0.0,0.0,5.0,4.0
4,Swords vs. karate,0.0,0.0,5.0,


Next, assume each movie has 2 features: Romance and Action. We add these two features to the table.

In [13]:
ftr = pd.DataFrame(
    {
        'Movie':["Romance forever","Love at least",
                 "Cute puppies of love",
                 "Swords vs. karate","Nonstope car chases"],
        "Romance":[.99,.9,1,0,.1], 
        "Action":[.01,0,0,1,1]
    },
    columns=["Movie","Romance","Action"]
)
ftr

Unnamed: 0,Movie,Romance,Action
0,Romance forever,0.99,0.01
1,Love at least,0.9,0.0
2,Cute puppies of love,1.0,0.0
3,Swords vs. karate,0.0,1.0
4,Nonstope car chases,0.1,1.0


### Merge usr and ftr

In [14]:
UF=pd.merge(usr,ftr,on="Movie")
UF

Unnamed: 0,Movie,Alice,Bob,Carol,Dave,Romance,Action
0,Love at least,5.0,5.0,0.0,0.0,0.9,0.0
1,Romance forever,5.0,,,0.0,0.99,0.01
2,Cute puppies of love,,4.0,0.0,,1.0,0.0
3,Nonstope car chases,0.0,0.0,5.0,4.0,0.1,1.0
4,Swords vs. karate,0.0,0.0,5.0,,0.0,1.0


### Select and filter

In [15]:
#1 select column Movie
UF.Movie
#or
#UF['Movie']

0           Love at least
1         Romance forever
2    Cute puppies of love
3     Nonstope car chases
4       Swords vs. karate
Name: Movie, dtype: object

In [16]:
#Select column 3,4,5 and rows bigger than 2
UF.ix[2:,3:6]

Unnamed: 0,Carol,Dave,Romance
2,0.0,,1.0
3,5.0,4.0,0.1
4,5.0,,0.0


In [17]:
#Problem2
#2 select movies with at least 2 rating bigger than 2
#first make a mask. This mask tells us which ratings are bigger than 2
mask=UF.drop(["Movie","Romance","Action"],axis=1)>2
mask

Unnamed: 0,Alice,Bob,Carol,Dave
0,True,True,False,False
1,True,False,False,False
2,False,True,False,False
3,False,False,True,True
4,False,False,True,False


In [18]:
#Problem2-Continued
#see U[mask] tells us which 
print(UF[mask])

  Movie  Alice  Bob  Carol  Dave  Romance  Action
0   NaN    5.0  5.0    NaN   NaN      NaN     NaN
1   NaN    5.0  NaN    NaN   NaN      NaN     NaN
2   NaN    NaN  4.0    NaN   NaN      NaN     NaN
3   NaN    NaN  NaN    5.0   4.0      NaN     NaN
4   NaN    NaN  NaN    5.0   NaN      NaN     NaN


In [19]:
#Problem2-Continued
#column exceed2 in each row, counts how many ratings >2 are there for a movie
UF['exeed2']=UF[mask].count(axis=1)
EXEED2=UF[['Movie','exeed2']]
EXEED2

Unnamed: 0,Movie,exeed2
0,Love at least,2
1,Romance forever,1
2,Cute puppies of love,1
3,Nonstope car chases,2
4,Swords vs. karate,1


In [20]:
#Problem2-Continued[Let me know if you have easier solution]
#Filter out based on the value of the column exeed:
EXEED2[EXEED2['exeed2']>1]

Unnamed: 0,Movie,exeed2
0,Love at least,2
3,Nonstope car chases,2


In [21]:
#Filter based on Romance value of the movie>.5
UF.ix[UF.Romance>.5,'Movie']

0           Love at least
1         Romance forever
2    Cute puppies of love
Name: Movie, dtype: object

### Find average romance feature of all movies

In [22]:
#Average of Romance value of movies (mean of Romance column)
rmean=UF.Romance.mean()
#print rmean
print("mean romance is:",np.round(rmean,3))

r_a_mean=UF[['Romance','Action']].mean()
print("mean romance is:",r_a_mean['Romance'],", and mean action is:",r_a_mean['Action'])
print(UF.drop('Movie', axis=1).sum())

mean romance is: 0.598
mean romance is: 0.598 , and mean action is: 0.402
Alice      10.00
Bob         9.00
Carol      10.00
Dave        4.00
Romance     2.99
Action      2.01
exeed2      7.00
dtype: float64


In [23]:
#You can take the average of all columns for which averaging make sence for them:
UF.drop(["Movie","exeed2"],axis=1).mean()

Alice      2.500000
Bob        2.250000
Carol      2.500000
Dave       1.333333
Romance    0.598000
Action     0.402000
dtype: float64

In [24]:
#find the average; it excludes None values
UF['movie_ave']=UF.ix[:,1:5].mean(axis=1)
UF[['Movie','movie_ave']]

Unnamed: 0,Movie,movie_ave
0,Love at least,2.5
1,Romance forever,2.5
2,Cute puppies of love,2.0
3,Nonstope car chases,2.25
4,Swords vs. karate,1.666667


### Apply a function to each cell in a column

In [25]:
#Capitalize the name of them movie in Moive column
#First define a function that capitalizes strings, using lambda function
f= lambda x: x.upper()
#Use apply function to apply it to each cell in Movie 
UF['Movie']=UF['Movie'].apply(f)
print(UF.Movie)

0           LOVE AT LEAST
1         ROMANCE FOREVER
2    CUTE PUPPIES OF LOVE
3     NONSTOPE CAR CHASES
4       SWORDS VS. KARATE
Name: Movie, dtype: object


## Content Based Recommender Systems

Let $n$ and $N$ denote number of the features and users respectively. For each user $j\in 1,\cdots,N$, there is a parameter $\theta^j\in R^{n+1}$ that can be used to predict user $j$'s rating for a movie with featurs $(r,a)$. Here $r$ is the romance value and $a$ is the action value of the moive. The prediction would be 
$$R=\theta^j.(1,r,a)$$
For example if a user has parametes $(1,10,1)$, then their rating for a movie with Romance and Action value $=(.2,.9)$ would be 
$$R=1+10*(.2)+1*(.9)=3.9$$

What do you think would be $\theta$ for Carol?
${\bf Answer:}$(0,0,5)$

### Singular Value Decomposition

Let $M$ denote a 2-dim array. Dimensions of $M$ are customers and days of week. Each entry is the number of transaction made by a customer at a day. 

In [26]:
M=pd.DataFrame(
{
    "customers":["ABC Inc.","DEF Ltd.","GHI Inc.","KLM Co.","Smith","Johnson","Thompson"],
    "Monday":[1,2,1,5,0,0,5],
    "Tuesday":[1,2,1,5,0,0,5],
    "Wednesday":[1,2,1,5,0,0,5],
    "Thursday":[0,0,0,0,2,3,6],
    "Friday":[0,0,0,0,2,3,6]
    },
    columns=["customers","Monday","Tuesday","Wednesday","Thursday","Friday"]
)
print(M)

  customers  Monday  Tuesday  Wednesday  Thursday  Friday
0  ABC Inc.       1        1          1         0       0
1  DEF Ltd.       2        2          2         0       0
2  GHI Inc.       1        1          1         0       0
3   KLM Co.       5        5          5         0       0
4     Smith       0        0          0         2       2
5   Johnson       0        0          0         3       3
6  Thompson       5        5          5         6       6


* We think of $M$ as a collection of points in a 5 dimensional space. 
* Although we used 5 dimensions to represent customers daily purchases, we can reduce the dimension to 2.
$${\text{dimension 1}}=[1,1,1,0,0]$$
$${\text{dimension 2}}=[0,0,0,1,1]$$
* This holds because all customers purchases is a linear combinations of these two vectors. For example
$${\text{Thompson's purchases}}=5*[1,1,1,0,0]+6*[0,0,0,1,1]$$
* This means that all points in $M$ stay in 2 dimensional subspace of this 5 dimensional space.
* In this new coordinate system We can write $M$ as the follwing:

In [27]:
#In this new coordinate system We can write  MM  as the follwing:
M=pd.DataFrame(
{
    "customers":["ABC Inc.","DEF Ltd.","GHI Inc.","KLM Co.","Smith","Johnson","Thompson"],
    "MTW":[1,2,1,5,0,0,5],
    "ThF":[0,0,0,0,2,3,6],
    },
    columns=["customers","MTW","ThF"]
)
M

Unnamed: 0,customers,MTW,ThF
0,ABC Inc.,1,0
1,DEF Ltd.,2,0
2,GHI Inc.,1,0
3,KLM Co.,5,0
4,Smith,0,2
5,Johnson,0,3
6,Thompson,5,6


Why Reduce dimension?
* Discover correlation 
* Remove Redundant and noisy features
* Interpretation and visualization 
* Easier storage

In [28]:
Z =  [[1,2,1,5,0,0,5],[1,2,1,5,0,0,5],[1,2,1,5,0,0,5],[0,0,0,0,2,3,6],[0,0,0,0,2,3,6]]# generating a random
A=np.array(Z).T
P, D, Q = np.linalg.svd(A, full_matrices=False)
print(P.shape)
print("P=",P)
print("************")
print("D=",D)
print("************")
print("Q=",Q)


(7, 5)
P= [[-0.10002492 -0.12871551  0.98342876  0.01510526 -0.07778762]
 [-0.20004985 -0.25743102 -0.12726031 -0.04717752 -0.93548442]
 [-0.10002492 -0.12871551 -0.04181106  0.95258565  0.01106288]
 [-0.50012462 -0.64357755 -0.10860567 -0.25569486  0.3062806 ]
 [-0.1031237   0.33292725  0.02659731 -0.07479479 -0.07500749]
 [-0.15468555  0.49939088  0.03989596 -0.11219218 -0.11251124]
 [-0.80949573  0.35520421 -0.02881375  0.08102769  0.08125812]]
************
D= [  1.46422028e+01   7.18372445e+00   9.74524070e-16   1.35798063e-16
   2.96551319e-17]
************
Q= [[ -4.88195074e-01  -4.88195074e-01  -4.88195074e-01  -3.77489542e-01
   -3.77489542e-01]
 [ -3.08218921e-01  -3.08218921e-01  -3.08218921e-01   5.97914413e-01
    5.97914413e-01]
 [ -8.15851169e-01   4.36035245e-01   3.79815923e-01  -3.34675269e-17
   -3.34675269e-17]
 [ -0.00000000e+00   7.95585811e-17  -5.56546944e-17   7.07106781e-01
   -7.07106781e-01]
 [  3.24582408e-02   6.90318717e-01  -7.22776958e-01  -4.69358894e-1

In [29]:
X_a = np.dot(np.dot(P, np.diag(D)), Q)
X_a

array([[  1.00000000e+00,   1.00000000e+00,   1.00000000e+00,
          7.29249778e-16,   7.26348850e-16],
       [  2.00000000e+00,   2.00000000e+00,   2.00000000e+00,
         -4.09938375e-16,  -4.00878053e-16],
       [  1.00000000e+00,   1.00000000e+00,   1.00000000e+00,
          2.65316363e-16,   8.23747070e-17],
       [  5.00000000e+00,   5.00000000e+00,   5.00000000e+00,
          1.39747095e-15,   1.44657650e-15],
       [  3.61263747e-17,   6.71115973e-17,  -6.39660317e-17,
          2.00000000e+00,   2.00000000e+00],
       [  1.22627911e-16,  -5.29388606e-17,  -2.75106992e-17,
          3.00000000e+00,   3.00000000e+00],
       [  5.00000000e+00,   5.00000000e+00,   5.00000000e+00,
          6.00000000e+00,   6.00000000e+00]])

In [30]:
print(np.std(A), np.std(X_a), np.std(A - X_a))

2.06308665126 2.06308665126 6.43108757214e-16
