In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_excel('Assignment 2.xls',)

In [3]:
feature_name = ['baseball', 'economics', 'politics', 'Europe', 'Asia', 'soccer', 'war',
       'security', 'shopping', 'family',]

# Part 1. Build and use a very basic profile

In [4]:
rating = data.iloc[:-7,].drop(['Unnamed: 10','Unnamed: 12','Unnamed: 15'],axis=1)
rating.loc[:,['User 1','User 2']].fillna(0,inplace=True)

In [5]:
user_profiles  = data.iloc[-2:,:-8]

In [6]:
for feature in feature_name:
    user_profiles.loc['User1',feature] = (rating[feature] * rating['User 1']).sum()
    user_profiles.loc['User2',feature] = (rating[feature] * rating['User 2']).sum()

In [7]:
rating['Pred1'] = np.dot(rating.loc[:,feature_name],user_profiles.loc['User1'])
rating['Pred2'] = np.dot(rating.loc[:,feature_name],user_profiles.loc['User2'])

In [8]:
rating[['Pred1','Pred2']].sort_values('Pred1',ascending=False)

Unnamed: 0,Pred1,Pred2
doc16,6.0,-4.0
doc1,4.0,-4.0
doc12,4.0,-4.0
doc9,3.0,-2.0
doc6,3.0,1.0
doc3,2.0,0.0
doc18,1.0,3.0
doc15,0.0,4.0
doc11,0.0,1.0
doc7,-1.0,2.0


In [9]:
rating[rating.Pred2<0].count()

baseball     4
economics    4
politics     4
Europe       4
Asia         4
soccer       4
war          4
security     4
shopping     4
family       4
num-attr     4
User 1       2
User 2       2
Pred1        4
Pred2        4
dtype: int64

# Part 2. Next, let’s treat all articles as having unit weight ...

In [10]:
norm_rating = rating.apply(lambda row:row[feature_name]/np.sqrt(row['num-attr']),axis=1)
norm_user_profiles = user_profiles.copy()

In [11]:
for feature in feature_name:
    norm_user_profiles.loc['User1',feature] = (norm_rating[feature] * rating['User 1']).sum()
    norm_user_profiles.loc['User2',feature] = (norm_rating[feature] * rating['User 2']).sum()
norm_rating['Pred1'] = np.dot(norm_rating.loc[:,feature_name],norm_user_profiles.loc['User1'])
norm_rating['Pred2'] = np.dot(norm_rating.loc[:,feature_name],norm_user_profiles.loc['User2'])

In [12]:
norm_rating[['Pred1','Pred2']].sort_values('Pred1',ascending=False)

Unnamed: 0,Pred1,Pred2
doc16,1.924646,-1.183064
doc6,1.370923,0.336184
doc12,1.333114,-1.227723
doc9,1.132724,-0.724476
doc1,1.009019,-0.845577
doc3,0.711105,0.016294
doc18,0.554695,1.06066
doc15,0.142229,0.949043
doc11,0.044658,0.349628
doc20,-0.081378,1.237718


# Part 3. Finally, let’s consider how common different terms are among our documents …

In [13]:
df = data.iloc[-6:-5,:10]

In [14]:
idf = 1/df

In [15]:
tfidf_rating = norm_rating[feature_name].copy()
tfidf_user_profiles = user_profiles.copy()

In [16]:
for feature in feature_name:
    tfidf_user_profiles.loc['User1',feature] = (tfidf_rating[feature] * rating['User 1']).sum()
    tfidf_user_profiles.loc['User2',feature] = (tfidf_rating[feature] * rating['User 2']).sum()

In [17]:
tfidf_rating['Pred1'] = np.dot(tfidf_rating.loc[:,feature_name],(tfidf_user_profiles.loc['User1']*idf).T)
tfidf_rating['Pred2'] = np.dot(tfidf_rating.loc[:,feature_name],(tfidf_user_profiles.loc['User2']*idf).T)

In [18]:
tfidf_rating.loc['doc9']

baseball     0.000000
economics    0.000000
politics     0.000000
Europe       0.000000
Asia         0.000000
soccer       0.707107
war          0.000000
security     0.000000
shopping     0.707107
family       0.000000
Pred1        0.179067
Pred2       -0.120746
Name: doc9, dtype: float64

In [19]:
idf

Unnamed: 0,baseball,economics,politics,Europe,Asia,soccer,war,security,shopping,family
DF,0.25,0.166667,0.1,0.090909,0.166667,0.166667,0.142857,0.166667,0.142857,0.2
