In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import date
from dateutil.relativedelta import relativedelta

In [4]:
# Lift pandas' display limits so frames are shown in full (None = no limit).
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [30]:
from surprise import Dataset
from surprise import Reader

In [31]:
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

In [43]:
# Raw access-log extract. The location lives in one named constant so it is
# easy to spot and change when the notebook runs on another machine.
LOG_CSV_PATH = '/Users/sri.t/Desktop/nsas_logs.csv'
df_all = pd.read_csv(LOG_CSV_PATH, sep=',')

In [None]:
df_all.shape

In [None]:
df_all.head()

In [5]:
## List the names of all NumPy arrays currently held in the notebook namespace.
## Swap np.ndarray for pd.core.frame.DataFrame, defaultdict or list to look
## for objects of those types instead.
alldfs = [name for name in sorted(globals()) if isinstance(globals()[name], np.ndarray)]
print(alldfs)

[]


### Start with recommendation

In [116]:
# Do this exercise only for pages that were served successfully.
## Limit the analysis to pages with a high number of hits.
## Open question: is there any user CRM information available (e.g. country)?
##
# Use low-rank matrix factorization (NMF or SVD); it yields latent features.

In [140]:
#reco_x=pd.crosstab(df_all[df_all['dates_unique']>1]['index_y'],df_all[df_all['dates_unique']>1]['index_x'])

In [6]:
# Keep only requests answered with 200, 302 or 304, then count the log rows
# (non-null 'bytes' entries) per (index_y, index_x, dates_unique) combination.
served_ok = df_all['response'].isin([200, 302, 304])
reco_x = (
    df_all[served_ok]
    .groupby(['index_y', 'index_x', 'dates_unique'])
    .agg({'bytes': 'count'})
    .reset_index()
)

In [7]:
reco_x.shape

(2130521, 4)

In [8]:
# Rename the aggregated columns positionally:
#   index_y -> host, index_x -> url, dates_unique -> visits,
#   count of 'bytes' -> no_of_views.
reco_x.columns=['host','url','visits','no_of_views']

In [9]:
reco_x.head()

Unnamed: 0,host,url,visits,no_of_views
0,U1,P1,2,1
1,U1,P10,2,2
2,U1,P11,2,1
3,U1,P12,2,2
4,U1,P13,2,2


In [10]:
# Total views each URL received across all hosts, broadcast back onto every
# row of that URL. The string alias 'sum' is used instead of the builtin
# `sum`, which recent pandas releases deprecate as a groupby callable.
reco_x['total_views'] = reco_x.groupby('url')['no_of_views'].transform('sum')

In [11]:
reco_x=reco_x.sort_values(by='total_views',ascending=False)

In [12]:
reco_x.head()

Unnamed: 0,host,url,visits,no_of_views,total_views
1176639,U43696,P5,1,1,208798
1691056,U72938,P5,2,1,208798
1054071,U38281,P5,4,2,208798
1691011,U72935,P5,1,1,208798
1054061,U38280,P5,1,1,208798


In [69]:
#reco_x.describe()

In [13]:
# Keep only URLs with more than 10 total views. `.copy()` makes the result an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning or depend on whether pandas returned a view.
reco_x_new = reco_x[reco_x['total_views'] > 10].copy()

In [14]:
# Filtered frame: shape, distinct hosts, distinct URLs, total view count.
print(reco_x_new.shape)
print(reco_x_new['host'].nunique(dropna=False))
print(reco_x_new['url'].nunique(dropna=False))
print(reco_x_new['no_of_views'].sum())

(2081791, 5)
137463
4524
3390334


In [15]:
# Unfiltered frame: shape, distinct hosts, distinct URLs, total view count.
print(reco_x.shape)
print(reco_x['host'].nunique(dropna=False))
print(reco_x['url'].nunique(dropna=False))
print(reco_x['no_of_views'].sum())

(2130521, 5)
137524
27487
3440364


In [19]:
reco_x_new.max()

host           U99999
url             P9996
visits             58
no_of_views      2549
total_views    208798
dtype: object

In [20]:
reco_x_new[reco_x_new['no_of_views']>100].shape
# Only about 1,100 of the roughly 2.08M (host, url) rows have more than 100 views,
# so view counts are clipped at 100 to keep outliers from skewing the normalization.

(1105, 5)

In [21]:
# Cap view counts at 100 with a vectorized clip instead of a Python-level loop
# over every row; values at or below 100 are left unchanged.
reco_x_new['no_of_views_1'] = reco_x_new['no_of_views'].clip(upper=100)

In [22]:
reco_x_new.max()

host             U99999
url               P9996
visits               58
no_of_views        2549
total_views      208798
no_of_views_1       100
dtype: object

In [17]:
from sklearn.preprocessing import Normalizer, MinMaxScaler
# NOTE(review): `normal` is not referenced anywhere else in the visible notebook.
normal=Normalizer()
# Scaler mapping its input onto the 1-5 range, matching the rating scale given to Surprise.
MinMax=MinMaxScaler(feature_range=(1, 5))

In [27]:
# Rescale the clipped view counts onto the scaler's output range and store
# the result as the rating column.
clipped_views = reco_x_new[['no_of_views_1']]
reco_x_new['#_views'] = MinMax.fit_transform(clipped_views)

In [24]:
# Round every numeric column to zero decimals; this is what turns the scaled
# '#_views' values into whole-number ratings.
# NOTE(review): after rounding almost all ratings equal 1.0 (see the value
# counts shown later), so the rating signal is very weak.
reco_x_new=reco_x_new.round(decimals=0)

In [28]:
reco_x_new.head()

Unnamed: 0,host,url,visits,no_of_views,total_views,no_of_views_1,#_views
1176639,U43696,P5,1,1,208798,1,1.0
1691056,U72938,P5,2,1,208798,1,1.0
1054071,U38281,P5,4,2,208798,2,1.040404
1691011,U72935,P5,1,1,208798,1,1.0
1054061,U38280,P5,1,1,208798,1,1.0


In [29]:
reco_x_new.max()

host             U99999
url               P9996
visits               58
no_of_views        2549
total_views      208798
no_of_views_1       100
#_views               5
dtype: object

In [56]:
reco_x_new['#_views'].value_counts()

1.0    2067926
2.0       9737
3.0       1951
5.0       1321
4.0        856
Name: #_views, dtype: int64

### Building models

In [33]:
# Surprise needs the rating bounds plus a frame whose columns are, in order,
# user, item and rating.
reader = Reader(rating_scale=(1, 5))
rating_columns = ['host', 'url', '#_views']
data = Dataset.load_from_df(reco_x_new[rating_columns], reader)

In [34]:
# Build the training set used for the final model fit from the whole dataset
# (no rows are held out here).
trainset = data.build_full_trainset()

In [62]:
# Tuning pass 1: cross-validate SVD (3 folds, RMSE and MAE) over the number of
# epochs, the learning rate and the regularisation strength.
param_grid = dict(
    n_epochs=[5, 10],
    lr_all=[0.002, 0.005],
    reg_all=[0.4, 0.6],
)
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Best RMSE score, followed by the parameter combination that produced it.
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.14214190377784416
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [38]:
# Tuning pass 2: with epochs and regularisation fixed, cross-validate SVD
# (3 folds, RMSE and MAE) over the number of latent factors.
param_grid = dict(
    n_factors=[20, 50, 100],
    n_epochs=[10],
    reg_all=[0.4],
)
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# Best RMSE score, followed by the parameter combination that produced it.
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.13959063645422407
{'n_factors': 20, 'n_epochs': 10, 'reg_all': 0.4}


In [39]:
# Take the estimator configured with the best-RMSE parameters from the most
# recent grid search (`gs`) and train it on the full training set.
algo = gs.best_estimator['rmse']
# Manual alternative with explicit hyper-parameters, kept for reference:
#algo=SVD(n_factors=50,n_epochs=10,lr_all=0.005,reg_all=0.4,verbose=True)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x106685470>

In [None]:
# Then predict ratings for all pairs (u, i) that are NOT in the training set.
# NOTE(review): with the ~137k hosts and ~4.5k URLs printed earlier, that is
# roughly 620M (host, url) pairs -- expect very high memory use and run time.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

In [1]:
from collections import defaultdict
import json

In [None]:
def get_top_n(predictions, n=10):
    """Return the ``n`` highest-estimated items for every user.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        Each entry unpacks as ``(uid, iid, true_r, est, details)``; only the
        user id, item id and estimated rating are used.
    n : int, default 10
        Maximum number of items kept per user.

    Returns
    -------
    collections.defaultdict
        Maps each user id to a list of ``(iid, est)`` tuples ordered by
        descending estimate. Items with equal estimates keep the order in
        which they appeared in ``predictions``.
    """
    import heapq

    # Group every (item, estimate) pair under its user.
    candidates = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        candidates[uid].append((iid, est))

    # heapq.nlargest selects the best n without fully sorting each user's
    # (potentially very long) candidate list, and gives the same ordering as
    # sorted(..., reverse=True)[:n]. Results go into a fresh mapping so the
    # dict being iterated is never rebound mid-loop.
    top_n = defaultdict(list)
    for uid, user_ratings in candidates.items():
        top_n[uid] = heapq.nlargest(n, user_ratings, key=lambda pair: pair[1])

    return top_n

In [None]:
top_n = get_top_n(predictions, n=10)

In [None]:
# Persist the recommendations as JSON text (the (item, estimate) tuples are
# serialized as two-element lists).
with open('recommendations_nasa.txt', 'w') as out_handle:
    json.dump(top_n, out_handle)

In [3]:
# Reload the saved recommendations from the JSON text file.
with open('recommendations_nasa.txt', 'r') as in_handle:
    top_n = json.loads(in_handle.read())

In [4]:
from itertools import islice

def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list."""
    head = islice(iterable, n)
    return [item for item in head]

In [5]:
n_items = take(2, top_n.items())

In [6]:
n_items

[('U5688',
  [['P1556', 1.16715250305762],
   ['P1554', 1.1281651939619242],
   ['P1549', 1.117106160644256],
   ['P1553', 1.1139117805615035],
   ['P1552', 1.1088558576501535],
   ['P1558', 1.0969182603915852],
   ['P1555', 1.0954908957241392],
   ['P1550', 1.0766938671444497],
   ['P1561', 1.0661092988864755],
   ['P1557', 1.0540778186385955]]),
 ('U19908',
  [['P1551', 1.2596616114824082],
   ['P1557', 1.2335690236378372],
   ['P1558', 1.2056471391825472],
   ['P1555', 1.2034841351427785],
   ['P1549', 1.1896620980280548],
   ['P1552', 1.1498917053705302],
   ['P1556', 1.1495848159047317],
   ['P1550', 1.1476690766753515],
   ['P1554', 1.133208497412244],
   ['P1553', 1.122907021974394]])]

In [265]:
reco_x_new[reco_x_new['host']=='U5688']

Unnamed: 0,host,url,visits,no_of_views,total_views,#_views,no_of_views_1
1443619,U5688,P5,1,1,208798,1.0,1
1443617,U5688,P3,1,1,127916,1.0,1
1443618,U5688,P4,1,1,127082,1.0,1
1443616,U5688,P2,1,1,121580,1.0,1
1443615,U5688,P1,1,1,83918,1.0,1
1443620,U5688,P834,1,1,3719,1.0,1


#### Alternative approach: truncated SVD with `scipy.sparse.linalg.svds`

In [238]:
# Host x URL matrix of ratings; (host, url) pairs with no recorded rating
# are filled with 0.
# NOTE(review): with ~137k hosts and ~4.5k URLs (counts printed earlier) this
# frame holds roughly 620M cells -- check available memory before running.
R_df = reco_x_new.pivot(index = 'host', columns ='url', values = '#_views').fillna(0)
R_df.head()

In [239]:
# `DataFrame.as_matrix()` was removed in pandas 1.0; `to_numpy()` is the
# supported way to get the underlying array.
R = R_df.to_numpy()
# Mean rating per host (one value per row), used to centre each row.
user_ratings_mean = np.mean(R, axis=1)
# Subtract each host's mean from its row; reshape(-1, 1) makes the means a
# column vector so the subtraction broadcasts across the URL columns.
R_norm = R - user_ratings_mean.reshape(-1, 1)

In [None]:
from scipy.sparse.linalg import svds
# Rank-50 truncated SVD of the mean-centred rating matrix. The original cell
# referenced the undefined name `R_norm_norm`; the matrix is `R_norm`.
U, sigma, Vt = svds(R_norm, k=50)

In [None]:
# Turn the 1-D vector of singular values into a diagonal matrix so it can be
# used in the matrix products that rebuild the ratings.
# NOTE(review): not idempotent -- running this cell a second time applies
# np.diag to the 2-D matrix, which extracts the diagonal back into a vector.
sigma = np.diag(sigma)

In [None]:
# Rebuild the low-rank rating matrix and add each host's mean rating back
# (the means are shaped as a column so they broadcast across the URLs).
row_means = user_ratings_mean.reshape(-1, 1)
all_user_predicted_ratings = (U @ sigma @ Vt) + row_means

In [None]:
# Label the rows with the host IDs and the columns with the URLs of the pivot
# the predictions were derived from, so a host's predictions can be looked up
# by ID (`preds_df.loc[host]`) as well as by position.
preds_df = pd.DataFrame(all_user_predicted_ratings, index=R_df.index, columns=R_df.columns)

In [None]:
def recommend_movies(predictions_df, userID, original_ratings_df, num_recommendations=5):
    """Recommend URLs a host has not viewed yet, ranked by predicted rating.

    The name and signature come from the movie-ratings recipe this was adapted
    from; in this notebook the "users" are hosts and the "movies" are URLs.

    Parameters
    ----------
    predictions_df : pandas.DataFrame
        Predicted ratings with one row per host and one column per URL. Rows
        may be labelled with host IDs. If they are not, the row order is
        assumed to follow the sorted unique hosts of ``original_ratings_df``
        (the order a pivot on 'host' produces) -- NOTE(review): confirm both
        frames were built from the same data.
    userID : str
        Host identifier, e.g. ``'U837'``.
    original_ratings_df : pandas.DataFrame
        Observed interactions with at least 'host', 'url' and '#_views' columns.
    num_recommendations : int, default 5
        Maximum number of URLs to recommend.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``user_full``: the host's observed rows, highest '#_views' first.
        ``recommendations``: columns 'url' and 'Predictions' for the unseen
        URLs with the highest predicted ratings.

    Raises
    ------
    KeyError
        If ``userID`` is neither a row label of ``predictions_df`` nor a host
        present in ``original_ratings_df``.
    """
    # Everything this host has already viewed, strongest interaction first.
    user_full = (
        original_ratings_df[original_ratings_df['host'] == userID]
        .sort_values('#_views', ascending=False)
    )

    # Locate the host's prediction row: by label when available, otherwise by
    # its position among the sorted hosts.
    if userID in predictions_df.index:
        user_predictions = predictions_df.loc[userID]
    else:
        host_order = pd.Index(original_ratings_df['host'].unique()).sort_values()
        if userID not in host_order:
            raise KeyError('unknown host: {0!r}'.format(userID))
        user_predictions = predictions_df.iloc[host_order.get_loc(userID)]

    print('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print('Recommending the highest {0} predicted ratings movies not already rated.'.format(num_recommendations))

    # Drop the URLs already seen, then keep the best predicted remainder.
    recommendations = (
        user_predictions
        .drop(labels=user_full['url'].unique(), errors='ignore')
        .sort_values(ascending=False)
        .head(num_recommendations)
        .rename('Predictions')
        .rename_axis('url')
        .reset_index()
    )

    return user_full, recommendations

# Host IDs are strings such as 'U837'; the bare name U837 would raise NameError.
already_rated, predictions_new = recommend_movies(preds_df, 'U837', reco_x_new, 10)