# Import libraries

In [1]:
import numpy as np
import pandas as pd
import pickle
from datetime import datetime
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import calendar

# Materialise calendar.month_name as a plain list.
# Index 0 is an empty string, so real months sit at positions 1..12.
month_names = [name for name in calendar.month_name]

In [2]:
# Show the twelve month names; the slice skips the empty entry at index 0
month_names[1:]

['January',
 'February',
 'March',
 'April',
 'May',
 'June',
 'July',
 'August',
 'September',
 'October',
 'November',
 'December']

# Helper function

In [3]:
def extract_date(text):
    """Return the four-digit release year from a title's last parenthesised group.

    The year is read from the final ``(...)`` group in ``text``, e.g.
    ``"Toy Story (1995)"`` or ``"Death Note (2006–2007)"``; only the first
    four digits of that group are returned.

    Parameters
    ----------
    text : str
        Movie title, expected to end with ``"(YYYY)"`` (trailing text without
        parentheses, such as whitespace, is tolerated).

    Returns
    -------
    str or float
        The year as a four-character string, or ``np.nan`` when ``text`` is
        not a string, has no trailing parenthesised group, or that group does
        not start with four digits.
    """
    if not isinstance(text, str):
        # Missing titles arrive as NaN floats; the regex call would raise TypeError on them.
        return np.nan

    # Anchor on the final "(...)" and require it to open with four ASCII digits,
    # so suffixes such as "(a.k.a. ...)" are not mistaken for a year.
    match = re.search(r'\(([0-9]{4})[^)]*\)[^()]*$', text)
    return match.group(1) if match else np.nan

# Load data

In [4]:
# Read the movie catalogue and the ratings table, then join them on movieId
# so every rating row also carries its movie's columns.
movies_df = pd.read_csv("MovieLenDataset/movies.csv")
ratings_df = pd.read_csv("MovieLenDataset/ratings.csv")
all_data = movies_df.merge(ratings_df, on="movieId")

# Get a DataFrame of predicted ratings, sorted best-first, for a given 'userId'

In [18]:
 # get one hot encoding for genres and movieId
# Split movie ids into those user 86 already rated and those still unrated.
rated_mask = all_data["userId"] == 86
items_our_user_rated = all_data.loc[rated_mask, "movieId"].unique().tolist()
unrated_mask = ~all_data["movieId"].isin(items_our_user_rated)
items_our_user_can_rate = all_data.loc[unrated_mask, "movieId"].unique().tolist()

# Keep only movies that appear in the ratings, with their id and genre string.
known_movie_ids = all_data["movieId"].unique().tolist()
all_moves_data = movies_df.loc[movies_df["movieId"].isin(known_movie_ids)].filter(items=["movieId", "genres"])

# Turn each "A|B|C" genre string into a list of genre names.
all_moves_data["genres"] = all_moves_data["genres"].apply(lambda g: str(g).split(sep='|')).values

# One indicator column per genre: expand the lists to long form, dummy-encode,
# then add the indicators back up per movie row.
genre_long = all_moves_data["genres"].apply(pd.Series).stack()
dummies_genres = pd.get_dummies(genre_long).groupby(level=0).sum()

# One indicator column per movie id.
dummies_movie = pd.get_dummies(all_moves_data["movieId"], prefix="movieId")

# Join the indicator blocks, keep the rows for unrated movies, drop the raw columns.
all_moves_data = pd.concat([all_moves_data, dummies_genres, dummies_movie], axis=1)
all_moves_data = all_moves_data.loc[all_moves_data["movieId"].isin(items_our_user_can_rate)]
all_moves_data = all_moves_data.drop(columns=["genres", "movieId"])
    

In [24]:
# Preview the first 20 columns of the encoded movie features
all_moves_data.iloc[:,:20]

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,movieId_1
1,0,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0,1,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9738,0,0,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9739,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9740,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
def prepare_data(userID):
    """Build the model-input feature matrix for every movie ``userID`` has not rated.

    One row is produced per unrated movie, in ``movies_df`` row order.
    Columns, left to right:

    * ``transaction_from_movie_year`` - current year minus the release year
      parsed from the title (NaN when no year can be parsed);
    * ``lag_rate1`` .. ``lag_rate3`` - the user's three most recent ratings by
      ``timestamp``, newest first, repeated on every row;
    * one indicator column per genre;
    * ``transaction_month_<Month>`` - indicator of the current month;
    * ``userId_<id>`` - indicator of the user;
    * ``movieId_<id>`` - indicator of the movie.

    Relies on the notebook-level ``all_data``, ``movies_df``, ``month_names``
    and ``extract_date``. Also prints the shapes of the intermediate feature
    blocks.

    Parameters
    ----------
    userID : int
        Identifier of a user present in ``all_data``.

    Returns
    -------
    pandas.DataFrame
        ``float64`` frame with a default 0..n-1 index.

    Raises
    ------
    IndexError
        If the user has fewer than three ratings, so the three lag features
        cannot be formed.
    """
    now = datetime.now()

    # get id for movies user can rate
    user_ratings = all_data[all_data["userId"] == userID]
    items_our_user_rated = user_ratings["movieId"].unique().tolist()
    items_our_user_can_rate = all_data[~all_data["movieId"].isin(items_our_user_rated)]["movieId"].unique().tolist()
    n_rows = len(items_our_user_can_rate)

    # get three lag rate (array is ordered oldest -> newest of the last three)
    three_lag_rate = user_ratings.sort_values("timestamp").tail(3)["rating"].values
    if len(three_lag_rate) < 3:
        # Fail early with a readable message instead of an out-of-bounds index below.
        raise IndexError(
            f"user {userID} has {len(three_lag_rate)} rating(s); at least 3 are needed for the lag features"
        )

    # get transaction time from movie rate: years elapsed since the release year.
    # to_numeric(..., errors="coerce") turns unparsable years into NaN instead of raising.
    candidate_titles = movies_df[movies_df["movieId"].isin(items_our_user_can_rate)]["title"]
    release_year = pd.to_numeric(candidate_titles.apply(extract_date), errors="coerce")
    user_dataFrame = pd.DataFrame({
        "transaction_from_movie_year": now.year - release_year.values,
        "lag_rate1": three_lag_rate[2],
        "lag_rate2": three_lag_rate[1],
        "lag_rate3": three_lag_rate[0],
    })

    # get one hot encoding for genres and movieId
    rated_movies = movies_df[movies_df["movieId"].isin(all_data["movieId"].unique())]
    # "A|B|C" -> one indicator column per genre name, columns sorted by name
    dummies_genres = rated_movies["genres"].astype(str).str.get_dummies(sep="|")
    dummies_movie = pd.get_dummies(rated_movies["movieId"], prefix="movieId")
    can_rate_mask = rated_movies["movieId"].isin(items_our_user_can_rate)
    all_moves_data = (
        pd.concat([dummies_genres, dummies_movie], axis=1)
        .loc[can_rate_mask]
        .reset_index(drop=True)
    )
    # Width of the genre block, so the split below does not depend on a hard-coded count.
    n_genre_cols = dummies_genres.shape[1]

    # get one hot encoding for user id
    user_ids = all_data["userId"].unique()
    all_user_data = pd.DataFrame(
        data=np.zeros((n_rows, len(user_ids))),
        columns=["userId_" + str(x) for x in user_ids],
    )
    all_user_data["userId_" + str(userID)] = 1

    # get one hot encoding for month (kept numeric; no string round-trip)
    all_month_data = pd.DataFrame(
        data=np.zeros((n_rows, 12)),
        columns=["transaction_month_" + str(x) for x in month_names[1:]],
    )
    all_month_data["transaction_month_" + now.strftime("%B")] = 1

    # concate all data: every part already carries a 0..n-1 index, so rows line up by position
    final_data = pd.concat(
        [
            user_dataFrame,
            all_moves_data.iloc[:, :n_genre_cols],
            all_month_data,
            all_user_data,
            all_moves_data.iloc[:, n_genre_cols:],
        ],
        axis=1,
    )
    print(user_dataFrame.shape, all_moves_data.shape, all_user_data.shape)

    return final_data.astype("float64")


In [32]:
# Build and display the feature matrix for user 86
prepare_data(86)

(9654, 4) (9654, 9744) (9654, 610)


Unnamed: 0,transaction_from_movie_year,lag_rate1,lag_rate2,lag_rate3,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,movieId_193565,movieId_193567,movieId_193571,movieId_193573,movieId_193579,movieId_193581,movieId_193583,movieId_193585,movieId_193587,movieId_193609
0,28.0,4.0,3.5,4.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,28.0,4.0,3.5,4.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,28.0,4.0,3.5,4.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28.0,4.0,3.5,4.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28.0,4.0,3.5,4.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9649,6.0,4.0,3.5,4.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9650,6.0,4.0,3.5,4.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9651,6.0,4.0,3.5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9652,5.0,4.0,3.5,4.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [41]:
def predict(userID):
    """Predict ratings for every movie ``userID`` has not rated, best first.

    Builds the feature matrix with ``prepare_data``, scores it with the
    pickled model stored at ``models/xgboost_model.pkl`` and pairs each score
    with its movie id.

    Parameters
    ----------
    userID : int
        Identifier of a user present in ``all_data``.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId`` and ``rates`` (the predicted rating), sorted by
        ``rates`` in descending order.
    """
    items_our_user_rated = all_data[all_data.userId == userID].movieId.unique().tolist()
    items_our_user_can_rate = all_data[~all_data.movieId.isin(items_our_user_rated)]["movieId"].unique().tolist()

    # prepare_data emits its rows in movies_df order, so read the ids in that
    # same order rather than relying on how all_data happens to be sorted.
    candidate_movie_ids = movies_df[movies_df["movieId"].isin(items_our_user_can_rate)]["movieId"].tolist()

    # prepare data
    user_data = prepare_data(userID)

    # predict from model
    # NOTE: pickle.load can execute arbitrary code; only load model files from a trusted source.
    # XGBoost also warns when a pickle written by an older version is loaded; re-exporting the
    # model with Booster.save_model avoids that warning.
    with open('models/xgboost_model.pkl', 'rb') as f:
        model = pickle.load(f)

    rate_pred = model.predict(np.array(user_data))

    # construct predicted dataframe (the constructor raises if ids and scores differ in length)
    predict_dataframe = pd.DataFrame({"movieId": candidate_movie_ids, "rates": rate_pred})

    return predict_dataframe.sort_values("rates", ascending=False)
    

In [42]:
# Display the predicted ratings for user 86, highest first
predict(86)

(9654, 4) (9654, 9744) (9654, 610)
  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.



Unnamed: 0,movieId,rates
700,932,4.254798
6759,61352,4.195102
4318,6408,4.179895
3855,5490,4.078780
4979,7884,4.064563
...,...,...
3546,4928,2.912129
4380,6554,2.853675
6035,42730,2.837464
3139,4291,2.676944
