# import libraries

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
def calc_RMSPE(y_true, y_pred):
    pow2 = ((y_true - y_pred) / y_true)**2
    return np.sqrt(np.mean(pow2))

# load data

In [3]:
movies_df = pd.read_csv("/kaggle/input/datarecommender/movies_len.csv")
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
ratings_df = pd.read_csv("/kaggle/input/datarecommender/ratings.csv")
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


# Data cleaning and prepration

In [5]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


### extract date from tittle

In [6]:
def extract_date(text):
    pattern = r'\(([^)]*)\)[^()]*$'
    matches = re.findall(pattern, text)
    if len(matches):
        return matches[0][:4]
    else:
        return np.nan

In [7]:
movies_df["movie_date"] = movies_df["title"].apply(extract_date)
movies_df["movie_date"] = pd.to_datetime(movies_df["movie_date"]).dt.year

In [8]:
movies_df["movie_date"].dtype

dtype('float64')

In [9]:
movies_df["genres"] = movies_df["genres"].apply(lambda x : str(x).split(sep='|'))

### merge two data frame and convert timestamp to datatime

In [10]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [11]:
final_data = pd.merge(movies_df, ratings_df, on="movieId")

In [12]:
final_data.columns

Index(['movieId', 'title', 'genres', 'movie_date', 'userId', 'rating',
       'timestamp'],
      dtype='object')

In [13]:
final_data["timestamp"] = pd.to_datetime(final_data['timestamp'], unit='s')

In [14]:
final_data["transaction_month"] = final_data["timestamp"].dt.month_name()

In [15]:
final_data.head()

Unnamed: 0,movieId,title,genres,movie_date,userId,rating,timestamp,transaction_month
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,1,4.0,2000-07-30 18:45:03,July
1,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,5,4.0,1996-11-08 06:36:02,November
2,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,7,4.5,2005-01-25 06:52:26,January
3,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,15,2.5,2017-11-13 12:59:30,November
4,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,17,4.5,2011-05-18 05:28:03,May


In [16]:
final_data["movie_date"].min()

1902.0

In [17]:
final_data["movie_date"].max()

2018.0

In [18]:
bins = list(range(1900,2030,10))
labels = [str(bin) + "s" for bin in bins]

In [19]:
# Use pd.cut() to bin the years column
final_data['movie_date_bin'] = pd.cut(final_data["movie_date"], bins,labels=labels[:-1])

In [20]:
final_data.head()

Unnamed: 0,movieId,title,genres,movie_date,userId,rating,timestamp,transaction_month,movie_date_bin
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,1,4.0,2000-07-30 18:45:03,July,1990s
1,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,5,4.0,1996-11-08 06:36:02,November,1990s
2,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,7,4.5,2005-01-25 06:52:26,January,1990s
3,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,15,2.5,2017-11-13 12:59:30,November,1990s
4,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,17,4.5,2011-05-18 05:28:03,May,1990s


### create cloumn represent order of transaction from time stamp

In [22]:
final_data['transaction_from_movie_year'] = final_data["timestamp"].dt.year - final_data["movie_date"]

In [23]:
final_data.head()

Unnamed: 0,movieId,title,genres,movie_date,userId,rating,timestamp,transaction_month,movie_date_bin,transaction_from_movie_year
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,1,4.0,2000-07-30 18:45:03,July,1990s,5.0
1,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,5,4.0,1996-11-08 06:36:02,November,1990s,1.0
2,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,7,4.5,2005-01-25 06:52:26,January,1990s,10.0
3,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,15,2.5,2017-11-13 12:59:30,November,1990s,22.0
4,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,17,4.5,2011-05-18 05:28:03,May,1990s,16.0


# get last transaction movie history dataframe

In [24]:
# Sort the dataframe by timestamp in ascending order
df_copy_sorted = final_data.sort_values(by='timestamp', ascending=True)

# Group the dataframe by user ID and shift the 'movieId' column by one row
def shift_movie_id(group):
    group['prev_movieId'] = group['movieId'].shift(1)
    return group

df_copy_sorted = df_copy_sorted.groupby('userId').apply(shift_movie_id)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_copy_sorted = df_copy_sorted.groupby('userId').apply(shift_movie_id)


In [25]:
df_copy_sorted.head()

Unnamed: 0,movieId,title,genres,movie_date,userId,rating,timestamp,transaction_month,movie_date_bin,transaction_from_movie_year,prev_movieId
15993,590,Dances with Wolves (1990),"[Adventure, Drama, Western]",1990.0,429,5.0,1996-03-29 18:36:55,March,1980s,6.0,
5936,222,Circle of Friends (1995),"[Drama, Romance]",1995.0,429,4.0,1996-03-29 18:36:55,March,1990s,1.0,590.0
12093,434,Cliffhanger (1993),"[Action, Adventure, Thriller]",1993.0,429,4.0,1996-03-29 18:36:55,March,1990s,3.0,222.0
16167,592,Batman (1989),"[Action, Crime, Thriller]",1989.0,429,5.0,1996-03-29 18:36:55,March,1980s,7.0,434.0
6119,225,Disclosure (1994),"[Drama, Thriller]",1994.0,429,4.0,1996-03-29 18:36:55,March,1990s,2.0,592.0


# Data for XGboost

In [26]:
data = df_copy_sorted.copy()

In [27]:
data.index = data["timestamp"]
data = data.drop(columns=["timestamp"])
final_data = data.filter(items=["userId","movieId","genres","transaction_month","transaction_from_movie_year","rating"])

In [28]:
final_data["lag_rate1"] = final_data["rating"].shift(periods=1, fill_value=0)
final_data["lag_rate2"] = final_data["rating"].shift(periods=2, fill_value=0)
final_data["lag_rate3"] = final_data["rating"].shift(periods=3, fill_value=0)

In [29]:
# Use pd.get_dummies() to convert the column of lists into dummy variables
dummies_genres = pd.get_dummies(final_data['genres'].apply(pd.Series).stack()).groupby(level=0).sum()
dummies_the_rest = pd.get_dummies(final_data[["transaction_month","userId","movieId"]].astype(str))

In [30]:
cats_ohe = pd.concat([dummies_genres, dummies_the_rest],axis=1)

In [31]:
final_data = pd.concat([final_data,cats_ohe], axis=1)
final_data = final_data.drop(columns=["genres","transaction_month","userId","movieId"])

In [32]:
final_data

Unnamed: 0_level_0,transaction_from_movie_year,rating,lag_rate1,lag_rate2,lag_rate3,(no genres listed),Action,Adventure,Animation,Children,...,movieId_99750,movieId_99764,movieId_998,movieId_99813,movieId_99846,movieId_99853,movieId_999,movieId_99910,movieId_99917,movieId_99992
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1996-03-29 18:36:55,6.0,5.0,0.0,0.0,0.0,0,6,6,2,4,...,0,0,0,0,0,0,0,0,0,0
1996-03-29 18:36:55,1.0,4.0,5.0,0.0,0.0,0,6,6,2,4,...,0,0,0,0,0,0,0,0,0,0
1996-03-29 18:36:55,3.0,4.0,4.0,5.0,0.0,0,6,6,2,4,...,0,0,0,0,0,0,0,0,0,0
1996-03-29 18:36:55,7.0,5.0,4.0,4.0,5.0,0,6,6,2,4,...,0,0,0,0,0,0,0,0,0,0
1996-03-29 18:36:55,2.0,4.0,5.0,4.0,4.0,0,6,6,2,4,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-09-23 03:55:27,0.0,2.5,4.0,4.0,4.0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2018-09-23 03:55:46,0.0,3.0,2.5,4.0,4.0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2018-09-24 02:44:00,41.0,2.5,3.0,2.5,4.0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2018-09-24 02:44:19,38.0,1.5,2.5,3.0,2.5,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
final_data.index = final_data.index.sort_values()
final_data = final_data[:50000]
train = final_data.iloc[:int(final_data.shape[0]*0.8)]
y_train = train["rating"]
x_train = train.drop(columns=["rating"])
test =  final_data.iloc[int(final_data.shape[0]*0.8):]
y_test = test["rating"]
x_test = test.drop(columns=["rating"])

In [36]:
x_train.shape, x_test.shape

((40000, 10370), (10000, 10370))

In [37]:
import xgboost as xgb

In [38]:
model = xgb.XGBRegressor(random_state=42, n_jobs=-1, max_depth=10, n_estimators=10)
model.fit(x_train, y_train)

In [40]:
RMSPE_eval = calc_RMSPE(model.predict(x_test), y_test)
print(f"The value of RMSPE for test data {RMSPE_eval}")

The value of RMSPE for test data 0.26890357879314736


In [59]:
model.predict(x_test[:1])

array([2.5727823], dtype=float32)

In [60]:
y_test[:1]

timestamp
2005-05-31 02:02:56    3.0
Name: rating, dtype: float64

In [61]:
import pickle

# Save the model to a file using pickle
with open('xgboost_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Predict

In [62]:
with open('xgboost_model.pkl', 'rb') as f:
    model = pickle.load(f)
    
model.predict(x_test[:1])

array([2.5727823], dtype=float32)