In [256]:
import os
import zipfile
import pandas as pd
from datetime import datetime
from dateutil import parser
import numpy as np
from feature_engine.creation import CyclicalFeatures
import re
import spacy 
from sklearn.preprocessing import OrdinalEncoder

In [257]:
data_path = os.path.join('../data/raw')
with zipfile.ZipFile(os.path.join(data_path,'ml-100k.zip'), 'r') as zip_ref:
    zip_ref.extractall(data_path)
data_path = os.path.join('../data/raw/ml-100k')

## Explore and preprocess u.data

In [258]:
df = pd.read_csv(os.path.join(data_path, "u.data"), sep="\t", header=None)
df.columns = ["user_id", "item_id", "rating", "timestamp"]
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [259]:
df["timestamp"] = df["timestamp"].apply(lambda x: datetime.utcfromtimestamp(x).strftime('%d-%b-%Y'))
df

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,04-Dec-1997
1,186,302,3,04-Apr-1998
2,22,377,1,07-Nov-1997
3,244,51,2,27-Nov-1997
4,166,346,1,02-Feb-1998
...,...,...,...,...
99995,880,476,3,22-Nov-1997
99996,716,204,5,17-Nov-1997
99997,276,1090,1,20-Sep-1997
99998,13,225,2,17-Dec-1997


In [260]:
df.isna().sum()

user_id      0
item_id      0
rating       0
timestamp    0
dtype: int64

In [261]:
def date_encoder(date_series):
    date_series = date_series.apply(lambda x: (parser.parse(x).day, parser.parse(x).month, parser.parse(x).year)).apply(pd.Series)
    date_series.columns = ["day", "month", "year"]
    cyclical = CyclicalFeatures(variables=["day", "month"], drop_original=True)
    date_series = cyclical.fit_transform(date_series)
    date_series["year"] = (date_series["year"]-1900)/(2000-1900)
    return date_series
    

In [262]:
date_df = date_encoder(df["timestamp"])
df = pd.concat([df, date_df], axis=1)
df = df.drop(columns=["timestamp"])

## Explore and preprocess u.item

In [263]:
df2 = pd.read_csv(os.path.join(data_path, "u.item"), sep="|", encoding='latin-1', header=None, index_col=0)
df2.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,14,15,16,17,18,19,20,21,22,23
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [264]:
df2.isna().sum()

1        0
2        1
3     1682
4        3
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       0
20       0
21       0
22       0
23       0
dtype: int64

From this we can see that we need to drop the 3rd column, sincce all values in it are NaN
<br>
We can drop the 4th column, since links are not usefull
<br>
And we have NaN value in release_date column, so we can just remove this raw

In [265]:
df2 = df2.drop(columns=[3,4])
df2 = df2.dropna()
df2.head()

Unnamed: 0_level_0,1,2,5,6,7,8,9,10,11,12,...,14,15,16,17,18,19,20,21,22,23
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [266]:
date_df2 = date_encoder(df2[2])
df2 = pd.concat([df2, date_df2], axis=1)
df2 = df2.drop(columns=[2])
df2.head()

Unnamed: 0_level_0,1,5,6,7,8,9,10,11,12,13,...,19,20,21,22,23,year,day_sin,day_cos,month_sin,month_cos
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0.95,0.201299,0.97953,0.5,0.866025
2,GoldenEye (1995),0,1,1,0,0,0,0,0,0,...,0,0,1,0,0,0.95,0.201299,0.97953,0.5,0.866025
3,Four Rooms (1995),0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0.95,0.201299,0.97953,0.5,0.866025
4,Get Shorty (1995),0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0.95,0.201299,0.97953,0.5,0.866025
5,Copycat (1995),0,0,0,0,0,0,1,0,1,...,0,0,1,0,0,0.95,0.201299,0.97953,0.5,0.866025


In [267]:
def preprocess_title(title_series):
    year_series = title_series.apply(lambda x: int(re.findall('\((\d{4})\)(?:(?!\(\d{4}\)).)*$', x.rstrip())[0]))
    year_series = (year_series-1900)/(2000-1900)
    title_series = title_series.apply(lambda x: re.sub('\((\d{4})\)(?:(?!\(\d{4}\)).)*$', '', x.rstrip()).rstrip())
    devided_series = pd.concat([title_series, year_series], axis=1)
    devided_series.columns = ["title", "release_year"]
    return devided_series

In [268]:
def embed_title(title_series):
    nlp = spacy.load("en_core_web_sm")
    tokenizer = nlp.tokenizer
    embedded_title = []
    for text in title_series:
        doc = nlp(tokenizer(text)) # Extract word embeddings
        embedded_title.append(doc.vector)

    # Convert the embedded data to a DataFrame
    titles_embedded = pd.DataFrame(embedded_title)
    return titles_embedded

In [269]:

df2 = pd.concat([df2, preprocess_title(df2[1])], axis=1)
df2 = df2.drop(columns=[1])
df2.head()

Unnamed: 0_level_0,5,6,7,8,9,10,11,12,13,14,...,21,22,23,year,day_sin,day_cos,month_sin,month_cos,title,release_year
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0.95,0.201299,0.97953,0.5,0.866025,Toy Story,0.95
2,0,1,1,0,0,0,0,0,0,0,...,1,0,0,0.95,0.201299,0.97953,0.5,0.866025,GoldenEye,0.95
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0.95,0.201299,0.97953,0.5,0.866025,Four Rooms,0.95
4,0,1,0,0,0,1,0,0,1,0,...,0,0,0,0.95,0.201299,0.97953,0.5,0.866025,Get Shorty,0.95
5,0,0,0,0,0,0,1,0,1,0,...,1,0,0,0.95,0.201299,0.97953,0.5,0.866025,Copycat,0.95


In [270]:
df2

Unnamed: 0_level_0,5,6,7,8,9,10,11,12,13,14,...,21,22,23,year,day_sin,day_cos,month_sin,month_cos,title,release_year
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0.95,0.201299,0.979530,0.500000,8.660254e-01,Toy Story,0.95
2,0,1,1,0,0,0,0,0,0,0,...,1,0,0,0.95,0.201299,0.979530,0.500000,8.660254e-01,GoldenEye,0.95
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0.95,0.201299,0.979530,0.500000,8.660254e-01,Four Rooms,0.95
4,0,1,0,0,0,1,0,0,1,0,...,0,0,0,0.95,0.201299,0.979530,0.500000,8.660254e-01,Get Shorty,0.95
5,0,0,0,0,0,0,1,0,1,0,...,1,0,0,0.95,0.201299,0.979530,0.500000,8.660254e-01,Copycat,0.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0.98,0.937752,0.347305,0.866025,5.000000e-01,Mat' i syn,0.97
1679,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0.98,0.937752,0.347305,0.866025,5.000000e-01,B. Monkey,0.98
1680,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0.98,0.201299,0.979530,0.500000,8.660254e-01,Sliding Doors,0.98
1681,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0.94,0.201299,0.979530,0.500000,8.660254e-01,You So Crazy,0.94


In [271]:
df2 = pd.concat([df2, embed_title(df2["title"])], axis=1)
df2 = df2.drop(columns=["title"])
df2 = df2.dropna()
df2.head()

Unnamed: 0,5,6,7,8,9,10,11,12,13,14,...,86,87,88,89,90,91,92,93,94,95
1,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.549181,-0.129408,-0.533101,-0.451775,-1.237758,-0.19394,2.175473,-0.205618,0.169546,1.099319
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.815718,0.151895,-0.223589,0.331157,-0.593922,-0.616442,2.066849,0.362578,0.192172,0.966224
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.018125,-0.712686,-1.003018,-0.109466,-0.086747,-0.519322,1.551831,-0.361525,0.390277,1.138311
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,-0.580432,0.079627,-0.244669,-0.329613,-1.106851,-0.048058,2.611301,0.377608,0.864265,1.025843
5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.113742,-0.024781,-0.510307,0.61111,0.073536,0.005714,1.246995,0.15833,0.095598,0.478416


In [272]:
df2.isna().sum()

5     0
6     0
7     0
8     0
9     0
     ..
91    0
92    0
93    0
94    0
95    0
Length: 121, dtype: int64

## Explore and preprocess u.user

In [273]:
df3 = pd.read_csv(os.path.join(data_path, "u.user"), sep= "|", encoding='latin-1', header=None, index_col=0, names=["id", "age", "gender", "occupation", "zip_code"])

In [274]:
df3.head()

Unnamed: 0_level_0,age,gender,occupation,zip_code
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [275]:
df3.drop(columns=["zip_code"])

Unnamed: 0_level_0,age,gender,occupation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,24,M,technician
2,53,F,other
3,23,M,writer
4,24,M,technician
5,33,F,other
...,...,...,...
939,26,F,student
940,32,M,administrator
941,20,M,student
942,48,F,librarian


In [276]:
df3.isna().sum()

age           0
gender        0
occupation    0
zip_code      0
dtype: int64

In [277]:
df3 = pd.concat([df3, pd.get_dummies(df3["occupation"]).astype(int)], axis=1) 
df3 = df3.drop(columns=["occupation"])

In [278]:
encoder = OrdinalEncoder()
encoded_gender = encoder.fit_transform(df3["gender"].values.reshape(-1, 1))
encoded_gender = pd.Series(encoded_gender.flatten())
encoded_gender.index+=1
df3["gender"] = encoded_gender

In [279]:
df3["age"] = (df3["age"]-df3["age"].min())/(df3["age"].max()-df3["age"].min())

## Add data to u.data

In [280]:
df = df.drop_duplicates(subset=["user_id", "item_id"])

In [281]:
df = df[df["item_id"].isin(df2.index)]

In [282]:
df3.isna().sum()

age              0
gender           0
zip_code         0
administrator    0
artist           0
doctor           0
educator         0
engineer         0
entertainment    0
executive        0
healthcare       0
homemaker        0
lawyer           0
librarian        0
marketing        0
none             0
other            0
programmer       0
retired          0
salesman         0
scientist        0
student          0
technician       0
writer           0
dtype: int64

In [283]:
df2 = pd.merge(df2, df, left_index=True, right_on='item_id', how='right')

In [286]:
df2 = pd.merge(df2, df3, left_on=["user_id"], right_index=True, how='right')