In [31]:
import os
import zipfile
import pandas as pd
from datetime import datetime
from dateutil import parser
import numpy as np
from feature_engine.creation import CyclicalFeatures
import re
import spacy 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

In [32]:
# Unpack the MovieLens archive next to itself, then point `data_path` at the
# extracted folder that the following cells read from.
raw_dir = '../data/raw'
archive_path = os.path.join(raw_dir, 'ml-100k.zip')
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall(raw_dir)
data_path = '../data/raw/ml-100k'

In [33]:
# Ratings: tab-separated, no header row; column names are attached after reading.
data = pd.read_csv(
    os.path.join(data_path, "u.data"),
    sep="\t",
    header=None,
).set_axis(["user_id", "item_id", "rating", "timestamp"], axis=1)

# Item and user tables share the same layout options: pipe-separated, latin-1,
# no header row, first field used as the row index.
pipe_table_options = dict(sep="|", encoding='latin-1', header=None, index_col=0)
item = pd.read_csv(os.path.join(data_path, "u.item"), **pipe_table_options)
user = pd.read_csv(
    os.path.join(data_path, "u.user"),
    names=["id", "age", "gender", "occupation", "zip_code"],
    **pipe_table_options,
)

In [34]:
def date_encoder(date_series):
    """Encode a series of date strings as cyclical day/month features plus a scaled year.

    Parameters
    ----------
    date_series : pandas.Series of str
        Date strings that ``dateutil.parser.parse`` can read.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``date_series``. ``day`` and ``month`` are replaced by the
        features produced by ``CyclicalFeatures`` (the originals are dropped);
        ``year`` is rescaled so that 1900 maps to 0 and 2000 maps to 1.
    """
    # Parse every string exactly once and build the frame in one go instead of
    # parsing three times per row and expanding tuples with ``.apply(pd.Series)``.
    parts = [(d.day, d.month, d.year) for d in map(parser.parse, date_series)]
    date_parts = pd.DataFrame(parts, index=date_series.index, columns=["day", "month", "year"])

    # Fix the periods explicitly. Without ``max_values`` the transformer uses the
    # largest value present in the data it is fitted on, so two calls (e.g. on a
    # train and a test split) could encode the same date differently.
    cyclical = CyclicalFeatures(
        variables=["day", "month"],
        max_values={"day": 31, "month": 12},
        drop_original=True,
    )
    date_parts = cyclical.fit_transform(date_parts)

    date_parts["year"] = (date_parts["year"] - 1900) / (2000 - 1900)
    return date_parts

In [35]:
def preprocess_title(title_series):
    """Split titles such as ``"Toy Story (1995)"`` into a clean title and a scaled year.

    Parameters
    ----------
    title_series : pandas.Series of str
        Titles that normally end with a four-digit year in parentheses.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``title_series`` with two columns:
        ``title`` - the text with the trailing ``(YYYY)`` part (and anything after
        it) removed and surrounding trailing whitespace stripped;
        ``release_year`` - ``(year - 1900) / 100``, or NaN when the title contains
        no ``(YYYY)`` group.
    """
    # Raw string: the pattern contains ``\(`` and ``\d``, which are invalid escape
    # sequences in a normal string literal. It matches the LAST "(YYYY)" group and
    # everything that follows it up to the end of the string.
    year_pattern = re.compile(r'\((\d{4})\)(?:(?!\(\d{4}\)).)*$')

    def extract_year(text):
        found = year_pattern.search(text)
        # Titles without a year become NaN instead of raising IndexError.
        return int(found.group(1)) if found else np.nan

    stripped = title_series.apply(lambda text: text.rstrip())
    year_series = (stripped.apply(extract_year) - 1900) / (2000 - 1900)
    clean_titles = stripped.apply(lambda text: year_pattern.sub('', text).rstrip())

    devided_series = pd.concat([clean_titles, year_series], axis=1)
    devided_series.columns = ["title", "release_year"]
    return devided_series

In [36]:
def embed_title(title_series, nlp=None):
    """Turn each title into a fixed-length vector using a spaCy pipeline.

    Parameters
    ----------
    title_series : pandas.Series (or any iterable) of str
        Titles to embed.
    nlp : callable, optional
        A loaded spaCy pipeline (anything that maps a string to an object with a
        ``.vector`` attribute). When omitted, ``en_core_web_sm`` is loaded. Pass
        an already-loaded pipeline to avoid reloading the model on every call.

    Returns
    -------
    pandas.DataFrame
        One row per title and one integer-labelled column per vector component.
        When ``title_series`` is a Series, the rows carry its index so the result
        lines up with the source rows in an index-aligned ``pd.concat``/join.
    """
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")

    # Calling the pipeline on a string tokenizes it first, so no separate
    # tokenizer call is needed.
    embedded_title = [nlp(text).vector for text in title_series]

    # Keep the caller's index. A default 0..n-1 index would make an index-aligned
    # concat attach each vector to a different row than the one it came from.
    row_index = title_series.index if isinstance(title_series, pd.Series) else None
    return pd.DataFrame(embedded_title, index=row_index)

In [37]:
def preprocess_data(df, df2, df3):
    """Build the feature table and the rating target from the three MovieLens tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings with columns ``user_id``, ``item_id``, ``rating`` and ``timestamp``
        (Unix seconds). Not modified.
    df2 : pandas.DataFrame
        Item table indexed by item id with integer column labels: ``1`` title,
        ``2`` release date, ``3`` and ``4`` are dropped, the remaining columns
        are kept as they are.
    df3 : pandas.DataFrame
        User table indexed by user id with columns ``age``, ``gender``,
        ``occupation`` and ``zip_code``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The merged feature frame (one row per kept rating, without ``rating``)
        and the matching ``rating`` series.
    """
    # Work on a copy: the timestamp column is rewritten below and the caller's
    # frame must keep its raw values so the function can be called again on it.
    df = df.copy()

    # Encode timestamp in u.data
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s").dt.strftime('%d-%b-%Y')
    date_df = date_encoder(df["timestamp"])
    df = pd.concat([df, date_df], axis=1)
    df = df.drop(columns=["timestamp"])

    # Remove NaN values in u.item
    df2 = df2.drop(columns=[3, 4])
    df2 = df2.dropna()

    # Encode release date (column 2) in u.item
    date_df2 = date_encoder(df2[2])
    df2 = pd.concat([df2, date_df2], axis=1)
    df2 = df2.drop(columns=[2])

    # Get release year from title in u.item
    df2 = pd.concat([df2, preprocess_title(df2[1])], axis=1)
    df2 = df2.drop(columns=[1])

    # Embed titles. The vectors are produced in row order, so give them the item
    # index explicitly; otherwise the index-aligned concat would pair each item
    # with another item's vector.
    # NOTE(review): the embedding columns and the remaining u.item columns are
    # both integer-labelled, so column labels may repeat in the result - confirm
    # whether downstream consumers depend on the current labels before renaming.
    title_vectors = embed_title(df2["title"])
    title_vectors.index = df2.index
    df2 = pd.concat([df2, title_vectors], axis=1)
    df2 = df2.drop(columns=["title"])
    df2 = df2.dropna()

    # Encode occupation
    df3 = pd.concat([df3, pd.get_dummies(df3["occupation"]).astype(int)], axis=1)
    df3 = df3.drop(columns=["occupation"])

    # Encode gender. Assign the encoded array positionally instead of building a
    # Series with a hand-shifted index, which only lines up when the user ids
    # are exactly 1..n.
    encoder = OrdinalEncoder()
    encoded_gender = encoder.fit_transform(df3["gender"].values.reshape(-1, 1))
    df3["gender"] = encoded_gender.flatten()

    # Normalize age to [0, 1]
    df3["age"] = (df3["age"] - df3["age"].min()) / (df3["age"].max() - df3["age"].min())

    # Remove unused data
    df3 = df3.drop(columns=["zip_code"])

    # Keep only ratings whose item and user still exist after the cleaning above
    df = df.drop_duplicates(subset=["user_id", "item_id"])
    df = df[df["item_id"].isin(df2.index)]
    df = df[df["user_id"].isin(df3.index)]

    # Merge everything into one frame: item features, then rating row, then user features
    merged = pd.merge(df2, df, left_index=True, right_on='item_id', how='right')
    merged = pd.merge(merged, df3, left_on=["user_id"], right_index=True, how='left')

    return merged.drop(columns=["rating"]), merged["rating"]

In [38]:
# Hold out a fixed fraction of the ratings with a fixed seed, then run the same
# preprocessing on each part.
TEST_FRACTION = 0.2
SPLIT_SEED = 13

train, test = train_test_split(data, random_state=SPLIT_SEED, test_size=TEST_FRACTION)

train_x, train_y = preprocess_data(train, item, user)
test_x, test_y = preprocess_data(test, item, user)

In [46]:
data_processed_path = os.path.join('../data/interim')
data_evaluation_path = os.path.join('../benchmark/data/')

# Create the output folders up front; to_csv does not create missing directories.
os.makedirs(data_processed_path, exist_ok=True)
os.makedirs(data_evaluation_path, exist_ok=True)

split_frames = {
    'train_x.csv': train_x,
    'train_y.csv': train_y,
    'test_x.csv': test_x,
    'test_y.csv': test_y,
}

# Every split goes to the interim folder ...
for file_name, frame in split_frames.items():
    frame.to_csv(os.path.join(data_processed_path, file_name), sep='|', index=False)

# ... and the test split is additionally copied to the benchmark folder.
for file_name in ('test_x.csv', 'test_y.csv'):
    split_frames[file_name].to_csv(os.path.join(data_evaluation_path, file_name), sep='|', index=False)

In [40]:
# Preview the first rows of the processed training features.
train_x.head()

Unnamed: 0,5,6,7,8,9,10,11,12,13,14,...,marketing,none,other,programmer,retired,salesman,scientist,student,technician,writer
61975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
75905,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,1,0,0
90174,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
60156,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
62373,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# Preprocess the complete ratings table (no train/test split) and store it.
# A copy is passed so that `data` is guaranteed to keep its raw timestamps and
# this cell can be re-run without reloading the ratings.
preprocessed_data, rating = preprocess_data(data.copy(), item, user)
preprocessed_data.to_csv(os.path.join(data_processed_path, 'preprocessed_data.csv'), sep='|', index=False)

In [45]:
# Total number of missing values across all columns of the training features.
train_x.isna().sum().sum()

0