# https://www.kaggle.com/code/leopoldvonranke/ffm-with-pytorch#Create-Dataset

In [1]:
import pandas as pd

In [2]:
import numpy as np
import random
import os
import gc

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(113) # Seed 고정

In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv( 'test.csv')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

In [5]:
def preprocessing_data(df):
    # Decompose Location by split
    df['City'] = [(i.split(',')[0]).lstrip().title() for i in df['Location']]
    df['State'] = [(i.split(',')[1]).lstrip().title() for i in df['Location']]
    df['Country'] = [(i.split(',')[2]).lstrip().title() for i in df['Location']]
    
    # preprocessing
    # NaN, N/A, etc.. Change 'unknown'
    # Only using Train Data	#
    #df['City'] = np.where((df['City'] == '')|(df['City'].astype(str) == 'nan')|(df['City'].astype(str) == 'N/A'), 'UnKnown', df['City'])
    #df['State'] = np.where((df['State'] == '')|(df['State'].astype(str) == 'nan')|(df['State'].astype(str) == 'N/A'), 'UnKnown', df['State'])
    #df['Country'] = np.where((df['Country'] == '')|(df['Country'].astype(str) == 'nan')|(df['Country'].astype(str) == 'N/A')
    #                                              |(df['Country'].astype(str) == 'N/A - On The Road')|(df['Country'].astype(str) == 'X')
    #                                              |(df['Country'].astype(str) == 'Far Away...')|(df['Country'].astype(str) == 'C')
    #                                              |(df['Country'].astype(str) == 'We`Re Global!')|(df['Country'].astype(str) == 'Travelling....')
    #                                              #영원한 중국!
    #                                              |(df['Country'].astype(str) == 'La Chine Éternelle !')
    #                         ,'UnKnown', df['Country'])
	#
    ## train에 있는 정보로 County 바꾸기
    #df.loc[df['State'] == 'Michigan Usa', 'Country'] = 'Usa'
    #df.loc[df['State'] == 'Wicklow', 'Country'] = 'Ireland'
    #df.loc[df['State'] == 'Ilfov', 'Country'] = ''
    #df.loc[df['State'] == 'Liege', 'Country'] = 'Belgium'
    #df.loc[df['State'] == 'Estremadura', 'Country'] = 'Portugal'
    #df.loc[df['State'] == 'Aberdeenshire', 'Country'] = 'United Kingdom'
    #df.loc[df['State'] == 'Wi', 'Country'] = 'Wisconsin'
    #
    ## 미국 Country name 통일
    #df.loc[df['Country'] == 'America', 'Country'] = 'Usa'
    #df.loc[df['Country'] == 'United State', 'Country'] = 'Usa'
    #df.loc[df['Country'] == 'United States', 'Country'] = 'Usa'
    #df.loc[df['Country'] == 'U.S.A.', 'Country'] = 'Usa'
    #df.loc[df['Country'] == 'New York', 'Country'] = 'Usa'
    #df.loc[df['Country'] == 'U.S. Of A.', 'Country'] = 'Usa'
    #df.loc[df['Country'] == 'United Staes', 'Country'] = 'Usa'
    #df.loc[df['Country'] == 'U.S>', 'Country'] = 'Usa'
    #df.loc[df['Country'] == 'Usa Now', 'Country'] = 'Usa'
    #
    ## 영국 Country name 통일
    #df.loc[df['Country'] == 'England', 'Country'] = 'United Kingdom'
    #df.loc[df['Country'] == 'Scotland', 'Country'] = 'United Kingdom'
    #df.loc[df['Country'] == 'Wales', 'Country'] = 'United Kingdom'
    #df.loc[df['Country'] == 'Ireland', 'Country'] = 'United Kingdom'
    #df.loc[df['Country'] == 'U.K.', 'Country'] = 'United Kingdom'
    #df.loc[df['Country'] == 'Usa (Currently Living In England)', 'Country'] = 'United Kingdom'
    #df.loc[df['Country'] == 'Uk', 'Country'] = 'United Kingdom'
    #
    ## 스페인 Country name 통일
    #df.loc[df['Country'] == 'España ', 'Country'] = 'Spain'
    
    return df
    

In [6]:
#from lingua import Language, LanguageDetectorBuilder
from tqdm import tqdm

def feature_engineering(df):
    # Book-Title로 도서 언어분류 (과적합 남)
    #df['prep_title'] = df['Book-Title'].apply(preprocessing_sentence)
    #detector = LanguageDetectorBuilder.from_all_languages().with_preloaded_language_models().with_low_accuracy_mode().build()
    #
    #all = []
    #for row in tqdm(df['prep_title'].unique()):
    #    try:
    #        language = detector.detect_language_of(row)
    #    except:
    #        language = "error"
    #        print("This row throws and error:", row)
    #    all.append(language)
    #prep_title = pd.DataFrame(df['prep_title'].unique(), columns = ['prep_title'])
    #all = [i.name if str(i) != 'None' else 'None' for i in all]
    #Language = pd.DataFrame(all, columns = ['Language'])
    #prep_title = pd.concat([prep_title, Language], axis = 1)
    #df = pd.merge(df, prep_title, on = 'prep_title', how = 'inner')
    #
    #df = df.drop(columns = ['prep_title'])
    
    # Age 그룹화    
    #labels = ['0-3','3-6','6-8','8-12','12-18','18-54','55-64','65+']
    #bins = [0, 3, 6, 8, 12, 18, 54, 64, 250]
    labels = ['0-3','3-6','6-8','8-12','12-18','18-25','25-34','35-44','45-54','55-64','65-74','75+']
    bins = [0, 3, 6, 8, 12, 18, 25, 34, 44, 54, 64, 74, 250]
    
    # Age 이상치 처리
    df['Age'] = df['Age'].apply(lambda x: 3 if x<3 else x)
    df['Age'] = df['Age'].apply(lambda x: 100 if x>100 else x)
    
    df['Age_gb'] = pd.cut(df.Age, bins, labels = labels,include_lowest = True)
    
    # 출판년도 그룹화
    #labels = ['Unknown', '-1900', '1900-1970', '1970-2000', '2000-2010', '2010-2020', '2020-']
    #bins = [-1, 0, 1900, 1970, 2000, 2010, 2020, 3000]
    labels = ['Unknown', '-1900', '1900-1950', '1950-1960', '1960-1970', '1970-1980', '1980-1990', '1990-2000', '2000-2010', '2010-2020', '2020-']
    bins = [-1, 0, 1900, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020, 3000]
    df['Pub_gb'] = pd.cut(df['Year-Of-Publication'], bins, labels = labels,include_lowest = True)
    
    return df

In [7]:
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder

FEATURE = ['User-ID','Book-Title','Book-Author','Publisher', 'City','State','Country','Age_gb', 'Pub_gb']#', 'Language'] ', 

train_lb = train.__deepcopy__() 
test_lb = test.__deepcopy__()

train_lb = feature_engineering(train_lb)
test_lb = feature_engineering(test_lb)

train_lb = preprocessing_data(train_lb)
test_lb = preprocessing_data(test_lb)

train_lb = train_lb.drop(columns = ['Book-ID', 'Location'])
test_lb = test_lb.drop(columns = ['Book-ID', 'Location'])

train_lb[FEATURE] = train_lb[FEATURE].astype(str) 
test_lb[FEATURE] = test_lb[FEATURE].astype(str)


for i in FEATURE:
    le = LabelEncoder()
    le=le.fit(train_lb[i])
    for label in np.unique(test_lb[i].dropna()):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    train_lb[i] = le.transform(train_lb[i])
    test_lb[i] = le.transform(test_lb[i])
    
#for i in FEATURE:
#    # train에는 없고, test에는 있는 원소는 -2 처리
#    oe = OrdinalEncoder(handle_unknown='use_encoded_value',
#                         unknown_value=-2)
#    oe=oe.fit(train_lb[i].to_numpy().reshape(-1, 1))
#    train_lb[i] = oe.transform(train_lb[i].to_numpy().reshape(-1, 1))
#    test_lb[i] = oe.transform(test_lb[i].to_numpy().reshape(-1, 1))

In [8]:
train_lb.to_csv('train_lb.csv')
test_lb.to_csv('test_lb.csv')

In [9]:
train_lb = pd.read_csv('train_lb.csv')
test_lb = pd.read_csv('test_lb.csv')

In [10]:
X_train = train_lb.drop(columns = ['Unnamed: 0','ID', 'Book-Rating', 'Age', 'Year-Of-Publication'])
y_train = train_lb['Book-Rating']
x_test = test_lb.drop(columns = ['Unnamed: 0','ID', 'Age', 'Year-Of-Publication'])

In [11]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torch.nn import MSELoss
from sklearn.model_selection import train_test_split
from tqdm import tqdm

class FeaturesLinear(torch.nn.Module) :
    def __init__(self, field_dims) :
        '''
        Parameter
            field_dims : List of field dimensions
        '''
        super().__init__()
        self.input_dim = sum(field_dims)
        self.linear = nn.Linear(self.input_dim, 1, bias=True)
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int32)

    def forward(self, x):
        '''
        Parameter
            x : Long tensor of size (batch_size, num_fields)
        
        Return
            linear_term : Float tensor of size (batch_size)
        '''
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        sparse_x = torch.zeros(x.size(0), self.input_dim, device=x.device).scatter_(1, x, 1.)
        linear_term = self.linear(sparse_x)
        return linear_term


In [12]:
class FieldAwareEmbedding(torch.nn.Module) :

    def __init__(self, field_dims, embed_dim) :
        '''
        Parameter
            field_dims : List of field dimensions
            embed_dim : Factorization dimension for dense embedding
        '''
        super().__init__()
        self.num_fields = len(field_dims)
        self.embeddings = nn.ModuleList([
            nn.Embedding(feature_size, embed_dim) for feature_size in field_dims
        ])
        for embedding in self.embeddings:
            torch.nn.init.xavier_uniform_(embedding.weight.data)
        
    def forward(self, x) :
        '''
        Parameter
            x : Long tensor of size (batch_size, num_fields)
        
        Return
            dense_x : Long tensor of size (batch_size, num_fields, embed_dim)
        '''
        dense_x = [self.embeddings[i](x[..., i]) for i in range(self.num_fields)]
        dense_x = torch.stack(dense_x, dim=1)
        return dense_x


In [13]:
class FieldAwareFactorizationMachine(torch.nn.Module) :
    def __init__(self, field_dims, embed_dim) :
        '''
        Parameter
            field_dims : List of field dimensions
            embed_dim : Factorization dimension for dense embedding
        '''
        super().__init__()
        self.num_fields = len(field_dims)
        self.embeddings = FieldAwareEmbedding(field_dims, embed_dim)
        self.linear = FeaturesLinear(field_dims)
        
    def square(self, x):
        return torch.pow(x,2)

    def forward(self, x) :
        '''
        Parameter
            x : Long tensor of size (batch_size, num_fields)
        
        Return
            y_ffm : Float tensor of size (batch_size)
        '''
        linear_term = self.linear(x)
        
        dense_x = self.embeddings(x)

        square_of_sum = self.square(torch.sum(dense_x, dim=1))
        sum_of_square = torch.sum(self.square(dense_x), dim=1)
        pairwise_term = 0.5 * torch.sum(square_of_sum - sum_of_square, dim=1)
        
        y_ffm = linear_term.squeeze(1) + pairwise_term

        return y_ffm


In [14]:
X_tr, X_valid, y_tr, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42,
                                                      shuffle=True, stratify=y_train)


In [15]:
train_dataset = TensorDataset(torch.LongTensor(X_tr.to_numpy()),
                              torch.LongTensor(y_tr.to_numpy()))
valid_dataset = TensorDataset(torch.LongTensor(X_valid.to_numpy()),
                              torch.LongTensor(y_valid.to_numpy()))

test_dataset = TensorDataset(torch.LongTensor(x_test.to_numpy()),
                              torch.LongTensor(np.zeros(x_test.shape[0])))

In [16]:
class RMSELoss(nn.Module):
    def __init__(self):
        super(RMSELoss, self).__init__()
        self.eps = 1e-6
    def forward(self, x, y):
        criterion = MSELoss()
        loss = torch.sqrt(criterion(x, y)+self.eps)
        return loss


In [17]:
def model_train(model, dataloader, loss_fn, optimizer, device) :
    model.train()
    total_loss = 0
    for features, targets in tqdm(dataloader) :
        features = features.to(device)
        targets = targets.to(torch.float32).to(device)
        
        outputs = model(features)
        loss = loss_fn(targets, outputs)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss
        
    return (total_loss/len(dataloader)).item()


In [18]:
def model_eval(model, dataloader, loss_fn, device) :
    with torch.no_grad() :
        total_loss = 0
        model.eval()
        for features, targets in tqdm(dataloader) :
            features = features.to(device)
            targets = targets.to(torch.float32).to(device)
            
            outputs = model(features)
            loss = loss_fn(targets, outputs)
            total_loss += loss
            
    return (total_loss/len(dataloader)).item()


In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [20]:
features = X_train.columns
idx = {feature:None for feature in features}
for feature in features :
    feature2idx = {v:k for k,v in enumerate(X_train[feature].unique())}
    idx[feature] = len(feature2idx)
    X_train[feature] = X_train[feature].map(feature2idx)


In [21]:
field_dims = np.array(list(idx.values()), dtype=np.uint32)
print(field_dims)

[ 83256 217829  92635  15505     12     11  13820   1810    348]


In [22]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

embed_dim = 4
batch_size = 128
lr = 0.0001
epochs = 30

In [None]:
ffm_model = FieldAwareFactorizationMachine(field_dims, embed_dim=embed_dim)
ffm_model.to(device)


train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

loss_fn = RMSELoss()
optimizer = torch.optim.Adam(ffm_model.parameters(), lr=lr)

best_loss = int(1e8)
check_cnt = 0

for epoch in range(epochs) :
    train_loss = model_train(ffm_model, train_loader, loss_fn, optimizer, device)
    valid_loss = model_eval(ffm_model, valid_loader, loss_fn, device)
    
    print(f"Epoch {epoch+1}")
    print(f"Train Loss : {train_loss:.2f}, Validation Loss : {valid_loss:.2f}")
    
    if best_loss > valid_loss :
        best_loss = valid_loss
        torch.save(ffm_model.state_dict(), 'ffm_model.pt')
        check_cnt = 0
    else :
        check_cnt += 1
        if check_cnt == 5 :
            print("Early Stopped")
            break

100%|██████████████████████████████████████████████████████████████████████████████| 5447/5447 [00:44<00:00, 122.95it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1362/1362 [00:04<00:00, 306.72it/s]


Epoch 1
Train Loss : 4.01, Validation Loss : 3.67


100%|██████████████████████████████████████████████████████████████████████████████| 5447/5447 [00:44<00:00, 121.18it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1362/1362 [00:04<00:00, 308.21it/s]


Epoch 2
Train Loss : 3.58, Validation Loss : 3.54


100%|██████████████████████████████████████████████████████████████████████████████| 5447/5447 [00:52<00:00, 104.65it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1362/1362 [00:04<00:00, 280.49it/s]


Epoch 3
Train Loss : 3.46, Validation Loss : 3.47


 40%|███████████████████████████████▏                                              | 2174/5447 [00:18<00:27, 119.22it/s]

In [None]:
best_model = FieldAwareFactorizationMachine(field_dims, embed_dim).to(device)
best_model.load_state_dict(torch.load('ffm_model.pt'))