In [5]:
import pickle
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

label : preference <br>
features : country_id, item_id, gdp growth, pop growth <br>
참고) 2012년은 gdp growth, pop growth NaN값이라 사용 x

## 데이터 불러오기

In [6]:
# pickle 파일 불러오기
with open('/home/sjkim/추천공모전/data/export_GDP_pop_fx.pkl', 'rb') as f:
    ratings_all = pickle.load(f)

In [7]:
ratings_all

Unnamed: 0,year,country,품목코드,수출금액,GDP,preference,GDP_growth,population,pop_growth,USD,JPY,EUR,CNY
0,2012,Afghanistan,3.0,1.0,2.020357e+07,4.949620e-08,,30466479.0,,1126.43,1412.96,1447.58,178.52
1,2012,Afghanistan,4.0,0.0,2.020357e+07,0.000000e+00,,30466479.0,,1126.43,1412.96,1447.58,178.52
2,2012,Afghanistan,7.0,2.0,2.020357e+07,9.899239e-08,,30466479.0,,1126.43,1412.96,1447.58,178.52
3,2012,Afghanistan,8.0,0.0,2.020357e+07,0.000000e+00,,30466479.0,,1126.43,1412.96,1447.58,178.52
4,2012,Afghanistan,10.0,12.0,2.020357e+07,5.939543e-07,,30466479.0,,1126.43,1412.96,1447.58,178.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...
116390,2021,Zimbabwe,87.0,1289.0,2.837124e+07,4.543334e-05,0.318998,15993524.0,0.020668,1145.07,1041.92,1353.40,177.56
116391,2021,Zimbabwe,90.0,322.0,2.837124e+07,1.134952e-05,0.318998,15993524.0,0.020668,1145.07,1041.92,1353.40,177.56
116392,2021,Zimbabwe,94.0,1.0,2.837124e+07,3.524696e-08,0.318998,15993524.0,0.020668,1145.07,1041.92,1353.40,177.56
116393,2021,Zimbabwe,95.0,13.0,2.837124e+07,4.582105e-07,0.318998,15993524.0,0.020668,1145.07,1041.92,1353.40,177.56


In [8]:
# 2012년 제외
ratings = ratings_all[ratings_all['year'] != 2012]
# 필요한 컬럼만 가져오기
ratings = ratings[['year', 'country', '품목코드', 'GDP_growth', 'pop_growth', 'preference']]
# column명 변경
ratings.columns=['year', 'country_id', 'item_id', 'GDP_growth', 'pop_growth', 'preference']
# item_id int로 바꾸기
ratings['item_id'] = ratings['item_id'].astype(int)
# country에 고유번호 부여하기
country_to_idx = {v:k for k,v in enumerate(ratings['country_id'].unique())}
idx_to_country = {v:k for k,v in country_to_idx.items()}

# country_id를 고유번호로 바꾸기
ratings['country_id'] = ratings['country_id'].map(country_to_idx)

# 중복제거
ratings = ratings.drop_duplicates(['year', 'country_id', 'item_id'], keep='first')
# index 초기화
ratings = ratings.reset_index(drop=True)
ratings

Unnamed: 0,year,country_id,item_id,GDP_growth,pop_growth,preference
0,2013,0,10,0.017864,0.035276,4.862752e-08
1,2013,0,16,0.017864,0.035276,6.321578e-07
2,2013,0,19,0.017864,0.035276,1.799218e-06
3,2013,0,20,0.017864,0.035276,4.862752e-08
4,2013,0,21,0.017864,0.035276,6.905108e-06
...,...,...,...,...,...,...
105364,2021,202,87,0.318998,0.020668,4.543334e-05
105365,2021,202,90,0.318998,0.020668,1.134952e-05
105366,2021,202,94,0.318998,0.020668,3.524696e-08
105367,2021,202,95,0.318998,0.020668,4.582105e-07


## FM Model

In [9]:
# year encoding
year_dict = {}
for i in set(ratings['year']):
    year_dict[i] = len(year_dict)
n_year = len(year_dict)                  # 9개

# Country encoding
country_dict = {}
start_point = n_year                     # 9부터 시작
for i in set(ratings['country_id']):
    country_dict[i] = len(country_dict)
n_country = len(country_dict)            # 203개
start_point += n_country                 # 9+203=212

# Item encoding
item_dict = {}
start_point = n_country                  # 212부터 시작
for i in set(ratings['item_id']):
    item_dict[i] = start_point + len(item_dict)
n_item = len(item_dict)                  # 96개
start_point += n_item                    # 212+96=308

# GDP_growth encoding   
gdp_index = start_point
start_point += 1

# pop_growth encoding
pop_index = start_point
start_point += 1
num_x = start_point                     # 전체 변수의 숫자 (년도 + 국가 개수 + 품목 개수 + GDP_growth + pop_growth) 총 310개

In [10]:
x = ratings
x = shuffle(x, random_state=12)
x

Unnamed: 0,year,country_id,item_id,GDP_growth,pop_growth,preference
72679,2019,62,71,-0.017862,-0.000578,7.297039e-07
29964,2015,129,52,0.071671,0.005397,0.000000e+00
62839,2018,94,63,0.022312,-0.001268,9.070635e-06
21326,2014,178,63,0.078603,0.023319,3.731120e-06
47421,2017,28,83,0.097223,-0.007278,2.685836e-06
...,...,...,...,...,...,...
36482,2016,42,62,-0.140528,0.024176,0.000000e+00
40177,2016,102,84,0.030096,-0.009097,4.902150e-04
19709,2014,148,58,0.000000,-0.016199,9.803922e-08
104091,2021,180,91,0.012000,0.001755,1.503953e-06


In [11]:
# Generate X data
data = []
y = []
w0 = np.mean(x['preference'])
for i in range(len(x)):
    case = x.iloc[i]
    x_index = []
    x_value = []
    x_index.append(year_dict[case['year']])        # year id one-hot encoding
    x_value.append(1.)                             # 있으면 1
    x_index.append(country_dict[case['country_id']])  # country id one-hot encoding
    x_value.append(1.)
    x_index.append(item_dict[case['item_id']])     # item id one-hot encoding
    x_value.append(1.)
    x_index.append(gdp_index)                      
    x_value.append(case['GDP_growth'])
    x_index.append(pop_index)
    x_value.append(case['pop_growth'])
    data.append([x_index, x_value])                # 만들어진 한줄의 데이터를([[year_index, country_index, item_index, gdp_index, pop_index], [1, 1, 1, 1, 1]] 형태) data에 저장
    y.append(case['preference'] - w0)              # case에 있는 선호도에서 전체 편향 뺀 값을 y에 저장함
    if (i % 1000) == 0:
        print('Encoding ', i, ' cases...')

Encoding  0  cases...
Encoding  1000  cases...
Encoding  2000  cases...
Encoding  3000  cases...
Encoding  4000  cases...
Encoding  5000  cases...
Encoding  6000  cases...
Encoding  7000  cases...
Encoding  8000  cases...
Encoding  9000  cases...
Encoding  10000  cases...
Encoding  11000  cases...
Encoding  12000  cases...
Encoding  13000  cases...
Encoding  14000  cases...
Encoding  15000  cases...
Encoding  16000  cases...
Encoding  17000  cases...
Encoding  18000  cases...
Encoding  19000  cases...
Encoding  20000  cases...
Encoding  21000  cases...
Encoding  22000  cases...
Encoding  23000  cases...
Encoding  24000  cases...
Encoding  25000  cases...
Encoding  26000  cases...
Encoding  27000  cases...
Encoding  28000  cases...
Encoding  29000  cases...
Encoding  30000  cases...
Encoding  31000  cases...
Encoding  32000  cases...
Encoding  33000  cases...
Encoding  34000  cases...
Encoding  35000  cases...
Encoding  36000  cases...
Encoding  37000  cases...
Encoding  38000  cases...

In [12]:
data[:3]

[[[3, 62, 273, 299, 300],
  [1.0, 1.0, 1.0, -0.017862396148309773, -0.000577804473577687]],
 [[8, 129, 254, 299, 300],
  [1.0, 1.0, 1.0, 0.07167081191546958, 0.005397218989687098]],
 [[2, 94, 265, 299, 300],
  [1.0, 1.0, 1.0, 0.02231237322515213, -0.0012679960936269415]]]

In [13]:
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

In [14]:
class FM():
    def __init__(self, N, K, data, y, alpha, beta, train_ratio=0.75, iterations=100, tolerance=0.0005, l2_reg=True, verbose=True):
        self.K = K                   # Number of latent factors
        self.N = N                   # Number of x (variables)
        self.n_cases = len(data)     # N of observations
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.l2_reg = l2_reg
        self.tolerance = tolerance
        self.verbose = verbose
        # w 초기화
        self.w = np.random.normal(scale=1./self.N, size=(self.N))          # 사이즈는 변수의 수 만큼
        # v 초기화
        self.v = np.random.normal(scale=1./self.K, size=(self.N, self.K))  # 사이즈는 (변수의 수 X K)
        # Train/Test 분리
        cutoff = int(train_ratio * len(data))
        self.train_x = data[:cutoff]
        self.test_x = data[cutoff:]
        self.train_y = y[:cutoff]
        self.test_y = y[cutoff:]

    def test(self):                                         # Training 하면서 RMSE 계산 
        # SGD를 iterations 숫자만큼 수행
        best_RMSE = 10000
        best_iteration = 0
        training_process = []
        for i in range(self.iterations):                     # iterations만큼 돌아감 (RMSE값이 올라가면 ealry stopping)
            rmse1 = self.sgd(self.train_x, self.train_y)     # SGD & Train set의 RMSE 계산
            rmse2 = self.test_rmse(self.test_x, self.test_y) # Test set의 RMSE 계산     
            training_process.append((i, rmse1, rmse2))
            if self.verbose:
                if (i+1) % 10 == 0:                     # 10번 돌때마다 표시
                    print("Iteration: %d ; Train RMSE = %.6f ; Test RMSE = %.6f" % (i+1, rmse1, rmse2))
            if best_RMSE > rmse2:                       # New best record
                best_RMSE = rmse2
                best_iteration = i
            elif (rmse2 - best_RMSE) > self.tolerance:  # RMSE is increasing over tolerance
                break
        print(best_iteration, best_RMSE)
        return training_process
        
    # w, v 업데이트를 위한 Stochastic gradient descent 
    def sgd(self, x_data, y_data):
        y_pred = []
        for data, y in zip(x_data, y_data):     # train set 크기 만큼
            x_idx = data[0]                     # data가 [[6, 88, 220, 299, 300], [1.0, 1.0, 1.0, 0.077, 0.047]] 이런 형식이니까
                                                # [6, 88, 220, 299, 300] 첫번째 원소(year, country_id, item_id, gdp, pop 인덱스)만 가져오기
            x_0 = np.array(data[1])             # 해당 x의 값[1.0, 1.0, 1.0, 0.077, 0.047]을 가져온다. xi axis=0 [1, 2, 3]  -> 1차원
            x_1 = x_0.reshape(-1, 1)            # x의 값을 2차원으로 변경(2차원인 v행렬과 연산하기 위해서). xi axis=1 [[1], [2], [3]] -> 2차원
    
            # biases
            bias_score = np.sum(self.w[x_idx] * x_0)
    
            # score 계산
            vx = self.v[x_idx] * (x_1)          # v matrix * x, 모든 v값에 x가 곱해짐. (5, 350) x
            sum_vx = np.sum(vx, axis=0)         # sigma(vx), element가 k개. k=350이니까 (350, )
            sum_vx_2 = np.sum(vx * vx, axis=0)  # ( v matrix * x )의 제곱. 얘도 k개 (350, )
            latent_score = 0.5 * np.sum(np.square(sum_vx) - sum_vx_2)

            # 예측값 계산
            y_hat = bias_score + latent_score
            y_pred.append(y_hat)
            error = y - y_hat
            # w, v 업데이트
            if self.l2_reg:     # regularization이 있는 경우
                self.w[x_idx] += error * self.alpha * (x_0 - self.beta * self.w[x_idx])
                self.v[x_idx] += error * self.alpha * ((x_1) * sum(vx) - (vx * x_1) - self.beta * self.v[x_idx])
            else:               # regularization이 없는 경우
                self.w[x_idx] += error * self.alpha * x_0
                self.v[x_idx] += error * self.alpha * ((x_1) * sum(vx) - (vx * x_1))
        return RMSE(y_data, y_pred)     # 실제 값과 예측값 RMSE계산

    def test_rmse(self, x_data, y_data):            # test_set에 대해서 현재의 w,v를 사용해서 예측값을 계산하고 이에 따른 RMSE값 계산
        y_pred = []
        for data , y in zip(x_data, y_data):
            y_hat = self.predict(data[0], data[1])  # predict()함수를 호출해서 하나의 행에 대한 예측값 구하기
            y_pred.append(y_hat)
        return RMSE(y_data, y_pred)

    def predict(self, idx, x):
        x_0 = np.array(x)
        x_1 = x_0.reshape(-1, 1)

        # biases
        bias_score = np.sum(self.w[idx] * x_0)

        # score 계산
        vx = self.v[idx] * (x_1)
        sum_vx = np.sum(vx, axis=0)
        sum_vx_2 = np.sum(vx * vx, axis=0)
        latent_score = 0.5 * np.sum(np.square(sum_vx) - sum_vx_2)

        # 예측값 계산
        y_hat = bias_score + latent_score
        return y_hat

    def predict_one(self, year, country_id, item_id):
        x_idx = np.array([year_dict[year], country_dict[country_id], item_dict[item_id]])
        x_data = np.array([1, 1, 1])
        return self.predict(x_idx, x_data) + w0

In [15]:
K = 250
fm1 = FM(num_x, K, data, y, alpha=0.0002, beta=0.007, train_ratio=0.75, iterations=310, tolerance=0.0005, l2_reg=True, verbose=True)
result = fm1.test()   # print(best_iteration, best_RMSE)

Iteration: 10 ; Train RMSE = 0.003844 ; Test RMSE = 0.003079
Iteration: 20 ; Train RMSE = 0.003555 ; Test RMSE = 0.002749
Iteration: 30 ; Train RMSE = 0.003486 ; Test RMSE = 0.002670
Iteration: 40 ; Train RMSE = 0.003460 ; Test RMSE = 0.002642
Iteration: 50 ; Train RMSE = 0.003448 ; Test RMSE = 0.002630
Iteration: 60 ; Train RMSE = 0.003441 ; Test RMSE = 0.002624
Iteration: 70 ; Train RMSE = 0.003437 ; Test RMSE = 0.002621
Iteration: 80 ; Train RMSE = 0.003434 ; Test RMSE = 0.002620
Iteration: 90 ; Train RMSE = 0.003432 ; Test RMSE = 0.002619
Iteration: 100 ; Train RMSE = 0.003431 ; Test RMSE = 0.002618
Iteration: 110 ; Train RMSE = 0.003430 ; Test RMSE = 0.002618
Iteration: 120 ; Train RMSE = 0.003429 ; Test RMSE = 0.002618
Iteration: 130 ; Train RMSE = 0.003428 ; Test RMSE = 0.002618
Iteration: 140 ; Train RMSE = 0.003427 ; Test RMSE = 0.002617
Iteration: 150 ; Train RMSE = 0.003427 ; Test RMSE = 0.002617
Iteration: 160 ; Train RMSE = 0.003426 ; Test RMSE = 0.002617
Iteration: 170 ; 

## FM으로 추천해보기

In [16]:
# 2012년 제외
ratings = ratings_all[ratings_all['year'] != 2012]
# 필요한 컬럼만 가져오기
ratings = ratings[['year', 'country', '품목코드', 'GDP_growth', 'pop_growth', 'preference']]
# column명 변경
ratings.columns=['year', 'country_id', 'item_id', 'GDP_growth', 'pop_growth', 'preference']
# item_id int로 바꾸기
ratings['item_id'] = ratings['item_id'].astype(int)
# country에 고유번호 부여하기
country_to_idx = {v:k for k,v in enumerate(ratings['country_id'].unique())}
idx_to_country = {v:k for k,v in country_to_idx.items()}

# country_id를 고유번호로 바꾸기
ratings['country_id'] = ratings['country_id'].map(country_to_idx)

# 중복제거
ratings = ratings.drop_duplicates(['country_id', 'item_id'], keep='first')
# index 초기화
ratings = ratings.reset_index(drop=True)

rating_matrix = ratings.pivot(values='preference', index='country_id', columns='item_id')

# 품목 이름 가져오기
items_df = pd.read_csv('/home/sjkim/추천공모전/data/item_name-code.csv')
items_df.columns=['title', 'item_id']
items_df = items_df.set_index('item_id')

In [17]:
# 추천하기
def recommender(year, country, n_items=10):
    # 현재 사용자의 모든 아이템에 대한 예상 평점 계산
    predictions = []
    rated_index = rating_matrix.loc[country][rating_matrix.loc[country] > 0].index    # 선호도가 존재하는 품목 확인
    items = rating_matrix.loc[country].drop(rated_index)
    for item in items.index:
        predictions.append(fm1.predict_one(year, country, item))                      # 예상 선호도 계산
    recommendations = pd.Series(data=predictions, index=items.index, dtype=float)
    recommendations = recommendations.sort_values(ascending=False)[:n_items]          # 예상 선호도가 가장 높은 품목 선택
    recommended_items = items_df.loc[recommendations.index]['title']
    return recommended_items

In [18]:
recommender(2013, 1, 5)  # 2013년도 1번째 나라, 5개 추천

item_id
89                                    선박과 수상 구조물
27    광물성 연료ㆍ광물유(鑛物油)와 이들의 증류물, 역청(瀝靑)물질, 광물성 왁스
72                                            철강
75                                      니켈과 그 제품
41                              원피(모피는 제외한다)와 가죽
Name: title, dtype: object

## Precision@K 계산하기

In [19]:
# precision@k 함수로 만들기
def precision_at_k(year, country, n_items, k):
    hit_df = ratings_all[ratings_all['year'].isin(k)]               # 해당년도(ex.2013) 이후로 k개년 데이터만 가져오기
    hit_df = hit_df[hit_df['country']==idx_to_country[country]]     # 해당 국가 데이터만 가져오기
    c_items = hit_df['품목코드'].unique().astype(int)                 # k년도 동안 해당 국가에 수출된 품목코드 가져오기
    rec_items = recommender(year, country, n_items).index.tolist()  # 해당년도(ex.2013) 기준으로 해당 국가에 추천할 유망 품목 리스트 저장
    hit_items = [i for i in c_items if i in rec_items]              # k년도 동안 해당년도(ex.2013)에 추천한 유망 품목이 수출된적이 있는지 확인
    precision = len(hit_items) / n_items                            # precision@k 계산
    return precision

In [20]:
# 모든 나라에 대해서 precision@k 계산
def precision_at_k_all(n_items, year, k):
    precision = []
    c_len = len(rating_matrix)
    for country in range(0, c_len):
        precision.append(precision_at_k(year, country, n_items, k))
    return sum(precision) / c_len


k = [2014, 2015, 2016, 2017, 2018]
precision_at_k_all(5, 2013, k)

0.5615763546798034