In [4]:
import os
import math
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.impute import KNNImputer
import xgboost as xgb


import fasttext
import fasttext.util
# Fetch the pretrained Chinese ('zh') fastText model; if_exists='ignore' skips
# the download when the model file is already present.
fasttext.util.download_model('zh', if_exists='ignore')

'cc.zh.300.bin'

# TODO:
- normalize embedding?
- concatenate all embeddings -> clustering (use the elbow/silhouette method to choose the best K)
- Kmeans ref:
    - [sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html)
    - [silhouette_score](https://medium.com/analytics-vidhya/how-to-determine-the-optimal-k-for-k-means-708505d204eb)

In [6]:
# Directory holding the raw CSV files, relative to this notebook.
DATA_ROOT = '../hahow/data/'
# Load the user profile table; the later cells read its gender,
# occupation_titles, interests, recreation_names and user_id columns.
user_csv_path = os.path.join(DATA_ROOT, 'users.csv')
user_df = pd.read_csv(user_csv_path)
# Show (n_users, n_columns) as the cell output.
user_df.shape

(130566, 5)

### gender

In [7]:
# One-hot encode gender: one indicator column per distinct value; rows with a
# missing gender end up with all indicators off.
gender_dummies = pd.get_dummies(user_df['gender'])
gender_embed = gender_dummies.to_numpy()

### occupation
- TODO
    - number of occupations

In [8]:
# Pretrained Chinese fastText model used for every text embedding below.
ft = fasttext.load_model('cc.zh.300.bin')

# One 300-wide row per user, pre-filled with NaN so that users without an
# occupation can be recognised (and imputed) later.
occu_embed = np.full((len(user_df), 300), np.nan)
occu_embed.shape

(130566, 300)

In [9]:
# Occupation embedding: sum of the fastText vectors of every comma-separated
# occupation title of a user. Users without an occupation keep an all-NaN row
# so the imputation step can fill it in later.
for i, occus in enumerate(user_df['occupation_titles']):
    if isinstance(occus, str):
        # Start the accumulator at zero. Starting from NaN (as before) makes
        # every sum NaN, which silently discarded all occupation information.
        word_vec = np.zeros(300)
        # str.split also handles the single-title case (returns a 1-item list).
        for occu in occus.split(','):
            word_vec += ft.get_word_vector(occu)
    else:
        # Missing value (NaN in the CSV): mark the whole row as missing.
        word_vec = np.full(300, np.nan)
    occu_embed[i, :] = word_vec

### interest

In [12]:
# Interest embeddings. Each entry of `interests` is a comma-separated list of
# "<main>_<sub>" tokens. Two 300-wide embeddings are built per user:
#   - main_int_embed: sum of the vectors of the *distinct* main interests
#   - sub_int_embed:  sum of the vectors of every sub interest
# Users without interests keep all-NaN rows so they can be imputed later.
n_users = user_df.shape[0]
main_int_embed = np.full((n_users, 300), np.nan)
sub_int_embed = np.full((n_users, 300), np.nan)

for i, interest in enumerate(user_df['interests']):
    if not isinstance(interest, str):
        # Missing value: leave the NaN rows untouched.
        continue

    # Accumulate from zero. Adding onto the NaN-initialised rows (as before)
    # left every row NaN regardless of the user's interests.
    main_vec = np.zeros(300)
    sub_vec = np.zeros(300)

    # A dict keeps first-seen order, so the summation order (and therefore the
    # floating-point result) is deterministic across runs, unlike a set.
    main_int_seen = {}

    # str.split covers both the single- and the multi-interest case.
    for int_ in interest.split(','):
        parts = int_.split('_')
        main_int, sub_int = parts[0], parts[1]
        main_int_seen[main_int] = None
        sub_vec += ft.get_word_vector(sub_int)

    # A main interest shared by several sub interests only counts once.
    for main_int in main_int_seen:
        main_vec += ft.get_word_vector(main_int)

    main_int_embed[i, :] = main_vec
    sub_int_embed[i, :] = sub_vec

### recreation

In [13]:
# One 300-wide row per user for the recreation embedding, pre-filled with NaN
# to mark "no value yet".
rec_embed = np.full((len(user_df), 300), np.nan)
rec_embed.shape

(130566, 300)

In [14]:
# Recreation embedding: sum of the fastText vectors of every comma-separated
# recreation name of a user. Users without recreations keep an all-NaN row so
# the imputation step can fill it in later.
for i, recs in enumerate(user_df['recreation_names']):
    if isinstance(recs, str):
        # Start the accumulator at zero. Starting from NaN (as before) makes
        # every sum NaN, which silently discarded all recreation information.
        word_vec = np.zeros(300)
        # str.split also handles the single-name case (returns a 1-item list).
        for rec in recs.split(','):
            word_vec += ft.get_word_vector(rec)
    else:
        # Missing value (NaN in the CSV): mark the whole row as missing.
        word_vec = np.full(300, np.nan)
    rec_embed[i, :] = word_vec

In [15]:
# Put the per-user feature blocks side by side (column-wise) in a fixed order:
# gender one-hot, occupation, main interest, sub interest, recreation.
feature_blocks = [gender_embed, occu_embed, main_int_embed, sub_int_embed, rec_embed]
user_embed = np.concatenate(feature_blocks, axis=1)
# (number of users, 1203)
user_embed.shape

(130566, 1203)

In [16]:
# Fill the NaN entries (users lacking occupation / interests / recreation)
# from each user's 10 nearest neighbours.
imputer = KNNImputer(n_neighbors=10)
# fit_transform returns a new array and leaves its input untouched, so the
# result must be assigned back; otherwise every later cell still sees NaNs.
# NOTE: by default KNNImputer drops columns that are entirely NaN, so the
# number of columns may shrink -- check the shape below.
user_embed = imputer.fit_transform(user_embed)
user_embed.shape

In [None]:
# Display the combined user feature matrix as the cell output.
user_embed

# KNN imputer

# Try clustering

In [49]:
# Silhouette sweep: cluster the users with K-means for K = 2..kmax and record
# the silhouette score of each K (higher is better).
sil = []
kmax = 10

# silhouette_score needs pairwise distances, which is quadratic in the number
# of rows and infeasible on the full user matrix. Score a fixed-size random
# subsample instead.
sil_sample_size = min(10000, user_embed.shape[0])

# dissimilarity would not be defined for a single cluster, thus, minimum number of clusters should be 2
for k in range(2, kmax+1):
    print(f"K: {k}, start clustering")
    # Fixed seeds make the sweep reproducible from run to run.
    kmeans = KMeans(n_clusters=k, random_state=0).fit(user_embed)
    labels = kmeans.labels_
    sil.append(
        silhouette_score(
            user_embed,
            labels,
            metric='euclidean',
            sample_size=sil_sample_size,
            random_state=0,
        )
    )
    print(f"K: {k}, finish clustering")

K: 2, start clustering


KeyboardInterrupt: 

# Label + user id preprocess

In [10]:
# Load the training labels: one row per user, with the user's subgroup ids
# stored as a single space-separated string in the `subgroup` column.
train_group_csv_path = os.path.join(DATA_ROOT, 'train_group.csv')
train_group_csv = pd.read_csv(train_group_csv_path)
# Show the table's shape together with its first rows.
train_group_csv.shape, train_group_csv.head()

((59737, 2),
                     user_id                                    subgroup
 0  5bdecbfffec014002166796a                                          27
 1  5fedf958af850a915c86362c  1 7 19 29 36 49 50 51 59 61 63 64 66 69 72
 2  5fd255c43136a460c6f3f930                                        8 28
 3  5a0bde2aa15b3f001e98429a                               1 59 60 71 79
 4  5fedf8132a0eb0bfab27882b                                          89)

In [11]:
# Lookup table: user_id -> that user's feature row in user_embed.
# user_df and user_embed share the same row order, so the two can be zipped.
# (Built without a loop variable named `id`, which shadowed the builtin.)
id2embed = dict(zip(user_df['user_id'], user_embed))

In [12]:
# Total number of (user, subgroup) pairs in the training set: every
# space-separated id in a `subgroup` string counts once; rows whose subgroup
# is missing (not a string) contribute nothing.
all_label_cnt = sum(
    len(subgroups.split(' '))
    for subgroups in train_group_csv['subgroup']
    if isinstance(subgroups, str)
)
all_label_cnt

234597

In [13]:
# Expand the multi-label training set into one row per (user, subgroup) pair.
# Each row holds the user's features followed by the subgroup id in the last
# column; the next cell splits that column off as the target.
n_feat = user_embed.shape[1]
X_train_group = np.zeros((all_label_cnt, n_feat + 1))

cnt = 0
for user_id, subgroups in zip(train_group_csv['user_id'], train_group_csv['subgroup']):
    # Users without any subgroup (missing value) produce no training rows.
    if not isinstance(subgroups, str):
        continue
    user_feat = id2embed[user_id]
    # split(' ') yields a single element when the user has only one subgroup.
    for label in subgroups.split(' '):
        X_train_group[cnt, :n_feat] = user_feat
        X_train_group[cnt, n_feat:] = int(label)
        cnt += 1

In [14]:
# Separate the trailing label column from the feature columns.
# NOTE: this cell is not idempotent. It rebinds X_train_group to the
# features-only view, so running it a second time yields an empty y.
n_feat = user_embed.shape[1]
y_train_group = X_train_group[:, n_feat:]
X_train_group = X_train_group[:, :n_feat]

In [15]:
# Sanity check: features are (n_pairs, n_features), labels are (n_pairs, 1).
X_train_group.shape, y_train_group.shape

((234597, 903), (234597, 1))

In [110]:
# Multinomial logistic-regression baseline on the expanded (user, subgroup)
# pairs. The default iteration budget was exhausted before convergence
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so max_iter is raised explicitly.
clf = LogisticRegression(random_state=0, max_iter=1000).fit(
    X_train_group, y_train_group.flatten()
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Try XGBoost

In [16]:
# Multi-class XGBoost model that outputs one probability per subgroup.
xgb_model = xgb.XGBClassifier(
    objective="multi:softprob",
    random_state=42)
# Class labels must start at zero, while subgroup ids start at 1.
# Pass them as a flat integer vector (as the logistic-regression cell does)
# rather than as a 2-D float column.
xgb_labels = (y_train_group.flatten() - 1).astype(int)
xgb_model.fit(X_train_group, xgb_labels)

# Import test data

In [None]:
# Load the two test splits ("seen" and "unseen"); the later cells read their
# user_id column to look up features and to label the predictions.
test_seen_csv_path = os.path.join(DATA_ROOT, './test/test_seen_group.csv')
test_unseen_csv_path = os.path.join(DATA_ROOT, './test/test_unseen_group.csv')
test_seen_df = pd.read_csv(test_seen_csv_path)
test_unseen_df = pd.read_csv(test_unseen_csv_path)

In [None]:
# Pre-allocate the test feature matrices: one row per test user, with the
# same number of columns as the user feature matrix.
n_feat = user_embed.shape[1]
X_seen = np.zeros((len(test_seen_df), n_feat))
X_unseen = np.zeros((len(test_unseen_df), n_feat))

In [None]:
# Copy each test user's feature row into the matching test matrix.
# The loop variable is `user_id` rather than `id`, which shadowed the builtin.
# A user_id missing from id2embed raises KeyError.
for i, user_id in enumerate(test_seen_df['user_id']):
    X_seen[i, :] = id2embed[user_id]

for i, user_id in enumerate(test_unseen_df['user_id']):
    X_unseen[i, :] = id2embed[user_id]

In [None]:
# Rank subgroups for every "seen" test user and keep the top_k most probable.
top_k = 50
pred_seen = xgb_model.predict_proba(X_seen)
# Sort ascending, reverse to descending, then keep the first top_k columns.
# top_k is now actually used instead of a second hard-coded 50.
# Column index c corresponds to subgroup id c + 1 (ids start from 1).
pred_top_50 = pred_seen.argsort()[:, ::-1][:, :top_k] + 1

In [None]:
# Turn each user's ranked subgroup ids into one space-separated string,
# the same format as the `subgroup` column of the training CSV.
pred_top_50_list = pred_top_50.tolist()
preds_seen = []
for ranked_ids in pred_top_50_list:
    preds_seen.append(' '.join(str(subgroup_id) for subgroup_id in ranked_ids))

In [None]:
# Assemble the "seen" submission table and write it to disk.
preds_seen_df = pd.DataFrame(
    {
        'user_id': test_seen_df['user_id'],
        'subgroup': preds_seen
    }
)
seen_out_path = './pred_record/1217/seen_group_1217_2.csv'
# to_csv does not create missing parent directories; create them first so a
# fresh checkout does not fail after the expensive prediction step.
os.makedirs(os.path.dirname(seen_out_path), exist_ok=True)
preds_seen_df.to_csv(seen_out_path, index=False)

In [None]:
# Rank subgroups for every "unseen" test user and keep the top_k most probable.
top_k = 50
pred_unseen = xgb_model.predict_proba(X_unseen)
# Sort ascending, reverse to descending, then keep the first top_k columns.
# top_k is now actually used instead of a second hard-coded 50.
# Column index c corresponds to subgroup id c + 1 (ids start from 1).
pred_top_50 = pred_unseen.argsort()[:, ::-1][:, :top_k] + 1

In [None]:
# Turn each "unseen" user's ranked subgroup ids into a space-separated string,
# assemble the submission table and write it to disk.
pred_top_50_list = pred_top_50.tolist()
preds_unseen = [' '.join(
    map(str, pred)
    ) for pred in pred_top_50_list]
preds_unseen_df = pd.DataFrame(
    {
        'user_id': test_unseen_df['user_id'],
        'subgroup': preds_unseen
    }
)
unseen_out_path = './pred_record/1217/unseen_group_1217_2.csv'
# to_csv does not create missing parent directories; create them first so a
# fresh checkout does not fail after the expensive prediction step.
os.makedirs(os.path.dirname(unseen_out_path), exist_ok=True)
preds_unseen_df.to_csv(unseen_out_path, index=False)