In [327]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from itertools import combinations
from scipy import sparse
from scipy.sparse.linalg import svds
import implicit

import random
# Seed both Python's and NumPy's global RNGs so that code drawing from them
# (e.g. np.random.choice when sampling test users) starts from a fixed state.
my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

import matplotlib.pyplot as plt
import json

In [2]:
import math
# ground_truth: list of items ordered by time
def nDCG_Time(ground_truth, _recList):
    """Time-weighted nDCG of a recommendation list.

    The item at position ``p`` of ``ground_truth`` gets graded relevance
    ``len(ground_truth) - p``, so entries nearer the front of the list weigh
    more. A recommended item found in ``ground_truth`` contributes
    ``(2**relevance - 1) / ln(rank + 2)`` (``rank`` is its 0-based position in
    ``_recList``). The sum is normalised by the DCG obtained by recommending
    ``ground_truth`` itself, truncated to ``len(_recList)`` items.

    Args:
        ground_truth: list of items, already ordered by time.
        _recList: recommended items, best first; its length is the cut-off K.

    Returns:
        ``recDCG / idealDCG`` as a float, or ``0.0`` when either list is
        empty (the ideal DCG is then zero and the ratio is undefined).
    """
    rec_num = len(_recList)  # top-K cut-off
    truth_len = len(ground_truth)

    # ground_truth is already sorted by time, so it is its own ideal ordering.
    idealDCG = 0.0
    for j in range(min(rec_num, truth_len)):
        idealDCG += (math.pow(2.0, truth_len - j) - 1) / math.log(2.0 + j)

    # Nothing to recommend or nothing to hit: avoid dividing by zero.
    if idealDCG == 0.0:
        return 0.0

    # First-occurrence position of every ground-truth item (what list.index
    # would return), built once instead of rescanning the list per item.
    first_position = {}
    for pos, item in enumerate(ground_truth):
        first_position.setdefault(item, pos)

    recDCG = 0.0
    for j, item in enumerate(_recList):
        pos = first_position.get(item)
        if pos is not None:
            relevance = truth_len - pos
            recDCG += (math.pow(2.0, relevance) - 1) / math.log(2.0 + j)

    return recDCG / idealDCG


def Recall(_test_set, _recList):
    """Recall of ``_recList`` against ``_test_set``, capped by the list length.

    The hit count is the number of distinct items present in both inputs.
    The denominator is ``min(len(_test_set), len(_recList))`` rather than
    ``len(_test_set)``, so a short recommendation list can still reach 1.0.

    Returns:
        A float in [0, 1]; ``0.0`` when either input is empty (the original
        expression divided by zero in that case).
    """
    hit = len(set(_recList).intersection(set(_test_set)))
    denominator = min(len(_test_set), len(_recList))
    if denominator == 0:
        return 0.0
    return hit / float(denominator)

def Precision(_test_set, _recList):
    """Fraction of the recommended items that appear in ``_test_set``.

    The hit count is the number of distinct items present in both inputs;
    the denominator is ``len(_recList)``.

    Returns:
        A float; ``0.0`` when ``_recList`` is empty (the original expression
        divided by zero in that case).
    """
    if len(_recList) == 0:
        return 0.0
    hit = len(set(_recList).intersection(set(_test_set)))
    return hit / float(len(_recList))

In [3]:
def ugf(scores):
    """Mean absolute difference over every unordered pair of ``scores``.

    Args:
        scores: iterable of numeric scores (one per group in this notebook).

    Returns:
        The mean of ``|a - b|`` over all pairs, or ``0.0`` when there are
        fewer than two scores. With no pairs there is no gap to measure, and
        ``np.mean([])`` would otherwise return nan with a RuntimeWarning.
    """
    pairwise_gaps = [abs(a - b) for a, b in combinations(scores, 2)]
    if not pairwise_gaps:
        return 0.0
    return np.mean(pairwise_gaps)

In [4]:
# Load the insurance training table; the first CSV row holds the column names.
train_csv_path = './data/insurance/Train.csv'
df = pd.read_csv(train_csv_path, sep=',', header=0)

In [5]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,ID,join_date,sex,marital_status,birth_year,branch_code,occupation_code,occupation_category_code,P5DA,RIBP,...,AHXO,BSTQ,FM3X,K6QO,QBOL,JWFN,JZ9D,J9JW,GHYX,ECY3
0,4WKQSBB,1/2/2019,F,M,1987,1X1H,2A7I,T4MS,0,0,...,0,0,0,1,0,0,0,0,0,0
1,CP5S02H,1/6/2019,F,M,1981,UAOD,2A7I,T4MS,0,0,...,0,0,0,1,0,0,0,0,0,0
2,2YKDILJ,1/6/2013,M,U,1991,748L,QZYX,90QI,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2S9E81J,1/8/2019,M,M,1990,1X1H,BP09,56SI,0,0,...,0,0,0,1,0,0,0,0,0,0
4,BHDYVFT,1/8/2019,M,M,1990,748L,NO3L,T4MS,0,0,...,0,0,0,0,0,0,1,1,0,0


In [9]:
# Distinct occupation categories; each one defines a user group further down.
occupation_category_column = df['occupation_category_code']
occupation_category_list = occupation_category_column.unique()

In [12]:
# Number of distinct occupation categories.
len(occupation_category_list)

6

In [36]:
# One array of row labels per occupation category, in the order the
# categories appear in occupation_category_list.
user_groups = [
    df[df['occupation_category_code'] == category].index.to_numpy()
    for category in occupation_category_list
]

In [37]:
# Row labels of the users in the first occupation group.
user_groups[0]

array([    0,     1,     4, ..., 29127, 29129, 29130])

In [38]:
# Report how many users fall into each occupation group.
for i in range(len(user_groups)):
    print(f"Number of users in group {i}: {user_groups[i].shape}")

Number of users in group 0: (18619,)
Number of users in group 1: (5552,)
Number of users in group 2: (3655,)
Number of users in group 3: (576,)
Number of users in group 4: (628,)
Number of users in group 5: (102,)


In [44]:
# The trailing columns of the table are treated as the items (products).
# NOTE(review): assumes exactly the last 21 columns are product flags - confirm against the data dictionary.
num_item_columns = 21
item_names = df.columns[-num_item_columns:]

In [45]:
# Number of item columns selected above.
item_names.size

21

In [48]:
# Item columns as a dense NumPy array: one row per table row, one column per item.
item_frame = df.loc[:, item_names]
rating_matrix = item_frame.to_numpy()
rating_matrix.shape

(29132, 21)

In [153]:
# Split to train/test: sample test users from every group without replacement.
# The sample size is capped at the group size, because
# np.random.choice(..., replace=False) raises ValueError when asked for more
# elements than the group contains.
TEST_USERS_PER_GROUP = 50
test_users = [
    np.random.choice(group, min(TEST_USERS_PER_GROUP, len(group)), replace=False)
    for group in user_groups
]

In [155]:
# Held-out item ids: one inner list per user group, filled by the split loop in the next cell.
test_item_ids = []

In [156]:
# Leave-one-out split: for every test user, hide one purchased item from the
# training matrix and remember it as that user's test item.
train_mat = rating_matrix.copy()
# Rebuilt here so that re-running this cell does not append a second copy of
# every group to a list created in another cell.
test_item_ids = []
for group_users in test_users:
    test_item_ids_i = []
    for user_id in group_users:
        purchased_item_ids = rating_matrix[user_id].nonzero()[0]
        # NOTE(review): this always holds out the lowest-index purchased item,
        # not a random one - confirm this is the intended protocol.
        test_item_id = purchased_item_ids[0]
        train_mat[user_id, test_item_id] = 0
        test_item_ids_i.append(test_item_id)
    # Same order as test_users[i], so position j lines up with user j.
    test_item_ids.append(test_item_ids_i)

### Collaborative filtering

In [157]:
# CSR copy of the training matrix; this is what is passed to the ALS model's fit/recommend calls.
sparse_train_mat = sparse.csr_matrix(train_mat)

In [165]:
# Implicit-feedback ALS matrix factorisation fitted on the training matrix.
# random_state pins the random factor initialisation; without it each fit
# starts from different factors and the accuracies reported below are not
# reproducible from run to run.
mf = implicit.als.AlternatingLeastSquares(
    factors=50,
    regularization=1,
    alpha=1.0,
    random_state=my_seed,
)
mf.fit(sparse_train_mat)

  0%|          | 0/15 [00:00<?, ?it/s]

In [166]:
# Top-1 hit rate per group: a recommendation counts as correct when the single
# best unseen item returned by the model is the user's held-out item.
acc_list = []
for i, group_users in enumerate(test_users):
    held_out_items = test_item_ids[i]
    right_recs = 0
    total_recs = 0
    for j, user_id in enumerate(group_users):
        recommendation = mf.recommend(
            user_id,
            sparse_train_mat[user_id],
            N=1,
            filter_already_liked_items=True,
        )
        top_item_id = recommendation[0][0]
        right_recs += int(top_item_id == held_out_items[j])
        total_recs += 1
    acc = float(right_recs) / float(total_recs)
    acc_list.append(acc)
    print(f"For group {i}: {right_recs}/{total_recs} correct recommendations, accuracy: {acc}.")

For group 0: 43/50 correct recommendations, accuracy: 0.86.
For group 1: 41/50 correct recommendations, accuracy: 0.82.
For group 2: 49/50 correct recommendations, accuracy: 0.98.
For group 3: 46/50 correct recommendations, accuracy: 0.92.
For group 4: 45/50 correct recommendations, accuracy: 0.9.
For group 5: 48/50 correct recommendations, accuracy: 0.96.


In [167]:
# Mean absolute pairwise gap between the per-group accuracies.
ugf(acc_list)

0.07466666666666667

### Content-based filtering

In [168]:
# Look at the raw table again before building user features from it.
df.head()

Unnamed: 0,ID,join_date,sex,marital_status,birth_year,branch_code,occupation_code,occupation_category_code,P5DA,RIBP,...,AHXO,BSTQ,FM3X,K6QO,QBOL,JWFN,JZ9D,J9JW,GHYX,ECY3
0,4WKQSBB,1/2/2019,F,M,1987,1X1H,2A7I,T4MS,0,0,...,0,0,0,1,0,0,0,0,0,0
1,CP5S02H,1/6/2019,F,M,1981,UAOD,2A7I,T4MS,0,0,...,0,0,0,1,0,0,0,0,0,0
2,2YKDILJ,1/6/2013,M,U,1991,748L,QZYX,90QI,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2S9E81J,1/8/2019,M,M,1990,1X1H,BP09,56SI,0,0,...,0,0,0,1,0,0,0,0,0,0
4,BHDYVFT,1/8/2019,M,M,1990,748L,NO3L,T4MS,0,0,...,0,0,0,0,0,0,1,1,0,0


In [183]:
# Work on a copy so the original frame keeps its raw categorical values.
num_df = df.copy()

In [323]:
# Replace each categorical column with its integer category codes.
# Codes are always computed from the untouched `df`, so re-running this cell
# gives the same result.
categorical_columns = [
    'sex',
    'marital_status',
    'branch_code',
    'occupation_code',
    'occupation_category_code',
]
for column in categorical_columns:
    num_df[column] = pd.Categorical(df[column], ordered=True).codes

In [324]:
# Check that the categorical columns now hold integer codes.
num_df.head()

Unnamed: 0,ID,join_date,sex,marital_status,birth_year,branch_code,occupation_code,occupation_category_code,P5DA,RIBP,...,AHXO,BSTQ,FM3X,K6QO,QBOL,JWFN,JZ9D,J9JW,GHYX,ECY3
0,4WKQSBB,1/2/2019,0,1,1987,0,17,5,0,0,...,0,0,0,1,0,0,0,0,0,0
1,CP5S02H,1/6/2019,0,1,1981,11,17,5,0,0,...,0,0,0,1,0,0,0,0,0,0
2,2YKDILJ,1/6/2013,1,5,1991,3,176,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2S9E81J,1/8/2019,1,1,1990,0,76,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,BHDYVFT,1/8/2019,1,1,1990,3,148,5,0,0,...,0,0,0,0,0,0,1,1,0,0


In [325]:
# Feature matrix: encoded demographic/occupation columns plus birth year.
# Target: the 'P5DA' item column.
feature_columns = [
    'sex',
    'marital_status',
    'birth_year',
    'occupation_code',
    'occupation_category_code',
]
target_column = 'P5DA'
train_X = num_df[feature_columns].to_numpy()
train_Y = num_df[target_column].to_numpy()

In [721]:
# Inspect the held-out item ids recorded for the group at index 3.
np.array(test_item_ids[3])

array([7, 7, 7, 8, 7, 7, 7, 7, 7, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 0, 7, 7, 7, 7])

In [700]:
# Same import as in the notebook's import cell; kept so this cell also runs on its own.
from sklearn.ensemble import RandomForestClassifier

# Random-forest hyper-parameters gathered in one place.
# class_weight weights class 1 500 times more heavily than class 0.
forest_params = dict(
    n_estimators=100,
    criterion='gini',
    max_depth=10,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight={0: 1, 1: 500},
    random_state=101,
)
simple_forest = RandomForestClassifier(**forest_params)
     

In [701]:
# Fit the forest on the feature matrix and target built above.
simple_forest.fit(train_X, train_Y)

In [702]:
# Number of training rows for which the forest predicts a non-zero label.
train_predictions = simple_forest.predict(train_X)
predicted_positive_rows = train_predictions.nonzero()[0]
len(predicted_positive_rows)

502

In [703]:
# Number of training rows that are predicted non-zero AND actually non-zero.
# Both index arrays are computed once and intersected; the previous list
# comprehension recomputed train_Y.nonzero() and scanned it linearly for
# every predicted row.
predicted_positive_rows = simple_forest.predict(train_X).nonzero()[0]
actual_positive_rows = train_Y.nonzero()[0]
len(np.intersect1d(predicted_positive_rows, actual_positive_rows))

39