# Load libraries

In [2]:
pip install implicit

Collecting implicit
[?25l  Downloading https://files.pythonhosted.org/packages/bc/07/c0121884722d16e2c5beeb815f6b84b41cbf22e738e4075f1475be2791bc/implicit-0.4.4.tar.gz (1.1MB)
[K     |▎                               | 10kB 24.0MB/s eta 0:00:01[K     |▋                               | 20kB 30.3MB/s eta 0:00:01[K     |▉                               | 30kB 20.7MB/s eta 0:00:01[K     |█▏                              | 40kB 24.2MB/s eta 0:00:01[K     |█▌                              | 51kB 23.5MB/s eta 0:00:01[K     |█▊                              | 61kB 26.2MB/s eta 0:00:01[K     |██                              | 71kB 17.2MB/s eta 0:00:01[K     |██▍                             | 81kB 18.2MB/s eta 0:00:01[K     |██▋                             | 92kB 17.2MB/s eta 0:00:01[K     |███                             | 102kB 17.3MB/s eta 0:00:01[K     |███▎                            | 112kB 17.3MB/s eta 0:00:01[K     |███▌                            | 122kB 17.3MB/s eta

In [3]:
import pandas as pd
import numpy as np
import scipy 
import warnings
from tqdm.notebook import tqdm
import plotnine
from implicit.evaluation import  *
from implicit.als import AlternatingLeastSquares as ALS

warnings.filterwarnings('ignore')

# Load Data

In [5]:
# Load the raw course-history table; the path is resolved relative to the kernel's working directory.
data = pd.read_csv('input_data.csv')

In [6]:
# Preview the first rows to check the column layout.
data.head()

Unnamed: 0,emp_no,course_cd,date,code,study_type,course_host,emp_nm,title_nm,jc_nm,org_lvl4_nm,org_lvl2_nm,job_grp_nm,course_nm,category
0,12528,50023796,2018-03-29,3_taken,Off,Internal,장병국,책임,Business Engineering,사업지원팀,개발혁신센터,Business Engineering,CE기본/심화통합,2329
1,12528,50023886,2018-02-21,3_taken,On,Internal,장병국,책임,Business Engineering,사업지원팀,개발혁신센터,Business Engineering,[교재보기]CE기본,2329
2,12632,50023893,2019-08-12,3_taken,On,Internal,송기황,책임,Infra Architecture,Enterprise TA팀,아키텍처담당,IT Professional,[이러닝]AI빅데이터 기본_R활용 기본,2355
3,12632,50023894,2019-03-17,3_taken,On,Internal,송기황,책임,Infra Architecture,Enterprise TA팀,아키텍처담당,IT Professional,[이러닝]AI빅데이터 기본_통계분석 기본,2355
4,12632,50023895,2019-03-17,3_taken,On,Internal,송기황,책임,Infra Architecture,Enterprise TA팀,아키텍처담당,IT Professional,[이러닝]AI빅데이터 기본_머신러닝 기본,2355


In [7]:
# Number of distinct users and courses.
# The course_cd and course_nm counts differ: the same course name can carry a
# different course_cd depending on whether it is an internal or external course.
# For now, courses are identified by course_cd.
for column in ('emp_no', 'course_cd', 'course_nm'):
  print(data[column].nunique())

5854
1559
1556


In [56]:
# Courses that share a course_nm but map to more than one course_cd.
# The aggregations are passed as a list (not a set) so the resulting
# column order is deterministic across kernel sessions.
tmp = (
    data.groupby(['course_nm'])['course_cd']
    .agg(['nunique', 'unique'])
    .reset_index()
    .sort_values(['nunique'], ascending=False)
)
tmp.query("nunique != 1")

Unnamed: 0,course_nm,unique,nunique
1169,업무효율이 향상되는 엑셀 함수와 매크로(2013),"[50023975, HLSP22892]",2
1285,자바 프로그래밍 입문 강좌 (renew ver.) - 초보부터 개발자 취업까지!!,"[20200228000039, 182835]",2
516,[논어] 시대를 넘어 참 인간을 논하다,"[50021762, HLSP24064]",2


In [9]:
# Assign every distinct employee number / course code a dense 0-based index,
# numbered in the order the values are returned by `unique()`.
user2idx = {emp_no: idx for idx, emp_no in enumerate(data['emp_no'].unique())}
course2idx = {course_cd: idx for idx, course_cd in enumerate(data['course_cd'].unique())}

In [10]:
# Reverse lookups: dense index -> original employee number / course code.
idx2user = dict(zip(user2idx.values(), user2idx.keys()))
idx2course = dict(zip(course2idx.values(), course2idx.keys()))

In [11]:
# Keep only the (employee, course) pairs and attach their dense matrix indices.
sub_data = data[['emp_no', 'course_cd']].reset_index(drop=True)

useridx = data['emp_no'].map(user2idx).values
sub_data['useridx'] = useridx

courseidx = data['course_cd'].map(course2idx).values
sub_data['courseidx'] = courseidx

# Every recorded row counts as one interaction of weight 1.
rating = np.ones(len(sub_data))

In [12]:
# User x course interaction matrix in CSR form, sized by the number of
# distinct row / column indices.
# NOTE: per SciPy's (data, (row, col)) constructor, repeated (user, course)
# pairs are summed into a single entry.
sparse_matrix = scipy.sparse.csr_matrix(
    (rating, (useridx, courseidx)),
    shape=(np.unique(useridx).size, np.unique(courseidx).size),
)
sparse_matrix

<5854x1559 sparse matrix of type '<class 'numpy.float64'>'
	with 39535 stored elements in Compressed Sparse Row format>

In [13]:
# Dense ndarray copy of the user x course interaction matrix.
# NOTE(review): `R` is not referenced by any cell visible here — confirm it is
# still needed, since densifying allocates users * courses float64 values.
R = sparse_matrix.toarray()

In [14]:
# Train the ALS model on the transposed (course x user) matrix.
# NOTE: when run on GPU the library raises `factors` from 20 to 32 (see the
# message logged below), so the effective factor count depends on the backend.
als_model = ALS(
    factors=20,
    regularization=0.01,
    iterations=100,
)
als_model.fit(sparse_matrix.transpose())

GPU training requires factor size to be a multiple of 32. Increasing factors from 20 to 32.


HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [15]:
# Sanity check: recommendations for the user at matrix row 0, as (course index, score) pairs.
als_model.recommend(userid=0, user_items=sparse_matrix)

[(17, 0.18364996),
 (206, 0.12955385),
 (140, 0.106372476),
 (273, 0.09799971),
 (28, 0.08810562),
 (142, 0.07985218),
 (118, 0.07084918),
 (303, 0.063100964),
 (101, 0.057713937),
 (203, 0.054241635)]

In [None]:
# One row per employee with the array of distinct course codes they appear with.
# The aggregation is passed as a list (the documented form) rather than a set.
seen_list = data.groupby(['emp_no'])['course_cd'].agg(['unique']).reset_index()

In [57]:
# For every employee, collect the recommended course codes they have not taken yet.
# The "already taken" lookup is built once as emp_no -> set of course codes, so the
# loop no longer re-filters `seen_list` for each user.
seen_by_emp = {
  emp_no: set(courses)
  for emp_no, courses in zip(seen_list['emp_no'], seen_list['unique'])
}

total_rec_list = {}
for user in tqdm(sub_data['useridx'].unique()):
  emp_no = idx2user[user]
  seen = seen_by_emp[emp_no]

  # Top-50 (course index, score) pairs for this user's row of the matrix.
  recs = als_model.recommend(user, sparse_matrix, N=50)

  # Map matrix indices back to course codes, preserving the ranking order,
  # and drop anything the employee has already taken.
  rec_list = [
    course_cd
    for course_cd in (idx2course[rec[0]] for rec in recs)
    if course_cd not in seen
  ]

  total_rec_list[emp_no] = rec_list

HBox(children=(FloatProgress(value=0.0, max=5854.0), HTML(value='')))




In [72]:
# Course codes already taken by employee 83191, shown as a one-column frame.
seen_list.loc[seen_list['emp_no'] == 83191, ['unique']]

Unnamed: 0,unique
5795,"[50023846, 50023848]"


In [77]:
def rec_course_by_als(data, total_rec_list, emp_no, recommend_num):
  """Return the courses an employee has taken and their top recommendations.

  Args:
    data: DataFrame with at least `emp_no`, `course_cd` and `course_nm` columns.
    total_rec_list: mapping of emp_no -> ordered list of recommended course codes.
    emp_no: employee number to look up.
    recommend_num: maximum number of recommendation rows to return.

  Returns:
    A tuple `(seen_df, rec_info)`:
      seen_df: rows of `data` for `emp_no`, restricted to the `course_nm` column.
      rec_info: the first `recommend_num` recommendations with `course_cd` and
        `course_nm` columns, in the order stored in `total_rec_list`.

  Raises:
    KeyError: if `emp_no` has no entry in `total_rec_list`.
  """
  seen_df = data.loc[data['emp_no'] == emp_no, ['course_nm']]
  coursecd2nm = data[['course_nm', 'course_cd']].drop_duplicates()

  # Column labels must be a list: recent pandas rejects a set here.
  rec_by_user = pd.DataFrame(total_rec_list[emp_no], columns=['course_cd'])
  # Left merge keeps the recommendation order while attaching course names.
  rec_info = pd.merge(rec_by_user, coursecd2nm, how='left', on='course_cd')

  return seen_df, rec_info.iloc[:recommend_num]

In [78]:
# Look up taken courses and the top-10 recommendations for employee 83191.
seen_by_user, rec_for_user = rec_course_by_als(
    data=data,
    total_rec_list=total_rec_list,
    emp_no=83191,
    recommend_num=10,
)

In [79]:
# Course names already taken by the selected employee.
seen_by_user

Unnamed: 0,course_nm
39338,딥러닝 실무
39339,시각지능 실무


In [80]:
# Recommended courses (code and name) for the selected employee, in ranking order.
rec_for_user

Unnamed: 0,course_cd,course_nm
0,50023879,언어지능 실무
1,50024396,AI 실전 Workshop
2,50023897,AI 심화
3,50023818,빅데이터 분석 Workshop
4,50023663,[교재보기]빅데이터 분석
5,77623,김왼손의 미운코딩새끼: 4시간만에 끝내는 파이썬 기초 (전자책 포함)
6,50024002,[교재보기]AI
7,50024390,[이러닝]Python 데이터 분석 기초
8,50024091,[개발언어 Skill-up]빅데이터 분석을 위한 Python
9,50024383,DAP MLDL Workshop
