# Load libraries

In [1]:
#pip install implicit

In [2]:
import pandas as pd
import numpy as np
import scipy 
import warnings
from tqdm.notebook import tqdm
import plotnine
from implicit.evaluation import  *
from implicit.als import AlternatingLeastSquares as ALS

warnings.filterwarnings('ignore')

# Load Data

In [4]:
data = pd.read_csv('../Data/LearningPlus/input_data.csv')

In [5]:
data.head()

Unnamed: 0,emp_no,course_cd,date,code,study_type,course_host,emp_nm,title_nm,jc_nm,org_lvl4_nm,org_lvl2_nm,job_grp_nm,course_nm,category
0,12528,50023796,2018-03-29,3_taken,Off,Internal,장병국,책임,Business Engineering,사업지원팀,개발혁신센터,Business Engineering,CE기본/심화통합,2329
1,12528,50023886,2018-02-21,3_taken,On,Internal,장병국,책임,Business Engineering,사업지원팀,개발혁신센터,Business Engineering,[교재보기]CE기본,2329
2,12632,50023893,2019-08-12,3_taken,On,Internal,송기황,책임,Infra Architecture,Enterprise TA팀,아키텍처담당,IT Professional,[이러닝]AI빅데이터 기본_R활용 기본,2355
3,12632,50023894,2019-03-17,3_taken,On,Internal,송기황,책임,Infra Architecture,Enterprise TA팀,아키텍처담당,IT Professional,[이러닝]AI빅데이터 기본_통계분석 기본,2355
4,12632,50023895,2019-03-17,3_taken,On,Internal,송기황,책임,Infra Architecture,Enterprise TA팀,아키텍처담당,IT Professional,[이러닝]AI빅데이터 기본_머신러닝 기본,2355


In [6]:
# Number of distinct employees and courses.
# The course_cd and course_nm counts differ: the same course name can carry a
# different course_cd depending on whether it is an internal or external course.
# For now, courses are identified by course_cd.
for column in ('emp_no', 'course_cd', 'course_nm'):
    print(data[column].nunique())

5854
1559
1556


In [7]:
# Courses that share one course_nm but appear under more than one course_cd.
# The aggregations are given as a list rather than a set so that the order of
# the resulting columns (unique, nunique) is deterministic across runs.
tmp = (
    data.groupby('course_nm')['course_cd']
    .agg(['unique', 'nunique'])
    .reset_index()
    .sort_values('nunique', ascending=False)
)
tmp.query("nunique != 1")

Unnamed: 0,course_nm,unique,nunique
1169,업무효율이 향상되는 엑셀 함수와 매크로(2013),"[50023975, HLSP22892]",2
1285,자바 프로그래밍 입문 강좌 (renew ver.) - 초보부터 개발자 취업까지!!,"[20200228000039, 182835]",2
516,[논어] 시대를 넘어 참 인간을 논하다,"[50021762, HLSP24064]",2


# Modeling

In [8]:
# Map every employee / course identifier to a dense 0-based index, numbered in
# order of first appearance in `data`.
user2idx = {emp: pos for pos, emp in enumerate(data['emp_no'].unique())}
course2idx = {course: pos for pos, course in enumerate(data['course_cd'].unique())}

In [9]:
# Reverse lookups: dense index -> original employee / course identifier.
idx2user = dict(zip(user2idx.values(), user2idx.keys()))
idx2course = dict(zip(course2idx.values(), course2idx.keys()))

In [10]:
# Keep only the two identifier columns and attach their dense indices.
sub_data = data[['emp_no', 'course_cd']].reset_index(drop=True)

# Plain arrays (not Series) so the assignment below is positional, not index-aligned.
useridx = data['emp_no'].apply(user2idx.__getitem__).values
courseidx = data['course_cd'].apply(course2idx.__getitem__).values
sub_data['useridx'] = useridx
sub_data['courseidx'] = courseidx

# Implicit feedback: every recorded row counts as one interaction of weight 1.
rating = np.ones(len(sub_data))

In [11]:
# Employee x course interaction matrix in CSR format.
n_users = len(set(useridx))
n_courses = len(set(courseidx))
sparse_matrix = scipy.sparse.csr_matrix(
    (rating, (useridx, courseidx)),
    shape=(n_users, n_courses),
)
sparse_matrix

<5854x1559 sparse matrix of type '<class 'numpy.float64'>'
	with 39535 stored elements in Compressed Sparse Row format>

In [12]:
# Dense ndarray copy of the employee x course interaction matrix.
# NOTE(review): `R` is not referenced by any other cell visible in this notebook;
# if nothing else needs it, this cell only allocates a full dense array.
R = sparse_matrix.toarray()

In [14]:
# Train an ALS model with 20 latent factors on the interaction matrix.
# `sparse_matrix` is employees x courses, so `.T` hands fit() a courses x employees matrix.
# NOTE(review): no seed is passed here, and the two recommendation tables shown later in
# this notebook order some items differently — presumably due to random initialisation.
# Consider passing a fixed random_state if the installed implicit version supports it — TODO confirm.
als_model = ALS(factors=20, regularization=0.01, iterations = 100)
als_model.fit(sparse_matrix.T)

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




In [15]:
als_model.recommend(0, sparse_matrix)

[(14, 0.44474617),
 (23, 0.17814311),
 (17, 0.17108533),
 (45, 0.17026107),
 (126, 0.11683133),
 (16, 0.11320813),
 (303, 0.11122804),
 (224, 0.09766561),
 (15, 0.09471109),
 (28, 0.08964201)]

In [16]:
# One row per employee; the 'unique' column holds the array of course codes already taken.
seen_list = (
    data.groupby(['emp_no'])['course_cd']
    .agg(['unique'])
    .reset_index()
)

In [17]:
# Build {emp_no: [recommended course_cd, ...]} for every employee.
#
# The courses each employee has already taken are looked up through a dict of
# sets built once up front, instead of filtering the whole `seen_list` frame
# inside the loop for every user.
seen_by_emp = {
    emp: set(courses)
    for emp, courses in zip(seen_list['emp_no'], seen_list['unique'])
}

total_rec_list = {}
for user in tqdm(sub_data['useridx'].unique()):
    employee = idx2user[user]
    seen = seen_by_emp[employee]

    # Ask for the top 50 items; each entry's first element is the course index.
    recs = als_model.recommend(user, sparse_matrix, N=50)

    # Translate indices back to course codes and drop anything already taken,
    # preserving the model's ranking order.
    total_rec_list[employee] = [
        course
        for course in (idx2course[rec[0]] for rec in recs)
        if course not in seen
    ]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5854.0), HTML(value='')))




In [19]:
def rec_course_by_als(data, total_rec_list, emp_no, recommend_num):
    """Return the courses an employee has taken and the top recommendations for them.

    Parameters
    ----------
    data : pandas.DataFrame
        Interaction table; the columns ``emp_no``, ``course_cd`` and
        ``course_nm`` are read.
    total_rec_list : dict
        Mapping ``emp_no -> list of recommended course_cd`` in ranked order.
    emp_no
        Employee to look up; must be a key of ``total_rec_list``.
    recommend_num : int
        Maximum number of recommendation rows to return.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``seen_df`` with the ``course_nm`` of every row of ``data`` belonging
        to the employee, and the first ``recommend_num`` rows of the
        recommendations joined with their course names.

    Raises
    ------
    KeyError
        If ``emp_no`` is not a key of ``total_rec_list``.
    """
    seen_df = data.loc[data['emp_no'] == emp_no, ['course_nm']]
    coursecd2nm = data[['course_nm', 'course_cd']].drop_duplicates()

    # Column labels are passed as a list: a set has no defined order and may be
    # rejected by recent pandas releases.
    rec_by_user = pd.DataFrame(total_rec_list[emp_no], columns=['course_cd'])

    # Left join keeps the ranking order of the recommendations.
    rec_info = pd.merge(rec_by_user, coursecd2nm, how='left', on='course_cd')

    return seen_df, rec_info[:recommend_num]

In [20]:
seen_by_user, rec_for_user = rec_course_by_als(data, total_rec_list, emp_no=83191, recommend_num=10)

In [21]:
seen_by_user

Unnamed: 0,course_nm
39338,딥러닝 실무
39339,시각지능 실무


In [22]:
rec_for_user

Unnamed: 0,course_cd,course_nm
0,50023879,언어지능 실무
1,50024396,AI 실전 Workshop
2,50023897,AI 심화
3,50023663,[교재보기]빅데이터 분석
4,50024002,[교재보기]AI
5,77623,김왼손의 미운코딩새끼: 4시간만에 끝내는 파이썬 기초 (전자책 포함)
6,50023818,빅데이터 분석 Workshop
7,50024390,[이러닝]Python 데이터 분석 기초
8,50024091,[개발언어 Skill-up]빅데이터 분석을 위한 Python
9,50023792,[개발언어 Skill-up]Algorithm 개발언어


# class로 만들기

In [45]:
class recommend_course_by_als():
    """ALS-based course recommender built from an employee/course interaction table.

    The table passed to the constructor must provide the columns ``emp_no``,
    ``course_cd`` and ``course_nm``; these are the only columns read here.
    Call :meth:`fit` once, then :meth:`recommend` for individual employees.
    """

    def __init__(self, data):
        # Only a reference is stored; all indexing and training happens in fit().
        self._data = data

    def fit(self, factors, regularization, iterations):
        """Index the employees/courses, build the interaction matrix and train ALS.

        Parameters
        ----------
        factors, regularization, iterations
            Forwarded unchanged to the ``ALS`` constructor.
        """
        # Dense 0-based indices (in order of first appearance) and their reverse maps.
        users = self._data['emp_no'].unique()
        courses = self._data['course_cd'].unique()
        self._user2idx = {user: idx for idx, user in enumerate(users)}
        self._course2idx = {course: idx for idx, course in enumerate(courses)}
        self._idx2user = {idx: user for user, idx in self._user2idx.items()}
        self._idx2course = {idx: course for course, idx in self._course2idx.items()}

        # Employee x course matrix with weight 1 per recorded row.
        self._sub_data = self._data[['emp_no', 'course_cd']].reset_index(drop=True)
        useridx = self._data['emp_no'].apply(self._user2idx.__getitem__).values
        courseidx = self._data['course_cd'].apply(self._course2idx.__getitem__).values
        self._sub_data['useridx'] = useridx
        self._sub_data['courseidx'] = courseidx
        rating = np.ones(len(self._sub_data))
        self._sparse_matrix = scipy.sparse.csr_matrix(
            (rating, (useridx, courseidx)),
            shape=(len(self._user2idx), len(self._course2idx)),
        )

        # Train on this instance's own matrix (transposed to courses x employees),
        # not on a notebook-level global, so the class works on a fresh kernel and
        # with data other than the one used in the cells above.
        self._als_model = ALS(factors=factors, regularization=regularization, iterations=iterations)
        self._als_model.fit(self._sparse_matrix.T)

    def recommend(self, emp_no, recommend_num):
        """Return the courses taken by ``emp_no`` and up to ``recommend_num`` recommendations.

        Only the requested employee is scored; the model is not queried for
        anyone else.

        Parameters
        ----------
        emp_no
            Employee identifier as it appears in the ``emp_no`` column.
        recommend_num : int
            Number of items requested from the model and maximum number of
            rows returned.

        Returns
        -------
        (pandas.DataFrame, pandas.DataFrame)
            ``seen_df`` (``course_nm`` of the employee's rows) and the ranked
            recommendations with columns ``course_cd`` and ``course_nm``.

        Raises
        ------
        KeyError
            If ``emp_no`` was not present in the data used by :meth:`fit`.
        """
        if emp_no not in self._user2idx:
            raise KeyError(f"emp_no {emp_no!r} was not present in the fitted data")
        user = self._user2idx[emp_no]

        own_rows = self._data['emp_no'] == emp_no
        seen = set(self._data.loc[own_rows, 'course_cd'])

        # Assumes the model returns a ranked sequence whose entries start with the
        # course index (as the (index, score) pairs printed in this notebook do).
        recs = self._als_model.recommend(user, self._sparse_matrix, N=recommend_num)
        rec_list = [
            course
            for course in (self._idx2course[rec[0]] for rec in recs)
            if course not in seen
        ]

        seen_df = self._data.loc[own_rows, ['course_nm']]
        coursecd2nm = self._data[['course_nm', 'course_cd']].drop_duplicates()

        # Column labels as a list (a set is unordered); left join keeps the ranking order.
        rec_by_user = pd.DataFrame(rec_list, columns=['course_cd'])
        rec_info = pd.merge(rec_by_user, coursecd2nm, how='left', on='course_cd')

        return seen_df, rec_info[:recommend_num]

In [46]:
test = recommend_course_by_als(data)

In [47]:
test.fit(factors=20, regularization=0.01, iterations = 100)

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




In [48]:
seen_df, rec_df = test.recommend(83191, 10)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5854.0), HTML(value='')))




In [49]:
seen_df

Unnamed: 0,course_nm
39338,딥러닝 실무
39339,시각지능 실무


In [50]:
rec_df

Unnamed: 0,course_cd,course_nm
0,50023879,언어지능 실무
1,50024396,AI 실전 Workshop
2,50023897,AI 심화
3,50023663,[교재보기]빅데이터 분석
4,77623,김왼손의 미운코딩새끼: 4시간만에 끝내는 파이썬 기초 (전자책 포함)
5,50024002,[교재보기]AI
6,50023818,빅데이터 분석 Workshop
7,50024390,[이러닝]Python 데이터 분석 기초
8,50023792,[개발언어 Skill-up]Algorithm 개발언어
9,50024091,[개발언어 Skill-up]빅데이터 분석을 위한 Python
