In [1]:
import pymysql
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import warnings
# import nltk
# nltk.download('punkt')

warnings.filterwarnings("ignore")
vectorizer = TfidfVectorizer()
pd.set_option('display.max_columns', None)

# DB 연결 - db 정보 업데이트 필요
connection = pymysql.connect(host='13.125.49.128', port=53710, user='root', password='asd456',
                       db='chatbotBase', charset='utf8')

# db에서 데이터 추출하여 데이터프레임으로 저장
with connection.cursor() as cursor:
    sql = "select uuid, url, course_data, course, categorie, keyword, course_info, review from chat_bot_002"
    cursor.execute(sql)  
    rows = cursor.fetchall()

    df = pd.DataFrame(rows)
#     print(df.head())

connection.close()

# 전체 컬럼명
['uuid', 'url', 'course_data', 'categorie_data', 'keyword_data', 'course_info_data', 'grade_data', 'stu_num_data', 'teacher_data', 'playtime_data', 'level_data', 'price_data', 'review_data', 'dibs_num_data', 'teacher_info_data', 'course', 'categorie', 'keyword', 'course_info', 'grade', 'stu_num', 'teacher', 'playtime', 'level', 'price', 'review', 'dibs_num', 'teacher_info', 'ori_data']

('uuid', 'varchar(100)', 'NO', 'PRI', None, '')
('url', 'varchar(100)', 'YES', '', None, '')
('course_data', 'varchar(300)', 'YES', '', None, '')
('categorie_data', 'varchar(300)', 'YES', '', None, '')
('keyword_data', 'varchar(300)', 'YES', '', None, '')
('course_info_data', 'text', 'YES', '', None, '')
('grade_data', 'tinyint(4)', 'YES', '', None, '')
('stu_num_data', 'int(11)', 'YES', '', None, '')
('teacher_data', 'varchar(50)', 'YES', '', None, '')
('playtime_data', 'float', 'YES', '', None, '')
('level_data', 'int(11)', 'YES', '', None, '')
('price_data', 'int(11)', 'YES', '', None, '')
('review_data', 'text', 'YES', '', None, '')
('dibs_num_data', 'int(11)', 'YES', '', None, '')
('teacher_info_data', 'varchar(2000)', 'YES', '', None, '')
('course', 'varchar(300)', 'YES', '', None, '')
('categorie', 'varchar(300)', 'YES', '', None, '')
('keyword', 'varchar(300)', 'YES', '', None, '')
('course_info', 'text', 'YES', '', None, '')
('grade', 'float', 'YES', '', None, '')
('stu_num', 'int(11)', 'YES', '', None, '')
('teacher', 'varchar(50)', 'YES', '', None, '')
('playtime', 'float', 'YES', '', None, '')
('level', 'int(11)', 'YES', '', None, '')
('price', 'int(11)', 'YES', '', None, '')
('review', 'longtext', 'YES', '', None, '')
('dibs_num', 'int(11)', 'YES', '', None, '')
('teacher_info', 'varchar(2000)', 'YES', '', None, '')
('ori_data', 'longtext', 'YES', '', None, '')
('course_id', 'int(11)', 'YES', '', None, '')

In [2]:
# 데이터프레임 컬럼명 변경
df.columns = ['uuid', 'url', 'course_data', 'course', 'categorie', 'keyword', 'course_info', 'review']

In [3]:
df = df.dropna(subset=['review'], how='any', axis=0)

In [4]:
# 계획
# 강좌명, 카테고리, 키워드를 각각 df3, df4, df5, df6에 담은 후 함수를 통해 입력된 문장과 유사도가 높은 순서대로 내림차순 해주고 
# 4 곳에서 각각의 랭킹 점수를 매겨서 합산 점수 상위 5개의 강의 인덱스를 반환한다

In [5]:
# 강의평(review) doc2vec 모델 생성
sentences = df['review'].tolist()

model = gensim.models.doc2vec.Doc2Vec(vector_size=30, min_count=2, epochs=80)
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(sentences)]
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=80)
# model.save("d2v.model") - 모델 저장
# model = Doc2Vec.load("d2v.model") - 모델 불러오기

df['doc2vec_score'] = 0

In [6]:
# input sentence와 제일 유사한 아이템 내림차순 정렬 함수
def get_similar_documents(dataframe, input_sentence):
    # input sentence 벡터화
    input_vector = vectorizer.transform([input_sentence])

    # input vector와 입력된 DataFrame의 vectors 값 유사도 계산
    dit = dict()
    for idx, x in enumerate(cosine_similarity(input_vector.toarray(), dataframe['vectors'].tolist())[0]):
        dit[idx] = x
        
    # 유사도 높은 순으로 데이터프레임 내림차순 정렬
    dataframe = dataframe.iloc[[x[0] for x in sorted(dit.items(), key=lambda x : x[1], reverse=True)]]
    return dataframe

In [7]:
# 랭킹 점수 부여 함수
def dataframe_rank(df):
    df['rank'] = 0
    for i in range(len(df)):
        df['rank'][i] = len(df) - i
    return df

In [8]:
# get_similar_documents 함수 적용
# df3: 강좌명, df4: 카테고리, df5: 키워드, df6: 강의 정보

def recommendation(input_sentence):
    
    input_tokens = word_tokenize(input_sentence.lower())
    input_vec = model.infer_vector(input_tokens)
    most_similar = model.dv.most_similar([input_vec], topn=len(df))

    for i in range(len(df)):
        df['doc2vec_score'][int(most_similar[i][0])] = (most_similar[i][1]) * len(df)
    
    df3 = df[['uuid', 'course']].copy()
    df4 = df[['uuid', 'categorie']].copy()
    df5 = df[['uuid', 'keyword']].copy()
    df6 = df[['uuid', 'course_info']].copy()
    
    # TfidfVectorizer 로 문서 벡터화
    vectors4 = vectorizer.fit_transform(df6['course_info']) # 1360
    vectors = vectorizer.transform(df3['course']) # 1172
    vectors2 = vectorizer.transform(df4['categorie']) # 58
    vectors3 = vectorizer.transform(df5['keyword']) # 412
    
    # 데이터프레임에 벡터 보관용 컬럼 추가
    df3['vectors'] = list(vectors.toarray())
    df4['vectors'] = list(vectors2.toarray())
    df5['vectors'] = list(vectors3.toarray())
    df6['vectors'] = list(vectors4.toarray())
    
    df3 = get_similar_documents(df3, input_sentence)
    df4 = get_similar_documents(df4, input_sentence)
    df5 = get_similar_documents(df5, input_sentence)
    df6 = get_similar_documents(df6, input_sentence)

    df3.reset_index(drop=True, inplace=True)
    df4.reset_index(drop=True, inplace=True)
    df5.reset_index(drop=True, inplace=True)
    df6.reset_index(drop=True, inplace=True)
    
    # 랭킹 점수 부여 함수 적용
    df3 = dataframe_rank(df3)
    df4 = dataframe_rank(df4)
    df5 = dataframe_rank(df5)
    df6 = dataframe_rank(df6)
    
    # 강좌명, 카테고리, 키워드, 강의 정보 랭킹 점수 합산

    df3.sort_values(by='uuid', inplace=True)
    df4.sort_values(by='uuid', inplace=True)
    df5.sort_values(by='uuid', inplace=True)
    df6.sort_values(by='uuid', inplace=True)

    df3.reset_index(drop=True, inplace=True)
    df4.reset_index(drop=True, inplace=True)
    df5.reset_index(drop=True, inplace=True)
    df6.reset_index(drop=True, inplace=True)

    df_ranked = df[['uuid', 'url', 'course_data']].copy()
    df_ranked['rank_course'] = df3['rank']
    df_ranked['rank_category'] = df4['rank']
    df_ranked['rank_keyword'] = df5['rank']
    df_ranked['rank_course_info'] = df6['rank']

    df_ranked['rank_sum'] = df_ranked['rank_course'] + df_ranked['rank_category'] 
    + df_ranked['rank_keyword'] + df_ranked['rank_course_info'] + df['doc2vec_score']
    df_ranked.sort_values(by='rank_sum', ascending=False, inplace=True)
       
    return df_ranked[['uuid', 'course_data', 'url']][:5]

In [9]:
input_sentence = "파이썬 데이터 사이언스 강의 추천해줘"
recommendation(input_sentence)

Unnamed: 0,uuid,course_data,url
90,0cbd767cad2c8ca345041f9842475d2fbb3a7cfd1b3d4b...,실전 데이터 사이언스 Part1 파이썬 입문,https://www.inflearn.com/course/핵심-파이썬
144,14cd21a4fabd111a0a92c4e919e4621c06acabe2fdc5b3...,내 집 마련 파이썬,https://www.inflearn.com/course/내집-파이썬
208,1f0ab7da2aa40a0fde7d30b33f9f85f556bfe25dc78771...,데이터 사이언스 입문 부트캠프,https://www.inflearn.com/course/데이터-사이언스-입문캠프
30,03b26e73b5f345b10564722bcc63199d6ab0837dd515bc...,파이썬을 활용한 데이터분석과 IT보안,https://www.inflearn.com/course/python-data-se...
80,0b67f022cbb8cfa61333d998aa61bc935cfcec0d9fb97b...,파이썬 머신러닝,https://www.inflearn.com/course/파이썬-머신러닝-마소캠
