In [1]:
MECAB_PATH = "./univ_wordclouds_mecab"
OKT_PATH = "./univ_wordclouds_okt"

In [2]:
import pandas as pd
import numpy as np
print(pd.__version__)
print(np.__version__)

1.1.0
1.18.5


In [3]:
# pandas 설정: 모든 row와 모든 column을 출력하게 하고, 각 column 별 출력되는 글자수는 50자 이내
pd.set_option('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', 50)

# 위의 설정으로 간단하게 보는 df_specific
df_specific = pd.read_csv("./data_wrangled/df_specific_wrangle.csv", encoding="utf-8")
df_specific.head(2)

Unnamed: 0,대학코드,대학명,gen_info,env_info,food_info,study_info,office_info,facil_info,mhct_info,help_info,etc_info
0,AR000004,Universidad del Salvador,아르헨티나의 대다수 대학교는 한국처럼 캠퍼스가 있는 것이 아니라 도시 내에 몇개의 ...,대학교가 도시 여러곳에 분포하고 있으므로 각기 다르다. 하지만 남녀를 불문하고 부에...,기숙사는 학교에서 제공하지 않고 두가지 옵션이 주어진다. 홈스테이 혹은 게스트 하우...,아르헨티나에서의 대학 수업은 주로 오전 혹은 저녁에 있다. 대부분의 학생들이 일을하...,EL SALVADOR대학교는 국제 교육부가 잘 되있는 학교이다. 아르헨티나 대학교들...,부대시설로는 헬스장이 있다고 알고 있으나 시내와 먼 곳에 있었다. 동아리는 몇가지 ...,딱히 컬쳐쇼크를 받을 만 한 요소는 없었던 것 같다. 하지만 처음에 언어적인 면에서...,"살바도르 대학교 국제처에서 비자문제나 숙소문제(홈스테이경우) 등은 해결해 주고, 특...",
1,AU000019,Australian National University,"ANU has a very big campus, but unlike Yonsei, ...","The city center, called CIVIC, is 20min walk a...","There are many housing options, but I chose to...","In my personal opinions, classes are generally...","Overall, I did not have to ask for help much f...","There were many clubs for you to join at ANU, ...",I did have some culture shock during my first ...,"I was helped a lot, especially during the firs...","Overall, my stay in Australia has been nothing..."


In [4]:
# pandas 설정: 각 column 별 (혹은 한 cell에서) 출력되는 글자수 제한을 없애기
pd.set_option('display.max_colwidth', None)

In [5]:
import re
# removing junks from the string data
def remove_junk(str_data):
    str_data = str_data.replace("\\\\n", "")
    str_data = str_data.replace("\\n", "")
    str_data = str_data.replace("\n", "")
    str_data = str_data.replace("\\", "")
    str_data = str_data.replace("\\t", "")
    str_data = str_data.replace("NaN", "")

    # print(str_data) makes an error. too much of data
    # 한글, english 빼고 전부 날리기
    # str_data = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z|0-9|.,]', ' ', str_data)

    # 한글 빼고 다 날리기
    str_data = re.sub('[^가-힣ㄱ-ㅎㅏ]', ' ', str_data)

    # replace multiple spaces into single space
    str_data = ' '.join(str_data.split())
    return str_data

# accessing university info of given column (= accessing one cell)
def access_univ_info(dataframe, column, univ_code):
    df_row = dataframe.loc[dataframe["대학코드"] == univ_code]
    str_univ_info = df_row[column].to_string(index=False).lstrip()
    str_univ_info = remove_junk(str_univ_info)
    return str_univ_info

In [6]:
info_list = ['gen_info', 'env_info', 'food_info', 'study_info', 'office_info', 'facil_info', 'mhct_info', 'help_info', 'etc_info']

In [7]:
# https://github.com/lovit/soynlp
from soynlp.tokenizer import RegexTokenizer, LTokenizer
from collections import Counter

tokenizer = LTokenizer()
tokenizer

<soynlp.tokenizer._tokenizer.LTokenizer at 0x10e8c1190>

In [8]:
# read stopwords genarated from filter_stopwords() function
stopwords_file = open('stopwords_kr.txt','rt', encoding='UTF8')
stopwords_txt = stopwords_file.read()
stopwords_list = stopwords_txt.strip('').split(', ')

# get human selected stopwords
stopwords_nogada_file = open('stopwords_nogada.txt','rt', encoding='UTF8')
stopwords_nogada_txt = stopwords_nogada_file.read()
stopwords_nogada = stopwords_nogada_txt.strip('').split(', ')
stopwords_list = stopwords_list + stopwords_nogada
print(f"total {len(stopwords_list)} number of stopwords available")
print(stopwords_list[:3], stopwords_list[-4:])

total 1957 number of stopwords available
['있습니다', '수', '있는'] ['캠퍼스', '날씨', '빈칸', '이다\n\n']


In [9]:
def throw_corpus(df_specific, column_data: str, stopwords:list, univ_no: int):
    # get column data
    df_column = df_specific[column_data].fillna("")

    # preprocess all values in column data
    df_process = df_column.apply(remove_junk)
    
    # apply predefined soynlp tokenizer
    df_tokens = df_process.apply(tokenizer.tokenize)
    # print(df_tokens[-2:])

    univ_bow = df_tokens[univ_no]
    univ_name = df_specific["대학명"][univ_no]
    word_count = len(univ_bow)
    if word_count < 500:
        print(f"{univ_name}'s {column_data} has not enough words: {word_count}")
        return False

    elif word_count >= 500:
        print(f"{univ_name}'s {column_data} section has {word_count} words'")
        subtracted_bow_list = [item for item in univ_bow if item not in stopwords_list]
        counted_bow_keywords = Counter(subtracted_bow_list)
        counted_bow_tuple_list = list(Counter(counted_bow_keywords).items())
        zipped_bow_words, bow_frequency_list = map(list, zip(*counted_bow_tuple_list)) 
        corpus = " ".join(zipped_bow_words)
        print(corpus[:100])
        return corpus    

In [10]:
# 그래프에 retina display 적용
%config InlineBackend.figure_format = 'retina'

# 워드클라우드를 위한 나눔고딕 경로 설정
import matplotlib.font_manager as fm
fontpath = './font/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)

In [11]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

backgroundcolor = "white"
width=800
height=600
wordcloud = WordCloud(font_path = fontpath,
                        # stopwords = STOPWORDS, 
                        background_color = backgroundcolor, 
                        max_words= 50,
                        max_font_size= 300,
                        width = width, 
                        height = height)

In [12]:
# tokenizing sentences into nouns, verbs, etc
from konlpy.tag import Mecab

def generate_wordcloud_mecab(df_specific, column_data: str, stopwords_list: list,  corpus: str, univ_no: int, wordcloud):
    mecab_tokenizer = Mecab()

    # tokenization for mecab: http://openuiz.blogspot.com/2016/07/mecab-ko-dic.html
    mecab_pos = ["NNG", "NNP", "VV", "VA", "MAG"]

    mecab_keywords = [
            word
            for word, tag in mecab_tokenizer.pos(corpus, flatten=True)
            if len(word) > 1 and tag in mecab_pos and word not in stopwords_list
        ]
    
    univ_name = df_specific["대학명"][univ_no]
    print(univ_name)
    print(f"number of keywords: {len(mecab_keywords)}")
    print(mecab_keywords[:10])

    # 빈도수 별로 단어를 크게 표시한다
    str_keywords = ' '.join(mecab_keywords)
    mecab_wordcloud = wordcloud.generate(str_keywords)
    # plt.figure(figsize = (15 , 10))
    # plt.imshow(wordcloud)
    # plt.axis("off")
    # plt.show() 

    # save wordcloud png as file
    college_code = df_specific["대학코드"][univ_no]
    wordcloud.to_file(f"{MECAB_PATH}/{column_data}/{college_code}_{column_data}_mecab.png")



In [13]:
# tokenizing sentences into nouns, verbs, etc
from konlpy.tag import Okt

def okt_kor_tokenizer(raw, stopwords):
    okt_tokenizer = Okt()
    # position arguments(or position tags): 내가 뽑아내고 싶은 형태소들
    twitter_pos = ["Noun", "Alpha", "Verb", "Number", "Adverb"]
    twitter_list = [
        word
        # normalize 그랰ㅋㅋ -> 그래ㅋㅋ  # stemming 바뀌나->바뀌다
        for word, tag in okt_tokenizer.pos(raw, norm=True, stem=True)
        if len(word) > 1 and tag in twitter_pos and word not in stopwords
    ]
    return twitter_list


def generate_wordcloud_okt(df_specific, column_data: str, stopwords_list: list,  corpus: str, univ_no: int, wordcloud):
    
    okt_words = okt_kor_tokenizer(corpus, stopwords_list)
    okt_words_counted = Counter(okt_words)
    print("number of counted words:", len(okt_words_counted))
    
    univ_name = df_specific["대학명"][univ_no]
    print(univ_name)
    print(f"number of keywords: {len(okt_words_counted)}")

    # 빈도수 별로 단어를 크게 표시한다
    wordcloud.generate_from_frequencies(frequencies = okt_words_counted) # mecab으로 추출한 명사로 워드클라우드를 그린다.
    plt.figure(figsize = (15 , 10))
    # plt.imshow(wordcloud)
    # plt.axis("off")
    # plt.show() 

    # save wordcloud png as file
    college_code = df_specific["대학코드"][univ_no]
    wordcloud.to_file(f"{OKT_PATH}/{column_data}/{college_code}_{column_data}_okt.png")

In [14]:
column_data = "env_info"
sample_univ_no = 206

In [15]:
sample_corpus = throw_corpus(df_specific, column_data , stopwords_list, univ_no = sample_univ_no)

Public University of Navarre's env_info section has 2050 words'
앞서 말씀드렸듯이 내에는 카페테리아랑 도서관 스포츠센터가 존재하며 스포츠센터 시설은 되어있는 편입니다 테니스 신청하였는데 학생들에게는 스포츠 수업이나 시설 이용료가 싼 가격에 제공


In [16]:
generate_wordcloud_mecab(df_specific, column_data , stopwords_list, sample_corpus, sample_univ_no, wordcloud)

Public University of Navarre
number of keywords: 717
['앞서', '카페테리아', '도서관', '스포츠센터', '존재', '스포츠센터', '시설', '테니스', '신청', '스포츠']


In [17]:
generate_wordcloud_okt(df_specific, column_data , stopwords_list, sample_corpus, sample_univ_no, wordcloud)

number of counted words: 498
Public University of Navarre
number of keywords: 498


<Figure size 1080x720 with 0 Axes>

In [18]:
no_wordcloud_gen_info = []
no_wordcloud_env_info = []
no_wordcloud_food_info = []
no_wordcloud_study_info = []

In [19]:
column_data = "gen_info"

for i in range(470):
    corpus = throw_corpus(df_specific, column_data , stopwords_list, univ_no = i)
    if corpus is False:
        college_code = df_specific["대학코드"][i]
        no_wordcloud_gen_info.append(college_code)
        pass
    else:
        generate_wordcloud_mecab(df_specific, column_data , stopwords_list, corpus, i, wordcloud)
        generate_wordcloud_okt(df_specific, column_data , stopwords_list, corpus, i, wordcloud)

print(no_wordcloud_gen_info)

Universidad del Salvador's gen_info has not enough words: 82
Australian National University's gen_info section has 2114 words'
캔버라에서 중심이 도심 부근에 위치하여 접근성이 캔버라가 시드니 멜버른에 소도시라는 느낌이 강해 선호도가 낮은 듯 한데요 퀸즐랜드가 지망이었습니다 학기를 지내며 안전하고 깨끗하며 
Australian National University
number of keywords: 605
['캔버라', '중심', '도심', '부근', '위치', '접근성', '캔버라', '시드', '멜버른', '소도시']
number of counted words: 437
Australian National University
number of keywords: 437
Bond University's gen_info section has 1636 words'
대학교는 휴양지로 유명한 골드코스트에 위치해있습니다 서퍼스 파라다이스 브로드비치 등등 해변가가 인접해있습니다 테마파크나 동물원 주말마다 많아 바쁜 주말을 보내게 되실거에요 날씨는 
Bond University
number of keywords: 541
['대학교', '휴양지', '골드코스트', '위치', '퍼스', '파라다이스', '브로드', '비치', '해변가', '인접']
number of counted words: 401
Bond University
number of keywords: 401
Curtin University's gen_info has not enough words: 373
Curtin University of Technology's gen_info section has 1408 words'
서호주 퍼스에 대학입니다 퍼스는 서호주에서 도시이며 호주에서 번째로 도시이기는 한국과 비교하면 지방 시도시 정도의 도시입니다 연대보다 캠퍼스도 크고 건물도 많아서 길찾기 힘들 개강
Curtin Unive