In [1]:
MECAB_PATH = "./univ_wordclouds_mecab"
OKT_PATH = "./univ_wordclouds_okt"

In [2]:
import pandas as pd
import numpy as np
print(pd.__version__)
print(np.__version__)

1.1.0
1.18.5


In [3]:
# pandas 설정: 모든 row와 모든 column을 출력하게 하고, 각 column 별 출력되는 글자수는 50자 이내
pd.set_option('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', 50)

# 위의 설정으로 간단하게 보는 df_specific
df_specific = pd.read_csv("./data_wrangled/df_specific_wrangle.csv", encoding="utf-8")
df_specific.head(2)

Unnamed: 0,대학코드,대학명,gen_info,env_info,food_info,study_info,office_info,facil_info,mhct_info,help_info,etc_info
0,AR000004,Universidad del Salvador,아르헨티나의 대다수 대학교는 한국처럼 캠퍼스가 있는 것이 아니라 도시 내에 몇개의 ...,대학교가 도시 여러곳에 분포하고 있으므로 각기 다르다. 하지만 남녀를 불문하고 부에...,기숙사는 학교에서 제공하지 않고 두가지 옵션이 주어진다. 홈스테이 혹은 게스트 하우...,아르헨티나에서의 대학 수업은 주로 오전 혹은 저녁에 있다. 대부분의 학생들이 일을하...,EL SALVADOR대학교는 국제 교육부가 잘 되있는 학교이다. 아르헨티나 대학교들...,부대시설로는 헬스장이 있다고 알고 있으나 시내와 먼 곳에 있었다. 동아리는 몇가지 ...,딱히 컬쳐쇼크를 받을 만 한 요소는 없었던 것 같다. 하지만 처음에 언어적인 면에서...,"살바도르 대학교 국제처에서 비자문제나 숙소문제(홈스테이경우) 등은 해결해 주고, 특...",
1,AU000019,Australian National University,"ANU has a very big campus, but unlike Yonsei, ...","The city center, called CIVIC, is 20min walk a...","There are many housing options, but I chose to...","In my personal opinions, classes are generally...","Overall, I did not have to ask for help much f...","There were many clubs for you to join at ANU, ...",I did have some culture shock during my first ...,"I was helped a lot, especially during the firs...","Overall, my stay in Australia has been nothing..."


In [4]:
# pandas 설정: 각 column 별 (혹은 한 cell에서) 출력되는 글자수 제한을 없애기
pd.set_option('display.max_colwidth', None)

In [5]:
import re
# removing junks from the string data
def remove_junk(str_data):
    str_data = str_data.replace("\\\\n", "")
    str_data = str_data.replace("\\n", "")
    str_data = str_data.replace("\n", "")
    str_data = str_data.replace("\\", "")
    str_data = str_data.replace("\\t", "")
    str_data = str_data.replace("NaN", "")

    # print(str_data) makes an error. too much of data
    # 한글, english 빼고 전부 날리기
    # str_data = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z|0-9|.,]', ' ', str_data)

    # 한글 빼고 다 날리기
    str_data = re.sub('[^가-힣ㄱ-ㅎㅏ]', ' ', str_data)

    # replace multiple spaces into single space
    str_data = ' '.join(str_data.split())
    return str_data

# accessing university info of given column (= accessing one cell)
def access_univ_info(dataframe, column, univ_code):
    df_row = dataframe.loc[dataframe["대학코드"] == univ_code]
    str_univ_info = df_row[column].to_string(index=False).lstrip()
    str_univ_info = remove_junk(str_univ_info)
    return str_univ_info

In [6]:
info_list = ['gen_info', 'env_info', 'food_info', 'study_info', 'office_info', 'facil_info', 'mhct_info', 'help_info', 'etc_info']

In [7]:
# https://github.com/lovit/soynlp
from soynlp.tokenizer import RegexTokenizer, LTokenizer
from collections import Counter

tokenizer = LTokenizer()
tokenizer

<soynlp.tokenizer._tokenizer.LTokenizer at 0x1122a34c0>

In [26]:
# read stopwords genarated from filter_stopwords() function
stopwords_file = open('stopwords_kr.txt','rt', encoding='UTF8')
stopwords_txt = stopwords_file.read()
stopwords_list = stopwords_txt.strip('').split(', ')

# get human selected stopwords
stopwords_nogada_file = open('stopwords_nogada.txt','rt', encoding='UTF8')
stopwords_nogada_txt = stopwords_nogada_file.read()
stopwords_nogada = stopwords_nogada_txt.strip('').split(', ')
stopwords_list = stopwords_list + stopwords_nogada
print(f"total {len(stopwords_list)} number of stopwords available")
print(stopwords_list[:3], stopwords_list[-4:])

total 1983 number of stopwords available
['있습니다', '수', '있는'] ['레스토랑', '지역', '식당', '다니다']


In [9]:
def throw_corpus(df_specific, column_data: str, stopwords:list, univ_no: int):
    # get column data
    df_column = df_specific[column_data].fillna("")

    # preprocess all values in column data
    df_process = df_column.apply(remove_junk)
    
    # apply predefined soynlp tokenizer
    df_tokens = df_process.apply(tokenizer.tokenize)
    # print(df_tokens[-2:])

    univ_bow = df_tokens[univ_no]
    univ_name = df_specific["대학명"][univ_no]
    word_count = len(univ_bow)
    if word_count < 500:
        print(f"{univ_name}'s {column_data} has not enough words: {word_count}")
        return False

    elif word_count >= 500:
        print(f"{univ_name}'s {column_data} section has {word_count} words'")
        subtracted_bow_list = [item for item in univ_bow if item not in stopwords_list]
        counted_bow_keywords = Counter(subtracted_bow_list)
        counted_bow_tuple_list = list(Counter(counted_bow_keywords).items())
        zipped_bow_words, bow_frequency_list = map(list, zip(*counted_bow_tuple_list)) 
        corpus = " ".join(zipped_bow_words)
        print(corpus[:100])
        return corpus    

In [10]:
# 그래프에 retina display 적용
%config InlineBackend.figure_format = 'retina'

# 워드클라우드를 위한 나눔고딕 경로 설정
import matplotlib.font_manager as fm
fontpath = './font/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)

In [11]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

backgroundcolor = "white"
width=800
height=600
wordcloud = WordCloud(font_path = fontpath,
                        # stopwords = STOPWORDS, 
                        background_color = backgroundcolor, 
                        max_words= 50,
                        max_font_size= 300,
                        width = width, 
                        height = height)

In [12]:
# tokenizing sentences into nouns, verbs, etc
from konlpy.tag import Mecab

def generate_wordcloud_mecab(df_specific, column_data: str, stopwords_list: list,  corpus: str, univ_no: int, wordcloud):
    mecab_tokenizer = Mecab()

    # tokenization for mecab: http://openuiz.blogspot.com/2016/07/mecab-ko-dic.html
    mecab_pos = ["NNG", "NNP", "VV", "VA", "MAG"]

    mecab_keywords = [
            word
            for word, tag in mecab_tokenizer.pos(corpus, flatten=True)
            if len(word) > 1 and tag in mecab_pos and word not in stopwords_list
        ]
    
    univ_name = df_specific["대학명"][univ_no]
    print(univ_name)
    print(f"number of keywords: {len(mecab_keywords)}")
    print(mecab_keywords[:10])

    # 빈도수 별로 단어를 크게 표시한다
    str_keywords = ' '.join(mecab_keywords)
    mecab_wordcloud = wordcloud.generate(str_keywords)
    # plt.figure(figsize = (15 , 10))
    # plt.imshow(wordcloud)
    # plt.axis("off")
    # plt.show() 

    # save wordcloud png as file
    college_code = df_specific["대학코드"][univ_no]
    wordcloud.to_file(f"{MECAB_PATH}/{column_data}/{college_code}_{column_data}_mecab.png")



In [21]:
# tokenizing sentences into nouns, verbs, etc
from konlpy.tag import Okt

def okt_kor_tokenizer(raw, stopwords):
    okt_tokenizer = Okt()
    # position arguments(or position tags): 내가 뽑아내고 싶은 형태소들
    twitter_pos = ["Noun", "Alpha", "Verb", "Number", "Adverb"]
    twitter_list = [
        word
        # normalize 그랰ㅋㅋ -> 그래ㅋㅋ  # stemming 바뀌나->바뀌다
        for word, tag in okt_tokenizer.pos(raw, norm=True, stem=True)
        if len(word) > 1 and tag in twitter_pos and word not in stopwords
    ]
    return twitter_list


def generate_wordcloud_okt(df_specific, column_data: str, stopwords_list: list,  corpus: str, univ_no: int, wordcloud):
    
    okt_words = okt_kor_tokenizer(corpus, stopwords_list)
    okt_words = [item.replace('추다', '춥다') for item in okt_words]
    okt_words = [item.replace('걸다', '걷다') for item in okt_words]
    okt_words_counted = Counter(okt_words)
    print("number of counted words:", len(okt_words_counted))
    
    univ_name = df_specific["대학명"][univ_no]
    print(univ_name)
    print(f"number of keywords: {len(okt_words_counted)}")

    # 빈도수 별로 단어를 크게 표시한다
    wordcloud.generate_from_frequencies(frequencies = okt_words_counted) # mecab으로 추출한 명사로 워드클라우드를 그린다.
    # plt.figure(figsize = (15 , 10))
    # plt.imshow(wordcloud)
    # plt.axis("off")
    # plt.show() 

    # save wordcloud png as file
    college_code = df_specific["대학코드"][univ_no]
    wordcloud.to_file(f"{OKT_PATH}/{column_data}/{college_code}_{column_data}_okt.png")

In [14]:
column_data = "env_info"
sample_univ_no = 206

In [15]:
sample_corpus = throw_corpus(df_specific, column_data , stopwords_list, univ_no = sample_univ_no)

Public University of Navarre's env_info section has 2050 words'
앞서 말씀드렸듯이 내에는 카페테리아랑 도서관 스포츠센터가 존재하며 스포츠센터 시설은 되어있는 편입니다 테니스 신청하였는데 학생들에게는 스포츠 수업이나 시설 이용료가 싼 가격에 제공


In [16]:
generate_wordcloud_mecab(df_specific, column_data , stopwords_list, sample_corpus, sample_univ_no, wordcloud)

Public University of Navarre
number of keywords: 699
['앞서', '카페테리아', '도서관', '스포츠센터', '존재', '스포츠센터', '시설', '테니스', '신청', '스포츠']


In [17]:
generate_wordcloud_okt(df_specific, column_data , stopwords_list, sample_corpus, sample_univ_no, wordcloud)

number of counted words: 493
Public University of Navarre
number of keywords: 493


In [18]:
no_wordcloud_gen_info = []
no_wordcloud_env_info = []
no_wordcloud_food_info = []
no_wordcloud_study_info = []

In [19]:
column_data = "gen_info"

for i in range(470):
    corpus = throw_corpus(df_specific, column_data , stopwords_list, univ_no = i)
    if corpus is False:
        college_code = df_specific["대학코드"][i]
        no_wordcloud_gen_info.append(college_code)
        pass
    else:
        generate_wordcloud_mecab(df_specific, column_data , stopwords_list, corpus, i, wordcloud)
        generate_wordcloud_okt(df_specific, column_data , stopwords_list, corpus, i, wordcloud)

print(no_wordcloud_gen_info)

bama
number of keywords: 534
['동쪽', '조지', '서쪽', '미시시피', '플로리다', '면적', '대한민국', '자랑', '버밍엄', '몽고메리']
number of counted words: 439
University of Alabama
number of keywords: 439
University of Arkansas's gen_info has not enough words: 102
University of California Berkeley's gen_info section has 31853 words'
연대와 크기가 비슷하다고 말씀하시는 많은데 체감 상 연대보다 넓게 느껴졌습니다 수업인데도 멀리 떨어져 건물에서 많아서 캠퍼스를 걸어 다녔습니다 날씨는 춥습니다 햇빛은 강해도 바람이
University of California Berkeley
number of keywords: 4932
['체감', '멀리', '햇빛', '바람', '하루', '종일', '반팔', '후드', '야상', '종류']
number of counted words: 2281
University of California Berkeley
number of keywords: 2281
University of California Davis's gen_info section has 13815 words'
크기가 넓어서 자전거 다녀야했어요 처음엔 헤맸는데 지리가 금방 익숙해져서 괜찮았어요 캘리포니아 날씨가 따뜻하다고 했는데 데이비스는 북부 캘리포니아라 겨울에 쌀쌀했어요 면적은 넓지만
University of California Davis
number of keywords: 2580
['자전거', '지리', '금방', '괜찮', '캘리포니아', '데이비스', '북부', '캘리포니아', '겨울', '면적']
number of counted words: 1388
University of California Davis
number of keywords: 1388
Univer

### lack of gen_info
['AR000004', 'AU000002', 'AU000010', 'AU000014', 'AT000002', 'BE000003', 'BE000002', 'CA000008', 'CA000009', 'CA000010', 'CA000021', 'CA000019', 'CN000021', 'CN000012', 'CN000019', 'CR000002', 'DK000005', 'FI000001', 'FI000005', 'FI000011', 'FI000012', 'FR000008', 'FR000014', 'FR000017', 'FR000020', 'FR000025', 'FR000029', 'FR000030', 'FR000033', 'DE000004', 'DE000005', 'DE000007', 'DE000013', 'DE000014', 'ID000001', 'IE000003', 'IT000009', 'IT000010', 'IT000011', 'JP000001', 'JP000009', 'JP000010', 'JP000016', 'JP000017', 'JP000018', 'JP000036', 'JP000020', 'JP000026', 'JP000028', 'JP000032', 'JP000035', 'KZ000002', '730     ', 'MX000009', 'MX000014', 'MN000001', 'NL000015', 'NL000009', 'PH000002', 'PH000001', 'PL000001', 'SG000004', 'ES000018', 'ES000006', 'ES000008', 'ES000010', 'ES000011', 'ES000012', 'ES000013', 'SE000002', 'SE000004', 'SE000012', 'CH000001', 'CH000006', 'TW000001', 'TW000009', 'TW000005', 'TW000007', 'TH000002', 'TR000001', 'GB000027', 'GB000031', 'GB000037', 'GB000001', 'GB000028', 'GB000002', 'GB000003', 'GB000004', 'GB000038', 'GB000030', 'GB000007', 'GB000008', 'GB000014', 'GB000016', 'GB000020', 'US000002', 'US000303', 'US000004', 'US000011', 'US000282', 'US000018', 'US000029', 'US000301', 'US000043', 'US000044', 'US000047', 'US000048', 'US000293', 'US000049', 'US000050', 'US000053', 'US000056', 'US000024', 'US000025', 'US000026', 'US000036', 'US000032', 'US000033', 'US000057', 'US000400', 'US000063', 'US000064', 'US000065', 'US000068', 'US000075', 'US000082', 'US000084', 'US000088', 'US000093', 'US000097', 'US000099', 'US000100', 'US000102', 'US000106', 'US000112', 'US000113', 'US000120', 'US000124', 'US000126', 'US000129', 'US000281', 'US000133', 'US000134', 'US000157', 'US000161', 'US000162', 'US000165', 'US000166', 'US000167', 'US000280', 'US000183', 'US000187', 'US000203', 'US000210', 'US000286', 'US000213', 'US000218', 'US000219', 'US000220', 'US000221', 'US000226', 'US000227', 'US000228', 'US000287', 'US000236', 'US000238', 'US000243', 'US000245', 'US000254', 'US000292', 'US000255', 'US000258', 'US000263', 'US000265', 'US000267', 'US000268', 'US000274', 'US000278']


In [25]:
column_data = "env_info"

for i in range(470):
    corpus = throw_corpus(df_specific, column_data , stopwords_list, univ_no = i)
    if corpus is False:
        college_code = df_specific["대학코드"][i]
        no_wordcloud_env_info.append(college_code)
        pass
    else:
        generate_wordcloud_mecab(df_specific, column_data , stopwords_list, corpus, i, wordcloud)
        generate_wordcloud_okt(df_specific, column_data , stopwords_list, corpus, i, wordcloud)

print(no_wordcloud_env_info)

Universidad del Salvador's env_info has not enough words: 38
Australian National University's env_info section has 1732 words'
자체는 자연입니다 정문에서 걸어서 정도면 생긴지 얼마 안된 쇼핑센터가 있구요 영화관 클럽 전부 가까워요 캔버라 자체가 도시입니다만 기숙사에만 거주하신다면 대중교통을 이용할 필요 없
Australian National University
number of keywords: 471
['자체', '자연', '정문', '얼마', '쇼핑', '센터', '영화관', '클럽', '전부', '캔버라']
number of counted words: 371
Australian National University
number of keywords: 371
Bond University's env_info section has 1739 words'
주변은 조용하고 깨끗한 평화로운 전원마을의 느낌입니다 분정도 걸어나가면 병원 약국 이용할 괜찮은 식당들이 밖에서 밥을 먹을 땐 보통 이곳에서 먹었습니다 호수를 주변으로 산책로도 예
Bond University
number of keywords: 492
['평화', '전원', '마을', '느낌', '분정', '병원', '약국', '괜찮', '보통', '호수']
number of counted words: 347
Bond University
number of keywords: 347
Curtin University's env_info has not enough words: 482
Curtin University of Technology's env_info section has 1013 words'
커튼 주변에는 제외하고 별다른 커튼이 벤틀리는 존과 존 사이에 교통 요금이 들지는 공항에서도 거리이고 시티까지도 분이면 대중교통이 발달해 있지 않기 대중교통을 이용한다면 배로 걸립
Curtin University of Tech

KeyboardInterrupt: 

['AR000004', 'AU000002', 'AU000010', 'AU000014', 'AT000002', 'AT000003', 'BE000003', 'BE000002', 'CA000008', 'CA000009', 'CA000010', 'CA000021', 'CA000017', 'CA000019', 'CN000021', 'CN000012', 'CN000019', 'CR000002', 'DK000005', 'DK000002', 'FI000001', 'FI000005', 'FI000011', 'FI000012', 'FR000004', 'FR000007', 'FR000008', 'FR000012', 'FR000014', 'FR000016', 'FR000017', 'FR000020', 'FR000025', 'FR000029', 'FR000030', 'FR000033', 'DE000004', 'DE000005', 'DE000007', 'DE000013', 'DE000014', 'ID000001', 'IE000003', 'IT000009', 'IT000010', 'IT000011', 'JP000001', 'JP000009', 'JP000010', 'JP000016', 'JP000017', 'JP000018', 'JP000036', 'JP000020', 'JP000026', 'JP000028', 'JP000029', 'JP000032', 'JP000035', 'KZ000002', '730     ', 'MX000004', 'MX000009', 'MX000014', 'MN000001', 'NL000014', 'NL000015', 'NL000009', 'NL000011', 'PH000002', 'PH000001', 'PL000001', 'SG000004', 'ES000018', 'ES000006', 'ES000008', 'ES000010', 'ES000011', 'ES000012', 'ES000013', 'SE000002', 'SE000004', 'SE000012', 'CH000001', 'CH000006', 'TW000001', 'TW000002', 'TW000009', 'TW000005', 'TW000007', 'TH000002', 'TH000003', 'TR000002', 'TR000001', 'GB000027', 'GB000031', 'GB000037', 'GB000001', 'GB000028', 'GB000002', 'GB000003', 'GB000004', 'GB000038', 'GB000039', 'GB000030', 'GB000007', 'GB000008', 'GB000011', 'GB000014', 'GB000016', 'GB000020', 'US000001', 'US000002', 'US000303', 'US000004', 'US000011', 'US000282', 'US000018', 'US000029', 'US000301', 'US000043', 'US000044', 'US000045', 'US000047', 'US000048', 'US000293', 'US000049', 'US000050', 'US000053', 'US000056', 'US000024', 'US000025', 'US000026', 'US000021', 'US000036', 'US000032', 'US000033', 'US000057', 'US000400', 'US000063', 'US000064', 'US000065', 'US000068', 'US000075', 'US000082', 'US000084', 'US000088', 'US000093', 'US000097', 'US000099', 'US000100', 'US000102', 'US000106', 'US000112', 'US000113', 'US000120', 'US000124', 'US000126', 'US000129', 'US000281', 'US000133', 'US000134', 'US000157', 'US000161', 'US000162', 'US000165', 'US000166', 'US000167', 'US000280', 'US000183', 'US000187', 'US000203', 'US000208', 'US000210', 'US000286', 'US000213', 'US000218', 'US000219', 'US000220', 'US000221', 'US000226', 'US000227', 'US000228', 'US000287', 'US000236', 'US000238', 'US000243', 'US000245', 'US000254', 'US000292', 'US000255', 'US000258', 'US000263', 'US000265', 'US000267', 'US000268', 'US000274', 'US000278']

In [None]:
# 주변, 시내, 이기, 시설, 이용, 거리, 음식점, 가다, 나가다, 마트, 레스토랑, 지역
# 걸다 -> 걷다