In [2]:
UNIV_COLUMNS = ['대학코드', '국가', '대학명', '파견구분', '협정형태', 'VISIT_NO', "BERT_SUM", "RNN_SUM", 'href']
ABSTRACT_REVIEWS_COLUMNS = ['대학코드', '제목', '학과', '과정', '년도', 'BERT_SCORE', 'RNN_SCORE', 'href']
SPECIFIC_COLUMNS = ["대학코드", "대학명", "국가", "파견구분", "협정형태", "학과", "제목", "과정", "년도", "college_link", "review_link",  "gen_info","env_info","food_info","study_info","office_info" "facil_info","mhct_info","help_info","etc_info", "BERT_SCORE", "RNN_SCORE"]

In [3]:
import os
from os import fdopen, remove
import glob
from tempfile import mkstemp
import shutil
from shutil import move, copymode

import pandas as pd
import numpy as np

In [4]:
# define path for sentiment labeled dataset
# abstract: 짧은 후기 제목만 있는 텍스트 데이터셋
# specific: 문단으로 구성된 텍스트 데이터셋

abstract_sentiment = "./data_sentiment/abstract"
specific_sentiment = "./data_sentiment/specific"

In [5]:
# fetch all yonsei exchange review text datasets for each foreign universities
abstract_yonsei_reviews = glob.glob(f"{abstract_sentiment}/*.csv")
abstract_yonsei_reviews[:5]

['./data_sentiment/abstract/NL000006_review_abstract_sentiment.csv',
 './data_sentiment/abstract/FI000014_review_abstract_sentiment.csv',
 './data_sentiment/abstract/US000183_review_abstract_sentiment.csv',
 './data_sentiment/abstract/US000109_review_abstract_sentiment.csv',
 './data_sentiment/abstract/CA000002_review_abstract_sentiment.csv']

In [6]:
# look at sample dataset
sample_file = abstract_yonsei_reviews[70]
file_name = sample_file.split("/")[-1]
file_name_without_ext = file_name.split(".")[0]
print(file_name_without_ext)
univ_code = file_name_without_ext.split("_")[0]
print(univ_code)
df_abstract = pd.read_csv(sample_file, encoding="utf-8")
df_abstract.sample(5)

AU000016_review_abstract_sentiment
AU000016


Unnamed: 0.1,Unnamed: 0,No,제목,학과,과정,년도,href,BERT_SCORE,RNN_SCORE
33,33,14,너무나도 그리운 시드니,의용전자공학,학부,2005,/partner/expReport.asp?id=1854&page=4&bgbn=R,1,0.75316
0,0,47,"Warm, Welcoming, and Stimulating UTS","UIC, TAD (IID & CDM)",학부,2018,/partner/expReport.asp?id=14839&page=1&bgbn=R,0,0.516349
43,43,4,"University of Technology, Sydney",신문방송학과,학부,2002,/partner/expReport.asp?id=716&page=5&bgbn=R,1,0.51227
38,38,9,"University of Technology, Sydney",기계전자공학부,학부,2005,/partner/expReport.asp?id=1695&page=4&bgbn=R,1,0.51227
37,37,10,"University of Technology, Sydney",기계전자공학부,학부,2005,/partner/expReport.asp?id=1732&page=4&bgbn=R,1,0.51227


In [7]:
df_abstract_review = df_abstract[["제목", "학과", "과정", "년도", "BERT_SCORE", "RNN_SCORE", "href"]]
df_abstract_review["대학코드"] = univ_code
_abstract_review = df_abstract[["제목", "학과", "과정", "년도", "BERT_SCORE", "RNN_SCORE", "href"]]
df_abstract_review["대학코드"] = univ_code
df_abstract_review = df_abstract_review[["대학코드","제목", "학과", "과정", "년도", "BERT_SCORE", "RNN_SCORE", "href"]]
df_abstract_review.head()

Unnamed: 0,대학코드,제목,학과,과정,년도,BERT_SCORE,RNN_SCORE,href
0,AU000016,"Warm, Welcoming, and Stimulating UTS","UIC, TAD (IID & CDM)",학부,2018,0,0.516349,/partner/expReport.asp?id=14839&page=1&bgbn=R
1,AU000016,다시 돌아가고 싶은 UTS,의공학부,학부,2017,1,0.913062,/partner/expReport.asp?id=14400&page=1&bgbn=R
2,AU000016,행복했던 시드니에서 한학기,기계공학과,학부,2016,1,0.772298,/partner/expReport.asp?id=13591&page=1&bgbn=R
3,AU000016,시드니에서의 한 학기,응용통계학과,학부,2013,1,0.719899,/partner/expReport.asp?id=6898&page=1&bgbn=R
4,AU000016,시드니에서의 1년,경영학과,학부,2012-2013,0,0.719899,/partner/expReport.asp?id=6422&page=1&bgbn=R


In [8]:
df_abstract_review.columns.to_list()

['대학코드', '제목', '학과', '과정', '년도', 'BERT_SCORE', 'RNN_SCORE', 'href']

In [23]:
def make_abstract_review_df(UNIV_CODE):
    file_path = f"./data_sentiment/abstract/{UNIV_CODE}_review_abstract_sentiment.csv"
    df_abstract = pd.read_csv(file_path, encoding="utf-8")
    df_abstract_review = df_abstract[["제목", "학과", "과정", "년도", "BERT_SCORE", "RNN_SCORE", "href"]]
    df_abstract_review["대학코드"] = univ_code
    _abstract_review = df_abstract[["제목", "학과", "과정", "년도", "BERT_SCORE", "RNN_SCORE", "href"]]
    df_abstract_review["대학코드"] = univ_code
    df_abstract_review = df_abstract_review[["대학코드","제목", "학과", "과정", "년도", "BERT_SCORE", "RNN_SCORE", "href"]]
    return df_abstract_review

In [37]:
def yield_BERT_SUM(UNIV_CODE):
    file_path = f"./data_sentiment/abstract/{UNIV_CODE}_review_abstract_sentiment.csv"
    dataframe_input = pd.read_csv(file_path, encoding="utf-8")
    bert_sum = dataframe_input["BERT_SCORE"].sum()
    return bert_sum

In [41]:
def yield_RNN_SUM(UNIV_CODE):
    file_path = f"./data_sentiment/abstract/{UNIV_CODE}_review_abstract_sentiment.csv"
    dataframe_input = pd.read_csv(file_path, encoding="utf-8")
    rnn_sum = dataframe_input["RNN_SCORE"].sum()
    return rnn_sum

In [43]:
def yield_no_of_students(UNIV_CODE):
    dataframe_input = make_abstract_review_df(UNIV_CODE)
    no_of_students = len(dataframe_input.index)
    return no_of_students

In [25]:
print(yield_BERT_SUM("DK000003"))
print(yield_RNN_SUM("DK000003"))

65
49.14911192655563


In [27]:
print(yield_BERT_SUM("CN000016"))
print(yield_RNN_SUM("CN000016"))

41
43.059615552425385


In [10]:
df_univ = pd.read_csv("./data/univ_db_full.csv", encoding="utf-8")
df_univ.sample(5)

Unnamed: 0,level_0,index,No,대학명,국가,파견구분,협정형태,href,visitation
145,145,21,22,Universite de Lille III (Charles de Gaulle),FRANCE,교환학생,ISEP,/partner/expReport.asp?ucode=FR000016&bgbn=A,Exists
314,314,14,15,University of Leiden,NETHERLANDS,교환학생,교환대학ISEP,/partner/expReport.asp?ucode=NL000009&bgbn=A,Exists
3,3,2,3,Universidad de Palermo,ARGENTINA,교환학생,ISEP,/partner/expReport.asp?ucode=AR000003&bgbn=A,
388,388,1,2,Rangsit University,THAILAND,교환학생,USAC,/partner/expReport.asp?ucode=TH000002&bgbn=A,Exists
684,684,248,249,University of Notre Dame,UNITED STATES,교환학생,교환대학,/partner/expReport.asp?ucode=US000287&bgbn=A,Exists


In [11]:
series_query = df_univ["href"].str.split("=", expand=True)[1]
series_univ_code = series_query.str.split("&", expand=True)[0]
df_univ["대학코드"] = series_univ_code

In [12]:
df_univ.head()

Unnamed: 0,level_0,index,No,대학명,국가,파견구분,협정형태,href,visitation,대학코드
0,0,0,1,Kabul University,AFGHANISTAN,교환학생,교환대학,/partner/expReport.asp?ucode=AF000001&bgbn=A,,AF000001
1,1,0,1,Universidad Blas Pascal,ARGENTINA,교환학생,ISEP,/partner/expReport.asp?ucode=AR000001&bgbn=A,,AR000001
2,2,1,2,Universidad Catolica de Cordoba,ARGENTINA,교환학생,ISEP,/partner/expReport.asp?ucode=AR000002&bgbn=A,,AR000002
3,3,2,3,Universidad de Palermo,ARGENTINA,교환학생,ISEP,/partner/expReport.asp?ucode=AR000003&bgbn=A,,AR000003
4,4,3,4,Universidad del Salvador,ARGENTINA,교환학생,ISEP,/partner/expReport.asp?ucode=AR000004&bgbn=A,Exists,AR000004


In [62]:
df = df_univ[["대학코드", "국가", "대학명", "파견구분", "협정형태", "visitation", "href"]]
df.head(10)

Unnamed: 0,대학코드,국가,대학명,파견구분,협정형태,visitation,href
0,AF000001,AFGHANISTAN,Kabul University,교환학생,교환대학,,/partner/expReport.asp?ucode=AF000001&bgbn=A
1,AR000001,ARGENTINA,Universidad Blas Pascal,교환학생,ISEP,,/partner/expReport.asp?ucode=AR000001&bgbn=A
2,AR000002,ARGENTINA,Universidad Catolica de Cordoba,교환학생,ISEP,,/partner/expReport.asp?ucode=AR000002&bgbn=A
3,AR000003,ARGENTINA,Universidad de Palermo,교환학생,ISEP,,/partner/expReport.asp?ucode=AR000003&bgbn=A
4,AR000004,ARGENTINA,Universidad del Salvador,교환학생,ISEP,Exists,/partner/expReport.asp?ucode=AR000004&bgbn=A
5,AR000005,ARGENTINA,University of Buenos Aires,교환학생,교환대학,,/partner/expReport.asp?ucode=AR000005&bgbn=A
6,AU000019,AUSTRALIA,Australian National University,교환학생,교환대학,Exists,/partner/expReport.asp?ucode=AU000019&bgbn=A
7,AU000001,AUSTRALIA,Bond University,교환학생,교환대학,Exists,/partner/expReport.asp?ucode=AU000001&bgbn=A
8,AU000002,AUSTRALIA,Curtin University,교환학생,교환대학ISEP,Exists,/partner/expReport.asp?ucode=AU000002&bgbn=A
9,AU000003,AUSTRALIA,Curtin University of Technology,교환학생,,Exists,/partner/expReport.asp?ucode=AU000003&bgbn=A


In [34]:
df_ever_been = df.loc[df['visitation'] == "Exists"]
df_ever_been.head()

Unnamed: 0,대학코드,국가,대학명,파견구분,협정형태,visitation,href
4,AR000004,ARGENTINA,Universidad del Salvador,교환학생,ISEP,Exists,/partner/expReport.asp?ucode=AR000004&bgbn=A
6,AU000019,AUSTRALIA,Australian National University,교환학생,교환대학,Exists,/partner/expReport.asp?ucode=AU000019&bgbn=A
7,AU000001,AUSTRALIA,Bond University,교환학생,교환대학,Exists,/partner/expReport.asp?ucode=AU000001&bgbn=A
8,AU000002,AUSTRALIA,Curtin University,교환학생,교환대학ISEP,Exists,/partner/expReport.asp?ucode=AU000002&bgbn=A
9,AU000003,AUSTRALIA,Curtin University of Technology,교환학생,,Exists,/partner/expReport.asp?ucode=AU000003&bgbn=A


In [None]:
for index, row in df_ever_been.iterrows():
    print(row["대학명"], yield_BERT_SUM(row["대학코드"]), yield_RNN_SUM(row["대학코드"]), yield_no_of_students(row["대학코드"]))

In [14]:
df.columns.to_list()

['대학코드', '국가', '대학명', '파견구분', '협정형태', 'visitation', 'href']

In [59]:
def make_univ_df():
    # read university dataset csv file
    df_univ = pd.read_csv("./data/univ_db_full.csv", encoding="utf-8")

    # get unive~rsity code
    series_query = df_univ["href"].str.split("=", expand=True)[1]
    series_univ_code = series_query.str.split("&", expand=True)[0]
    df_univ["대학코드"] = series_univ_code
    
    # Making column values for BERT_SUM, RNN_SUM, STUDENT_NO
    list_bert_sum = []
    list_rnn_sum = []
    list_student_no = []
    
    for index, row in df_univ.iterrows():
        if row["visitation"] == "None":
            list_bert_sum.append(np.NaN)
            list_rnn_sum.append(np.NaN)
            list_student_no.append(np.NaN)

        if row["visitation"] == "Exists":
            list_bert_sum.append(yield_BERT_SUM(row["대학코드"]))
            list_rnn_sum.append(yield_RNN_SUM(row["대학코드"]))
            list_student_no.append(yield_no_of_students(row["대학코드"]))
    
    df_univ["BERT_SUM"] = list_bert_sum
    df_univ["RNN_SUM"] = list_rnn_sum
    df_univ["STUDENT_NO"] = list_student_no
    
    df = df_univ[["대학코드", "국가", "대학명", "파견구분", "협정형태", "STUDENT_NO", "BERT_SUM", "RNN_SUM", "href"]]
    return df

In [61]:
make_univ_df().head(10)

Unnamed: 0,대학코드,국가,대학명,파견구분,협정형태,STUDENT_NO,BERT_SUM,RNN_SUM,href
0,AF000001,AFGHANISTAN,Kabul University,교환학생,교환대학,,,,/partner/expReport.asp?ucode=AF000001&bgbn=A
1,AR000001,ARGENTINA,Universidad Blas Pascal,교환학생,ISEP,,,,/partner/expReport.asp?ucode=AR000001&bgbn=A
2,AR000002,ARGENTINA,Universidad Catolica de Cordoba,교환학생,ISEP,,,,/partner/expReport.asp?ucode=AR000002&bgbn=A
3,AR000003,ARGENTINA,Universidad de Palermo,교환학생,ISEP,,,,/partner/expReport.asp?ucode=AR000003&bgbn=A
4,AR000004,ARGENTINA,Universidad del Salvador,교환학생,ISEP,1.0,0.0,0.39967,/partner/expReport.asp?ucode=AR000004&bgbn=A
5,AR000005,ARGENTINA,University of Buenos Aires,교환학생,교환대학,,,,/partner/expReport.asp?ucode=AR000005&bgbn=A
6,AU000019,AUSTRALIA,Australian National University,교환학생,교환대학,27.0,17.0,17.952732,/partner/expReport.asp?ucode=AU000019&bgbn=A
7,AU000001,AUSTRALIA,Bond University,교환학생,교환대학,19.0,16.0,13.359809,/partner/expReport.asp?ucode=AU000001&bgbn=A
8,AU000002,AUSTRALIA,Curtin University,교환학생,교환대학ISEP,7.0,5.0,4.657787,/partner/expReport.asp?ucode=AU000002&bgbn=A
9,AU000003,AUSTRALIA,Curtin University of Technology,교환학생,,16.0,14.0,9.69667,/partner/expReport.asp?ucode=AU000003&bgbn=A
