In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, which also
# hides pandas diagnostics such as SettingWithCopyWarning; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Preprocess Curriculum and Track Data of Majors

In [4]:
# Load the raw `amth` curriculum ("curri") and track CSV files.
amth_curri, amth_track = (
    pd.read_csv(csv_path)
    for csv_path in ('./Data/Raw/amth_curri.csv', './Data/Raw/amth_track.csv')
)

In [5]:
# Load the raw `is` curriculum ("curri") and track CSV files.
is_curri, is_track = (
    pd.read_csv(csv_path)
    for csv_path in ('./Data/Raw/is_curri.csv', './Data/Raw/is_track.csv')
)

In [6]:
# Rename the '1' / '2' columns to '1학기' / '2학기', the names preprocess_curri reads.
is_curri = is_curri.rename({'1': '1학기', '2': '2학기'}, axis='columns')

In [7]:
# Stack the `amth` and `is` tables row-wise into combined curriculum / track tables.
major_curri = pd.concat(objs=[amth_curri, is_curri], axis=0)
major_track = pd.concat(objs=[amth_track, is_track], axis=0)

In [8]:
def preprocess_curri(curri, track):
    """Join curriculum rows with their track and flag courses listed in both semesters.

    Parameters
    ----------
    curri : pandas.DataFrame
        Must contain the columns '교과목명', '학수번호', '1학기' and '2학기'.
    track : pandas.DataFrame
        Must contain the columns '교과목명' and '트랙'.

    Returns
    -------
    pandas.DataFrame
        Columns '학수번호', '트랙' and '연중'. '연중' is True only when both
        '1학기' and '2학기' are non-null for that row. Rows of `curri` without
        a match in `track` are kept (left join) with a missing '트랙'.
    """
    merged = pd.merge(curri, track, on='교과목명', how='left')

    # True only where neither semester column is missing.
    both_semesters = merged[['1학기', '2학기']].notna().all(axis=1)

    return merged.assign(**{'연중': both_semesters})[['학수번호', '트랙', '연중']]

In [9]:
# Build the per-course lookup ('학수번호', '트랙', '연중') from the combined major tables.
major_curri_pre = preprocess_curri(curri=major_curri, track=major_track)

# Preprocess Registration Data of Major and Elective Courses

In [11]:
# Load the two raw `amth` registration ("regis") CSV parts.
amth_regis_1, amth_regis_2 = (
    pd.read_csv(csv_path)
    for csv_path in ('./Data/Raw/amth_regis_1.csv', './Data/Raw/amth_regis_2.csv')
)

In [12]:
# Load the raw `is` registration ("regis") CSV.
is_regis = pd.read_csv(filepath_or_buffer='./Data/Raw/is_regis.csv')

In [13]:
# Load the three raw `gec` registration CSV parts.
gec_regis_1, gec_regis_2, gec_regis_3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        './Data/Raw/gec_regis_1.csv',
        './Data/Raw/gec_regis_2.csv',
        './Data/Raw/gec_regis_3.csv',
    )
)

In [14]:
# Load the five raw `ged` registration CSV parts (each is given its own '트랙' label later).
ged_regis_1, ged_regis_2, ged_regis_3, ged_regis_4, ged_regis_5 = (
    pd.read_csv(csv_path)
    for csv_path in (
        './Data/Raw/ged_regis_1.csv',
        './Data/Raw/ged_regis_2.csv',
        './Data/Raw/ged_regis_3.csv',
        './Data/Raw/ged_regis_4.csv',
        './Data/Raw/ged_regis_5.csv',
    )
)

In [15]:
# Stack the two `amth` registration parts row-wise (original row labels are kept).
amth_regis = pd.concat(objs=[amth_regis_1, amth_regis_2], axis=0)

In [16]:
# Mark the source of each major table: `amth` rows get '전공2' = 1, `is` rows get '전공1' = 1.
amth_regis = amth_regis.assign(**{'전공2': 1})
is_regis = is_regis.assign(**{'전공1': 1})

In [17]:
# Combine both major registration tables; the flag column absent from a table becomes NaN.
major_regis = pd.concat(objs=[amth_regis, is_regis], axis=0)

In [18]:
# Stack the three `gec` registration parts row-wise.
gec_regis = pd.concat(objs=[gec_regis_1, gec_regis_2, gec_regis_3], axis=0)

In [19]:
# Give every `ged` part a constant '트랙' label identifying which part it came from.
ged_regis_1 = ged_regis_1.assign(**{'트랙': '생명우주인간'})
ged_regis_2 = ged_regis_2.assign(**{'트랙': '분석추론논리'})
ged_regis_3 = ged_regis_3.assign(**{'트랙': '상징문화소통'})
ged_regis_4 = ged_regis_4.assign(**{'트랙': '사회공동체평화'})
ged_regis_5 = ged_regis_5.assign(**{'트랙': '지능정보미래'})

In [20]:
# Stack the five labelled `ged` parts row-wise.
ged_regis = pd.concat(
    objs=[ged_regis_1, ged_regis_2, ged_regis_3, ged_regis_4, ged_regis_5],
    axis=0,
)

In [21]:
# Combine the `gec` and `ged` tables into one elective registration table.
elect_regis = pd.concat(objs=[gec_regis, ged_regis], axis=0)

In [22]:
# Mark every elective row with '교양' = 1.
elect_regis = elect_regis.assign(**{'교양': 1})

In [23]:
def get_number(x):
    """Return the part of `x` that precedes the first '-'.

    `x` is a string such as a '학수번호-분반' value; when it contains no '-',
    the whole string is returned unchanged.
    """
    head, _, _ = x.partition('-')
    return head

In [24]:
# Derive '학수번호' from the '학수번호-분반' key (text before the first '-') in both tables.
major_regis['학수번호'] = major_regis['학수번호-분반'].map(get_number)
elect_regis['학수번호'] = elect_regis['학수번호-분반'].map(get_number)

In [25]:
# Attach '트랙' / '연중' to major registrations; the inner join drops rows whose
# '학수번호' is absent from the curriculum lookup.
major_regis = pd.merge(major_regis, major_curri_pre, on='학수번호', how='inner')

In [26]:
# Stack major and elective registrations; columns present in only one table become NaN.
total_regis = pd.concat(objs=[major_regis, elect_regis], axis=0)

In [27]:
# Rows with a missing '대상학년' default to 1.
total_regis.loc[total_regis['대상학년'].isna(), '대상학년'] = 1

In [28]:
# Courses titled '주제연구' are forced to '대상학년' = 2.
total_regis.loc[total_regis['강좌명'].eq('주제연구'), '대상학년'] = 2

In [29]:
def get_require(x):
    """Return 1 when `x` is one of the codes 4, 11, 16 or 14, otherwise 0.

    Applied to '이수구분' values; the listed codes are the ones flagged as '필수'.
    NOTE(review): the meaning of the individual codes is not visible here — confirm
    against the data dictionary.
    """
    return 1 if x in (4, 11, 16, 14) else 0

In [30]:
# '필수' = 1 when the '이수구분' code is one that get_require recognises, else 0.
total_regis['필수'] = total_regis['이수구분'].map(get_require)

In [31]:
# Each source flag was set to 1 only on its own table, so missing values mean 0.
total_regis = total_regis.fillna({'전공1': 0, '전공2': 0, '교양': 0})

In [32]:
# '연중' only exists for rows that went through the curriculum merge; elective rows
# have NaN after the concat, so treat those as False.
# Fix: the right-hand side previously read the '교양' column, which discarded the
# merged '연중' values and replaced them with the elective flag.
total_regis['연중'] = total_regis['연중'].fillna(False)

In [33]:
def get_online(x):
    """Return 1 when the string `x` has at most 18 characters, otherwise 0.

    Applied to '강의시간/강의실' values to produce the '온라인' flag.
    NOTE(review): the 18-character cut-off is a hard-coded heuristic; presumably
    entries this short carry no day/time/room triple — confirm against the data.
    """
    return 0 if len(x) > 18 else 1

In [34]:
# Flag each row as online (1) or not (0) from the length of its '강의시간/강의실' text.
total_regis['온라인'] = total_regis['강의시간/강의실'].map(get_online)

In [35]:
# Split rows by the '온라인' flag. Explicit copies are taken because new columns are
# assigned to total_regis_off afterwards; assigning into a bare .loc slice is a
# chained assignment (SettingWithCopyWarning) and was only silent because warnings
# are globally suppressed.
total_regis_on = total_regis.loc[total_regis['온라인'] == 1].copy()
total_regis_off = total_regis.loc[total_regis['온라인'] == 0].copy()

In [36]:
def get_hour_and_room(x):
    """Split a '강의시간/강의실' string into first day, second day, time range and room initial.

    The string is split on whitespace. Token 0 is the first day, token 1 the time
    range and token 2 the room, from which surrounding parentheses are stripped
    and only the first character is kept. The second day is:

    * '-' when there are exactly 3 tokens,
    * token 3 when there are exactly 6 tokens,
    * token 7 for any other token count (raises IndexError if fewer than 8 tokens).

    NOTE(review): the token layouts are presumably 'day time (room)' repeated once
    or twice — confirm which raw format produces the token-7 case.

    Returns
    -------
    pandas.Series
        [day1, day2, hour, room] in that order.
    """
    tokens = x.split()
    token_count = len(tokens)

    first_day = tokens[0]

    if token_count == 3:
        second_day = '-'
    elif token_count == 6:
        second_day = tokens[3]
    else:
        second_day = tokens[7]

    time_range = tokens[1]
    room_initial = tokens[2].strip('()')[0]

    return pd.Series([first_day, second_day, time_range, room_initial])

In [37]:
# get_hour_and_room returns a 4-element Series per row, so .apply yields a 4-column
# frame that is assigned positionally to '요일1', '요일2', '시간' and '장소'.
total_regis_off[['요일1', '요일2', '시간', '장소']] = total_regis_off['강의시간/강의실'].apply(get_hour_and_room)

In [38]:
# Day-of-week character -> number, counting from 1 in the listed order.
day_dict = {
    day: number
    for number, day in enumerate(['월', '화', '수', '목', '금', '토', '일'], start=1)
}

In [39]:
# Replace day characters with their numbers; values not in day_dict (e.g. the '-'
# placeholder for a missing second day) become NaN.
total_regis_off = total_regis_off.assign(
    **{day_col: total_regis_off[day_col].map(day_dict) for day_col in ('요일1', '요일2')}
)

In [40]:
# Room initial -> place group: '우' and '멀' share group 1, '국' and '전' share group 2.
place_dict = {
    **dict.fromkeys(('우', '멀'), 1),
    **dict.fromkeys(('국', '전'), 2),
}

In [41]:
# Replace the room initial with its place group; initials not in place_dict become NaN.
total_regis_off = total_regis_off.assign(
    **{'장소': total_regis_off['장소'].map(place_dict)}
)

In [42]:
def label_hour(x):
    """Convert a 'HH:MM-HH:MM' time range into one or two slot labels.

    The start hour is looked up in a fixed table (7->0, 9->1, 10->2, 12->3,
    13->4, 15->5, 16->6, 18->7); any other start hour raises KeyError. When the
    end hour is more than one hour after the start hour, the following slot is
    reported as the second label; otherwise the second label is NaN.

    Returns
    -------
    pandas.Series
        [first_slot, second_slot_or_nan].
    """
    begin, finish = x.split('-')

    begin_hour = int(begin.split(':')[0])
    finish_hour = int(finish.split(':')[0])

    slot_by_start_hour = {7: 0, 9: 1, 10: 2, 12: 3, 13: 4, 15: 5, 16: 6, 18: 7}

    first_slot = slot_by_start_hour[begin_hour]
    # Only the hour components are compared, so minutes never affect the span test.
    second_slot = first_slot + 1 if finish_hour - begin_hour > 1 else np.nan

    return pd.Series([first_slot, second_slot])

In [43]:
# label_hour returns a 2-element Series per row, so .apply yields a 2-column frame
# assigned to '시간1' and '시간2'; '시간2' is NaN for ranges that fit one slot.
total_regis_off[['시간1', '시간2']] = total_regis_off['시간'].apply(label_hour)

In [44]:
# Re-join both halves, non-online rows first; columns created only on the non-online
# half are NaN for the online rows.
total_regis = pd.concat(objs=[total_regis_off, total_regis_on], axis=0)

In [45]:
# Missing day / place / slot values (online rows, unmapped values, absent second
# day or slot) are encoded as 0.
total_regis = total_regis.fillna(
    {schedule_col: 0 for schedule_col in ['요일1', '요일2', '장소', '시간1', '시간2']}
)

In [46]:
# Columns to be cast to integer dtype.
cols = [
    '대상학년', '학점',
    '전공1', '전공2', '연중', '교양',
    '요일1', '요일2', '장소',
    '시간1', '시간2',
]

In [47]:
# Cast every column listed in `cols` to int; this raises if any of them still holds NaN.
total_regis = total_regis.astype({int_col: 'int' for int_col in cols})

In [48]:
# Columns to be removed from the final table (note: reuses the name `cols`).
cols = [
    '정원',
    '강의시간/강의실',
    '언어구분',
    '강의계획서',
    '시간',
]

In [49]:
# Remove the columns listed in `cols`; they are not written to the output file.
total_regis = total_regis.drop(cols, axis='columns')

In [50]:
# Write the preprocessed table without the row index (the target folder must already exist).
total_regis.to_csv(path_or_buf='./Data/Preprocess/total_regis_1.csv', index=False)