In [1]:
import pandas as pd
import os
import random
import numpy as np

# Root directory that contains train_data.csv / test_data.csv; the `feature`
# output folder is created underneath it.
path = '/opt/ml/input/data'

# Explicit integer dtypes handed to pd.read_csv for the listed columns.
dtype = dict(
    userID='int16',
    answerCode='int8',
    KnowledgeTag='int16',
)

### `feature` 폴더 생성

In [2]:
# Create the `feature` output directory under `path`.
# os.makedirs(..., exist_ok=True) replaces the check-then-mkdir sequence: it is
# safe to re-run, has no window between the check and the creation, and also
# creates missing parent directories instead of raising FileNotFoundError.
feature_dir = os.path.join(path, 'feature')
os.makedirs(feature_dir, exist_ok=True)

# Displayed as the cell result: True once the directory exists.
os.path.isdir(feature_dir)

True

### data 불러오기
- dtype을 명시해 준 채로 불러옵니다.
- 불러온 뒤 userID, 시간(Timestamp) 순서로 정렬해 줍니다.
- test_data는 정답을 맞춰야 하는 행(answerCode가 -1)을 제외하고 train_data에 합쳐 줍니다.

In [3]:
# Read the training interactions with the explicit dtypes, parsing `Timestamp`
# as datetime, then order the rows by user and time and renumber the index.
train_csv = os.path.join(path, 'train_data.csv')
train_df = (
    pd.read_csv(train_csv, dtype=dtype, parse_dates=['Timestamp'])
    .sort_values(by=['userID', 'Timestamp'])
    .reset_index(drop=True)
)

In [4]:
# Read the test interactions the same way as the training file.
test_csv = os.path.join(path, 'test_data.csv')
test_df = pd.read_csv(test_csv, dtype=dtype, parse_dates=['Timestamp'])
test_df = test_df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)

# Rows with answerCode == -1 are the ones to be predicted; keep only the rest.
has_label = test_df['answerCode'].ne(-1)
test_df = test_df.loc[has_label].copy()

In [5]:
# Append the labelled test rows to the training rows.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the index labels of both inputs are kept and therefore repeat, which makes
# any later label-based alignment or .loc lookup ambiguous.
# NOTE: this cell is not idempotent - re-running it without re-loading
# train_df appends the test rows a second time.
train_df = pd.concat([train_df, test_df], ignore_index=True)

### feature 추출 후 합쳐주기

- 변수 뜻은 노션 참고

In [6]:
# Elapsed times of this many seconds or more are treated as outliers (-> NaN).
MAX_ELAPSED_SECONDS = 650

# Remove the columns a previous run of this cell merged in, so that re-running
# the cell does not produce suffixed duplicates (test_mean_x / test_mean_y ...).
train_df = train_df.drop(
    columns=['test_mean', 'test_sum', 'tag_mean', 'tag_sum'],
    errors='ignore',
)

# Mean correctness and number of correct answers per testId (test sheet),
# used as a per-test difficulty signal.
testId_acc = train_df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
testId_acc.columns = ["test_mean", 'test_sum']

# Likewise per KnowledgeTag, as a rough per-tag difficulty signal.
knowLedgedTag_acc = train_df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum'])
knowLedgedTag_acc.columns = ["tag_mean", 'tag_sum']

# The statistics above are reused for the submission dataset as well.
train_df = pd.merge(train_df, testId_acc, on=['testId'], how="left")
train_df = pd.merge(train_df, knowLedgedTag_acc, on=['KnowledgeTag'], how="left")

# Seconds since the same user's previous interaction.
# The difference is taken on a (userID, Timestamp)-sorted view so it does not
# depend on the row order left behind by the earlier concat; sort_index() then
# restores the current row order (pd.merge above returned a fresh, unique
# RangeIndex, so the index identifies rows unambiguously).
# A user's first interaction has no predecessor and is recorded as 0 seconds.
chronological = train_df.sort_values(by=['userID', 'Timestamp'])
diff = (
    chronological.groupby('userID')['Timestamp']
    .diff()
    .dt.total_seconds()
    .fillna(0.0)
    .sort_index()
)

# Add t_elapsed (time spent on the problem); values at or above the threshold
# are considered outliers and replaced by NaN.
train_df['t_elapsed'] = diff.where(diff < MAX_ELAPSED_SECONDS)

# Major category: characters 1-3 of testId as an integer, divided by 10.
train_df['i_head'] = train_df['testId'].str[1:4].astype('int64') // 10

# Middle category: last three characters of testId as an integer.
train_df['i_mid'] = train_df['testId'].str[-3:].astype('int64')

# Problem number: last three characters of assessmentItemID as an integer.
train_df['i_tail'] = train_df['assessmentItemID'].str[-3:].astype('int64')


### user 피쳐 추출
- 변수 뜻은 노션 참고

In [8]:
# Per (userID, i_head) statistics: mean and count of answerCode plus the mean
# of t_elapsed. The group keys are turned back into ordinary columns.
per_user_head = train_df.groupby(['userID','i_head'])
user_feature = per_user_head.agg({
    'answerCode': ['mean', 'count'],
    't_elapsed': ['mean'],
}).reset_index()

# Displayed as the cell result.
user_feature

Unnamed: 0_level_0,userID,i_head,answerCode,answerCode,t_elapsed
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,count,mean
0,0,2,0.722222,36,37.206897
1,0,6,0.791908,346,36.533784
2,0,8,0.468320,363,39.261981
3,1,4,0.822719,581,96.679204
4,1,9,0.903409,352,103.672297
...,...,...,...,...,...
19734,7439,4,0.733333,15,38.384615
19735,7440,3,0.400000,10,29.125000
19736,7440,5,0.400000,5,18.600000
19737,7441,3,0.200000,5,44.000000


In [10]:
# Replace the two-level column labels produced by the aggregation with flat
# names. The assignment is positional, so the order must match the order of
# the group keys and aggregations used to build user_feature.
key_columns = ["userID", "i_head"]
stat_columns = ["u_head_mean", "u_head_count", "u_head_elapsed"]
user_feature.columns = key_columns + stat_columns


In [12]:
# Preview the first five rows (head() defaults to n=5).
user_feature.head()

Unnamed: 0,userID,i_head,u_head_mean,u_head_count,u_head_elapsed
0,0,2,0.722222,36,37.206897
1,0,6,0.791908,346,36.533784
2,0,8,0.46832,363,39.261981
3,1,4,0.822719,581,96.679204
4,1,9,0.903409,352,103.672297


In [13]:
# Save the per-user features into the `feature` directory.
# The target is derived from `path` (as the directory-creation cell does)
# instead of repeating the absolute path, so changing `path` moves the output
# together with the inputs. With the default `path` the file name is unchanged:
# /opt/ml/input/data/feature/userID_feature.csv
# The DataFrame index is written as the first, unnamed CSV column (to_csv default).
user_feature.to_csv(os.path.join(path, 'feature', 'userID_feature.csv'))


### testId(시험지) feature 추출
- 변수 뜻은 노션 참고

In [20]:
# Number of distinct values within a group.
len_seq = lambda x : len(set(x))

# Per-testId aggregates: mean t_elapsed, mean and sum of answerCode, the
# largest i_tail, and the number of distinct KnowledgeTag values.
per_test_aggregations = {
    't_elapsed': 'mean',
    'answerCode': ['mean', 'sum'],
    'i_tail': 'max',
    'KnowledgeTag': len_seq,
}
testId_feature = train_df.groupby(['testId']).agg(per_test_aggregations).reset_index()

# Re-derive the category codes from the testId string:
# i_head = characters 1-3 as an integer divided by 10, i_mid = last three characters.
test_ids = testId_feature['testId']
testId_feature['i_head'] = test_ids.map(lambda tid: int(tid[1:4]) // 10)
testId_feature['i_mid'] = test_ids.map(lambda tid: int(tid[-3:]))

testId_feature.head()

Unnamed: 0_level_0,testId,t_elapsed,answerCode,answerCode,i_tail,KnowledgeTag,i_head,i_mid
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,sum,max,<lambda>,Unnamed: 7_level_1,Unnamed: 8_level_1
0,A010000001,33.949286,0.923429,1616.0,5,1,1,1
1,A010000002,34.218902,0.931429,1630.0,5,2,1,2
2,A010000003,36.326211,0.842857,1475.0,5,3,1,3
3,A010000004,34.215713,0.880571,1541.0,5,3,1,4
4,A010000005,36.883227,0.849143,1486.0,5,2,1,5


In [26]:
# Flatten the two-level column labels left by the aggregation. The rename is
# positional and only valid while all eight aggregated columns are present, so
# it is applied only while the columns are still a MultiIndex. This keeps the
# cell re-runnable: a second run would otherwise try to assign eight names to
# the six columns kept below and raise a length-mismatch ValueError.
# Note: i_mid_count receives the per-test maximum of i_tail (the highest
# problem number seen for the test), not a row count.
if isinstance(testId_feature.columns, pd.MultiIndex):
    testId_feature.columns = [
        'testId', 'i_mid_elapsed', 'i_mid_mean', 'i_mid_sum',
        'i_mid_count', 'i_mid_tag_count', 'i_head', 'i_mid',
    ]

# Keep only the per-test statistics; i_head / i_mid can be re-derived from testId.
testId_feature = testId_feature[
    ['testId', 'i_mid_elapsed', 'i_mid_mean', 'i_mid_sum', 'i_mid_count', 'i_mid_tag_count']
]
testId_feature.head()

Unnamed: 0,testId,i_mid_elapsed,i_mid_mean,i_mid_sum,i_mid_count,i_mid_tag_count
0,A010000001,33.949286,0.923429,1616.0,5,1
1,A010000002,34.218902,0.931429,1630.0,5,2
2,A010000003,36.326211,0.842857,1475.0,5,3
3,A010000004,34.215713,0.880571,1541.0,5,3
4,A010000005,36.883227,0.849143,1486.0,5,2


In [27]:
# Save the per-test features into the `feature` directory.
# The target is derived from `path` (as the directory-creation cell does)
# instead of repeating the absolute path. With the default `path` the file
# name is unchanged: /opt/ml/input/data/feature/testId_feature.csv
# The DataFrame index is written as the first, unnamed CSV column (to_csv default).
testId_feature.to_csv(os.path.join(path, 'feature', 'testId_feature.csv'))
