In [None]:
import numpy as np
import pandas as pd
import datetime
from catboost import CatBoostClassifier, Pool
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from scipy import stats
import seaborn as sns

In [None]:
from sklearn.metrics import confusion_matrix
# this function is the quadratic weighted kappa (the metric used for the competition submission)
def qwk(act,pred,n=4,hist_range=(0,3)):
    """Quadratic weighted kappa between actual and predicted ordinal labels.

    Parameters
    ----------
    act, pred : array-like
        Actual and predicted labels, same length. Both are flattened first.
    n : int
        Number of classes (size of the agreement matrices).
    hist_range : tuple
        (lowest, highest) label value. The range is split into `n` equal
        bins, so with the defaults the bins hold the labels 0, 1, 2 and 3.

    Returns
    -------
    float
        1 - (weighted observed disagreement / weighted expected disagreement).
        The result is nan when the expected disagreement is zero, which
        happens when `act` and `pred` each fall entirely into one shared class.
    """
    act = np.asarray(act).ravel()
    pred = np.asarray(pred).ravel()

    # Observed joint distribution of (actual, predicted). It is binned on the
    # same fixed n x n grid as the marginal histograms below, so its shape is
    # always (n, n) even when some classes never occur in the inputs.
    O = np.histogram2d(act, pred, bins=n, range=[hist_range, hist_range])[0]
    O = np.divide(O, np.sum(O))  # normalize to sum 1

    # Quadratic penalty: 0 on the diagonal, 1 in the two far corners.
    class_idx = np.arange(n)
    W = np.subtract.outer(class_idx, class_idx) ** 2 / ((n - 1) ** 2)

    # Expected joint distribution under independence: outer product of the
    # two marginal histograms.
    act_hist = np.histogram(act, bins=n, range=hist_range)[0]
    prd_hist = np.histogram(pred, bins=n, range=hist_range)[0]
    E = np.outer(act_hist, prd_hist)
    E = np.divide(E, np.sum(E))  # normalize to sum 1

    # Weighted disagreement actually observed vs. expected by chance.
    num = np.sum(np.multiply(W, O))
    den = np.sum(np.multiply(W, E))

    return 1 - np.divide(num, den)

In [None]:
# Directory holding the competition CSV files; change it here to run
# the notebook against another location.
INPUT_DIR = '/kaggle/input/data-science-bowl-2019'

train = pd.read_csv(f'{INPUT_DIR}/train.csv')
train_labels = pd.read_csv(f'{INPUT_DIR}/train_labels.csv')
specs = pd.read_csv(f'{INPUT_DIR}/specs.csv')
test = pd.read_csv(f'{INPUT_DIR}/test.csv')
submission = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')

# encode title

In [None]:
# Union of the titles that occur in train and in test (missing values excluded).
# The result is sorted so that the integer codes and the feature columns
# derived from this list come out the same on every kernel restart; plain
# set iteration order for strings varies with the interpreter's hash seed.
list_of_user_activities = sorted(
    set(train['title'].dropna().unique().tolist())
    | set(test['title'].dropna().unique().tolist())
)
list_of_user_activities

In [None]:
# Union of the event codes that occur in train and in test (missing values excluded).
# Sorted so the order of the per-event-code feature columns does not depend
# on set iteration order.
list_of_event_code = sorted(
    set(train['event_code'].dropna().unique().tolist())
    | set(test['event_code'].dropna().unique().tolist())
)
list_of_event_code

In [None]:
# Give every title a consecutive integer code, following the order of
# list_of_user_activities (title -> code).
activities_map = {
    title: code
    for title, code in zip(list_of_user_activities, np.arange(len(list_of_user_activities)))
}
activities_map

In [None]:
# Reverse lookup of the title encoding (code -> title).
activities_labels = {
    code: title
    for code, title in zip(np.arange(len(list_of_user_activities)), list_of_user_activities)
}
activities_labels

In [None]:
# Replace the title strings with their integer codes in train, test and train_labels.
# NOTE: this overwrites the column, so running the cell a second time would map
# the already-encoded values again.
for frame in (train, test, train_labels):
    frame['title'] = frame['title'].map(activities_map)

In [None]:
# Parse the timestamp column of both event logs into datetimes.
for frame in (train, test):
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])

In [None]:
# Event code used to pick out attempt events per title code:
# 4100 for every title, except Bird Measurer (Assessment), which uses 4110.
win_code = {
    title_code: attempt_code
    for title_code, attempt_code in zip(
        activities_map.values(),
        np.full(len(activities_map), 4100, dtype='int'),
    )
}
win_code[activities_map['Bird Measurer (Assessment)']] = 4110

In [None]:
def get_data(user_sample, test_set = False):
    '''
    Build one feature record per assessment session of a single installation_id.

    NOTE: a second `get_data` is defined in a later cell of this notebook and
    rebinds this name; the later definition is the one the cells below call.

    Sessions are visited in order of first appearance (groupby on
    game_session with sort=False). Running counters are kept across sessions,
    and each assessment record is a snapshot of those counters taken BEFORE
    the assessment itself is added to them.

    Parameters
    ----------
    user_sample : pandas.DataFrame
        Rows of train or test filtered to one installation_id. Columns read:
        game_session, type, title, game_time, event_code, event_data,
        timestamp. `title` values must be keys of `activities_labels` and
        `win_code`; `timestamp` values must support subtraction yielding a
        timedelta.
    test_set : bool
        False: return every assessment that has at least one attempt event.
        True: keep every assessment session and return only the last one.

    Returns
    -------
    list of dict (test_set=False) or dict (test_set=True)

    Raises
    ------
    IndexError
        If test_set is true and the sample contains no assessment session.
    '''
    last_activity = 0
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}

    # Seconds spent per title and number of events seen per event code so far.
    time_spent_each_act = {actv: 0 for actv in list_of_user_activities}
    event_code_count = {eve: 0 for eve in list_of_event_code}

    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    all_assessments = []
    accumulated_accuracy_group = 0
    accumulated_accuracy = 0
    accumulated_correct_attempts = 0
    accumulated_uncorrect_attempts = 0
    accumulated_actions = 0
    counter = 0
    durations = []

    # Each group is the DataFrame of one game_session.
    for _, session in user_sample.groupby('game_session', sort = False):
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]

        # Time spent in non-assessment sessions, in whole seconds
        # (game_time of the last event is divided by 1000).
        if session_type != 'Assessment':
            time_spent = int(session['game_time'].iloc[-1] / 1000)
            time_spent_each_act[activities_labels[session_title]] += time_spent

        # Only assessment sessions produce a record. In train mode,
        # single-event sessions are skipped.
        if session_type == 'Assessment' and (test_set or len(session) > 1):
            # Attempt events carry the title-specific code from win_code.
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            # Count attempts whose event_data contains 'true' / 'false'.
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()

            # Start from the history accumulated so far.
            features = user_activities_count.copy()
            features.update(time_spent_each_act.copy())
            features.update(event_code_count.copy())
            features['session_title'] = session['title'].iloc[0]

            # Attempts in previous assessments (snapshot first, then update).
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts
            accumulated_uncorrect_attempts += false_attempts

            # Mean duration of the previous assessment sessions.
            if durations:
                features['duration_mean'] = np.mean(durations)
            else:
                features['duration_mean'] = 0
            # Select the timestamp by name, and use total_seconds() so that a
            # session spanning more than a day is not truncated to its
            # seconds-within-the-day component.
            session_span = session['timestamp'].iloc[-1] - session['timestamp'].iloc[0]
            durations.append(int(session_span.total_seconds()))

            # Mean accuracy of the previous assessments.
            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0
            total_attempts = true_attempts + false_attempts
            accuracy = true_attempts/total_attempts if total_attempts != 0 else 0
            accumulated_accuracy += accuracy

            # Accuracy group of THIS assessment (the label column).
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            # How often each accuracy group was reached before this assessment.
            features.update(accuracy_groups)
            accuracy_groups[features['accuracy_group']] += 1
            # Mean accuracy group of the previous assessments.
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']

            # Number of events before this session (updated further below).
            features['accumulated_actions'] = accumulated_actions

            # Test mode keeps every assessment; train mode keeps only those
            # with at least one attempt event.
            if test_set or total_attempts > 0:
                all_assessments.append(features)

            counter += 1

        # Add this session's events to the per-event-code counters.
        n_of_event_codes = Counter(session['event_code'])
        for key in n_of_event_codes.keys():
            event_code_count[key] += n_of_event_codes[key]

        accumulated_actions += len(session)
        # Count a session type only when it differs from the previous session's type.
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activity = session_type

    # The returns belong after the loop: returning from inside it would stop
    # after the first session.
    if test_set:
        return all_assessments[-1]
    return all_assessments

In [None]:
def get_data(user_sample, test_set=False):
    '''
    Turn the event log of one installation_id into assessment feature records.

    Sessions are processed in order of first appearance (groupby on
    game_session with sort=False). Counters are accumulated across sessions;
    every assessment record captures the counters as they were BEFORE that
    assessment, so a record only describes the history preceding it.

    Parameters
    ----------
    user_sample : pandas.DataFrame
        Rows of train or test filtered to a single installation_id. Columns
        read: game_session, type, title, game_time, event_code, event_data,
        timestamp. `title` values must be keys of `activities_labels` and
        `win_code`; `timestamp` values must support subtraction yielding a
        timedelta.
    test_set : bool
        False: return all assessments that contain at least one attempt event.
        True: keep every assessment session and return only the last one.

    Returns
    -------
    list of dict when test_set is false, a single dict when it is true.

    Raises
    ------
    IndexError
        If test_set is true and no assessment session exists in the sample.
    '''
    # Type of the previously processed session (0 = none yet).
    last_activity = 0
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}

    # Seconds spent per title and events seen per event code so far.
    time_spent_each_act = {actv: 0 for actv in list_of_user_activities}
    event_code_count = {eve: 0 for eve in list_of_event_code}

    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    all_assessments = []
    accumulated_accuracy_group = 0
    accumulated_accuracy = 0
    accumulated_correct_attempts = 0
    accumulated_uncorrect_attempts = 0
    accumulated_actions = 0
    counter = 0
    durations = []

    # Iterate over the sessions of this installation_id; `session` is the
    # DataFrame of one game_session.
    for _, session in user_sample.groupby('game_session', sort=False):
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]

        # Time spent in non-assessment sessions, in whole seconds
        # (game_time of the last event is divided by 1000).
        if session_type != 'Assessment':
            time_spent = int(session['game_time'].iloc[-1] / 1000)
            time_spent_each_act[activities_labels[session_title]] += time_spent

        # Only assessment sessions generate a record. In train mode,
        # single-event sessions are skipped.
        if session_type == 'Assessment' and (test_set or len(session) > 1):
            # Attempt events carry the title-specific code from win_code.
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            # Count attempts whose event_data contains 'true' / 'false'.
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()

            # Feature template: the history accumulated so far.
            features = user_activities_count.copy()
            features.update(time_spent_each_act.copy())
            features.update(event_code_count.copy())
            # Encoded title of the assessment being taken.
            features['session_title'] = session['title'].iloc[0]

            # Attempt history before this assessment (snapshot, then update).
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts
            accumulated_uncorrect_attempts += false_attempts

            # Mean duration of the previous assessment sessions.
            if durations:
                features['duration_mean'] = np.mean(durations)
            else:
                features['duration_mean'] = 0
            # Select the timestamp by name, and use total_seconds() so that a
            # session spanning more than a day is not truncated to its
            # seconds-within-the-day component.
            session_span = session['timestamp'].iloc[-1] - session['timestamp'].iloc[0]
            durations.append(int(session_span.total_seconds()))

            # Mean accuracy over the previous assessments.
            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0
            total_attempts = true_attempts + false_attempts
            accuracy = true_attempts/total_attempts if total_attempts != 0 else 0
            accumulated_accuracy += accuracy

            # Accuracy group of THIS assessment (the label column).
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            # How many times each accuracy group was reached before.
            features.update(accuracy_groups)
            accuracy_groups[features['accuracy_group']] += 1
            # Mean accuracy group over the previous assessments.
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']

            # Number of events before this session (updated further below).
            features['accumulated_actions'] = accumulated_actions

            # Test mode keeps every assessment; train mode keeps only those
            # with at least one attempt event.
            if test_set or total_attempts > 0:
                all_assessments.append(features)

            counter += 1

        # Add this session's events to the per-event-code counters.
        n_of_event_codes = Counter(session['event_code'])
        for key in n_of_event_codes.keys():
            event_code_count[key] += n_of_event_codes[key]

        # Total number of events so far, used by 'accumulated_actions'.
        accumulated_actions += len(session)
        # Count a session type only when it differs from the previous
        # session's type. The assignment must target `last_activity`;
        # a misspelled name leaves the guard permanently true.
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activity = session_type

    # Test mode: only the last assessment is predicted.
    if test_set:
        return all_assessments[-1]
    # Train mode: every kept assessment goes to the dataset.
    return all_assessments

In [None]:
compiled_data = []
# Size the progress bar from the data instead of a hard-coded estimate.
n_train_installations = train['installation_id'].nunique()
# tqdm draws the status bar below; user_sample is a DataFrame that contains
# only one installation_id.
for ins_id, user_sample in tqdm(train.groupby('installation_id', sort=False), total=n_train_installations):
    # In train mode get_data returns a list of records (possibly empty).
    compiled_data.extend(get_data(user_sample))

In [None]:
# One row per assessment record, one column per feature key.
new_train = pd.DataFrame(compiled_data)
# Drop the list of dicts now that it lives in the DataFrame.
# NOTE: because of this, the cell cannot be re-run without rebuilding compiled_data.
del compiled_data
new_train.shape

아래 feature들은 제가 만든 것들입니다. 각 row의 값은 해당 assessment 이전에 일어난 이벤트를 집계한 것이라는 점에 유의하세요. 예를 들어 첫 번째 row는 assessment 직전의 상태를 보여줍니다 -> player가 clip을 3번 보고, activity를 3번 하고, game을 4번 플레이했으며, assessment는 0번 수행한 상태입니다.. 등등..

In [None]:
# Lift the column display limit so every feature column is shown.
pd.options.display.max_columns = None
new_train.iloc[:]

## Model

In [None]:
# 다음 list는 포괄적으로 입력 데이터 집합 X를 제외한 accuracy_group에 사용될 feature list를 생성하며, 이것은 라벨 y로 할것입니다
all_features = [x for x in new_train.columns if x not in ['accuracy_group']]

# cat_features는 모델에 잘 훈련될수있도록 파라미터들을 선언해야함
cat_features = ['session_title']

# X와 y값을 나눔
X, y = new_train[all_features], new_train['accuracy_group']
del train

X.shape


In [None]:
# Factory for the model with its parameters.
# The other parameters are described in the documentation:
# https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html
def make_classifier(iterations = 5000):
    """Return a new, unfitted CatBoostClassifier with this notebook's settings.

    Parameters
    ----------
    iterations : int
        Forwarded to CatBoostClassifier as `iterations`.
    """
    # learning_rate is deliberately not passed (a 0.01 setting was left disabled).
    settings = dict(
        loss_function='MultiClass',
        eval_metric='WKappa',
        task_type='CPU',
        iterations=iterations,
        od_type='Iter',
        depth=8,
        early_stopping_rounds=500,
        random_seed=2019,
        use_best_model=True,
        border_count=128,
    )
    return CatBoostClassifier(**settings)

In [None]:
from sklearn.model_selection import KFold
# oof holds one out-of-fold prediction per row of X (initialised to zeros).
oof = np.zeros(len(X))
NFOLDS = 5
# KFold yields NFOLDS different train/validation splits, shuffled with a fixed seed.
folds = KFold(n_splits = NFOLDS, shuffle = True, random_state = 2019)
training_start_time = time()
models = []

for fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    # Each iteration returns POSITIONAL row indices for the training and the
    # validation part, so rows are selected with .iloc rather than by label.
    start_time = time()
    print(f'Training on fold {fold+1}')
    # A fresh model per fold.
    clf = make_classifier()
    # X already contains exactly the all_features columns.
    clf.fit(
        X.iloc[trn_idx], y.iloc[trn_idx],
        eval_set = (X.iloc[val_idx], y.iloc[val_idx]),
        use_best_model = True, verbose = 500, cat_features = cat_features,
    )
    # Store this fold's validation predictions at their row positions.
    oof[val_idx] = clf.predict(X.iloc[val_idx]).reshape(len(val_idx))
    models.append(clf)
    print('Fold {} finished in {}'.format(fold + 1, str(datetime.timedelta(seconds = time() - start_time))))
    print('____________________________________________________________________________________________\n')
    
print('-' * 30)
# The complete oof vector is scored against the true labels with the
# competition metric (quadratic weighted kappa).
print('OOF QWK:', qwk(y, oof))
print('-' * 30)

In [None]:
# Feature importances of `clf`, which at this point is the model trained in
# the last fold of the loop above; sorted ascending and drawn as horizontal bars.
fea_imp = (
    pd.DataFrame({'imp': clf.feature_importances_, 'col': X.columns})
    .set_index(['col'])
    .sort_values(['imp', 'col'], ascending=True)
)
fea_imp.plot(kind='barh', figsize=(15, 20))

In [None]:
# Display the training feature matrix.
X

In [None]:
# Drop the training matrix and labels; the cells below only use the fitted
# models and the test data.
del X, y

In [None]:
new_test = []
# Size the progress bar from the data instead of a hard-coded estimate.
n_test_installations = test['installation_id'].nunique()
for ins_id, user_sample in tqdm(test.groupby('installation_id', sort = False), total = n_test_installations):
    # In test mode get_data returns a single record: the last assessment.
    last_assessment = get_data(user_sample, test_set = True)
    new_test.append(last_assessment)
    
X_test = pd.DataFrame(new_test)
del test

In [None]:
fold_predictions = []

for model in models:
    # Pass exactly the columns the models were trained on, in training order.
    # X_test also carries the 'accuracy_group' column produced by get_data,
    # which is the label and must not be handed to the model.
    fold_predictions.append(model.predict(X_test[all_features]))
# One column per fold model.
# NOTE(review): assumes each predict() result is 2-D with one column - confirm.
predictions = np.concatenate(fold_predictions, axis = 1)
print(predictions.shape)
# Majority vote across the fold models for every row.
predictions = stats.mode(predictions, axis =1)[0].reshape(-1)
print(predictions.shape)
    

In [None]:
# Store the voted labels as integers and write the submission file without the index.
# NOTE(review): this relies on the rows of `submission` being in the same order
# as the installation_id groups used to build the predictions - confirm.
voted_labels = np.round(predictions)
submission['accuracy_group'] = voted_labels.astype('int')
submission.to_csv('submission.csv', index = False)