This notebook works on Kaggle's Data Science Bowl challenge.

Import necessary libraries

In [13]:
import numpy as np
import csv
from datetime import datetime, timedelta

Load Training Data

In [10]:
# Read the whole training CSV into memory as a list of rows (each row is a
# list of strings); data_ub[0] is the header row.
file_name = './Data/train_short.csv'
# The context manager closes the handle once the rows are materialised.
# newline='' lets the csv module handle line endings itself, which keeps
# quoted fields containing embedded newlines intact.
with open(file_name, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    data_ub = list(reader)

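Extract features (one row per event; 16 columns, listed in column order):
   * game_session: an integer (converted from hex)
   * installation_id: an integer (converted from hex)
   * event_id: an integer (converted from hex)
   * event_year
   * event_month
   * event_day_of_month
   * event_day_of_week
   * event_hour
   * event_minute
   * event_day_section: 1 = before 08:00, 2 = 08:00-11:59, 3 = 12:00-15:59, 4 = 16:00-19:59, 5 = 20:00 onwards
   * event_count
   * event_code
   * game_time
   * game_title_ind
   * game_type_ind
   * world (stored as game_world_ind)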

In [24]:
# Lookup tables that translate categorical strings (game title, game type,
# world) into integer codes, each paired with its running counter.
# The feature-extraction loop increments a counter *before* storing a new
# code, so with counters starting at 1 the first category seen gets code 2.
TITLES_MAPPING, title_ind = {}, 1
TYPES_MAPPING, type_ind = {}, 1
WORLDS_MAPPING, world_ind = {}, 1

In [31]:
# Build the numeric feature matrix: one row per event in data_ub.
# data_ub[0] is the CSV header, so it is excluded from the row count;
# counting it would leave a spurious all-zero row at the end of the matrix.
no_data_points = max(len(data_ub) - 1, 0)
no_features = 16
# NOTE(review): np.zeros defaults to float64, which represents integers
# exactly only up to 2**53. If any hex id (game_session looks like the widest
# one -- confirm) exceeds that, its stored value is rounded.
training_data = np.zeros([no_data_points, no_features])
titles_list = []  # not used in this cell; kept so cells relying on the name still run
for itr in range(1, len(data_ub)):
    row = data_ub[itr]

    # Hexadecimal identifiers -> integers.
    event_id = int(row[0], 16)
    installation_id = int(row[4], 16)
    game_session = int(row[1], 16)

    # Only the first 19 characters (YYYY-MM-DDTHH:MM:SS) are parsed; anything
    # after the seconds is ignored.
    timestamp = datetime.strptime(row[2][0:19], '%Y-%m-%dT%H:%M:%S')
    event_year = timestamp.year
    event_month = timestamp.month
    event_day_of_month = timestamp.day
    event_day_of_week = timestamp.weekday()  # Monday == 0 ... Sunday == 6
    event_hour = timestamp.hour
    event_minute = timestamp.minute

    # Coarse time-of-day bucket: 1 = [0, 8), 2 = [8, 12), 3 = [12, 16),
    # 4 = [16, 20), 5 = [20, 24).
    if event_hour < 8:
        event_day_section = 1
    elif event_hour < 12:
        event_day_section = 2
    elif event_hour < 16:
        event_day_section = 3
    elif event_hour < 20:
        event_day_section = 4
    else:
        event_day_section = 5

    event_count = int(row[5])
    event_code = int(row[6])
    game_time = int(row[7])

    # Categorical columns are encoded as integers: the first time a value is
    # seen, the matching counter is incremented and the new number recorded.
    game_title = row[8]
    if game_title not in TITLES_MAPPING:
        title_ind += 1
        TITLES_MAPPING[game_title] = title_ind
    game_title_ind = TITLES_MAPPING[game_title]

    game_type = row[9]
    if game_type not in TYPES_MAPPING:
        type_ind += 1
        TYPES_MAPPING[game_type] = type_ind
    game_type_ind = TYPES_MAPPING[game_type]

    game_world = row[10]
    if game_world not in WORLDS_MAPPING:
        world_ind += 1
        WORLDS_MAPPING[game_world] = world_ind
    game_world_ind = WORLDS_MAPPING[game_world]

    # Column order of the feature matrix.
    data_point = [game_session, installation_id, event_id, event_year, event_month,
                  event_day_of_month, event_day_of_week, event_hour,
                  event_minute, event_day_section, event_count, event_code,
                  game_time, game_title_ind, game_type_ind, game_world_ind]

    # itr is 1-based because of the skipped header; the matrix is 0-based.
    training_data[itr - 1, :] = data_point

**TODO**: Process the training data to compute the number of correct and incorrect attempts

**TODO**: Do a sanity check to see whether the processed data matches the results in the train_labels file

Create training labels

In [33]:
# Load the label CSV and turn it into a numeric matrix with the columns
# [installation_id, game_session, no_corrects, no_incorrects, accuracy,
#  accuracy_group], one row per line of the label file.
file_name = './Data/train_labels_short.csv'
# The context manager closes the handle once the rows are read; newline=''
# lets the csv module handle line endings itself.
with open(file_name, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    data_ub = list(reader)  # NOTE: rebinds data_ub, which held the training rows

# Size the matrix from the label file itself (minus its header row) instead of
# from the training set, so it has no zero-padded rows.
no_label_points = max(len(data_ub) - 1, 0)
training_labels = np.zeros([no_label_points, 6])
titles_list = []  # not used in this cell; kept so cells relying on the name still run
for itr in range(1, len(data_ub)):
    row = data_ub[itr]
    # Assumed column layout (TODO confirm against the CSV header):
    #   0 game_session, 1 installation_id, 2 title, 3 num_correct,
    #   4 num_incorrect, 5 accuracy, 6 accuracy_group
    # The ids were previously read from columns 1 and 4, i.e. with the
    # train.csv indices; column 4 is parsed below as the decimal
    # incorrect-count, so it cannot also be the hex installation id.
    game_session = int(row[0], 16)
    installation_id = int(row[1], 16)
    no_corrects = int(row[3])
    no_incorrects = int(row[4])
    accuracy = float(row[5])
    accuracy_group = int(row[6])

    label_point = [installation_id, game_session, no_corrects, no_incorrects,
                   accuracy, accuracy_group]
    # itr is 1-based because of the skipped header; the matrix is 0-based.
    training_labels[itr - 1, :] = label_point

**TODO**: Analyze feature correlations and importance