In [58]:
import pandas as pd
import numpy as np
import lightgbm as lgb

In [59]:
user = pd.read_csv(".\\data\\user_data.csv")
problem = pd.read_csv(".\\data\\problem_data.csv")
train_submission= pd.read_csv(".\\data\\train_submissions.csv")
test_submission = pd.read_csv(".\\data\\test.csv")
test_submission.head(5)

Unnamed: 0,ID,user_id,problem_id
0,user_856_prob_5822,user_856,prob_5822
1,user_2642_prob_2334,user_2642,prob_2334
2,user_2557_prob_2920,user_2557,prob_2920
3,user_1572_prob_4598,user_1572,prob_4598
4,user_295_prob_6139,user_295,prob_6139


In [60]:
train_target = train_submission['attempts_range']
# print train_submission.shape
train = train_submission.loc[:, train_submission.columns != 'attempts_range']
test = test_submission[['user_id', 'problem_id']]
print train.columns
print test.columns
print train.shape
print test.shape

Index([u'user_id', u'problem_id'], dtype='object')
Index([u'user_id', u'problem_id'], dtype='object')
(155295, 2)
(66555, 2)


In [61]:
train_len = train.shape[0]
train_len

155295

In [62]:
total_data = train.append(test)
total_data.reset_index(drop=True, inplace=True)
total_data.shape

(221850, 2)

In [63]:
total_data = pd.merge(total_data, user, on='user_id')
total_data = pd.merge(total_data, problem, on='problem_id')
# total_data.to_csv("total_data.csv")

In [64]:
total_data.dtypes

user_id                       object
problem_id                    object
submission_count               int64
problem_solved                 int64
contribution                   int64
country                       object
follower_count                 int64
last_online_time_seconds       int64
max_rating                   float64
rating                       float64
rank                          object
registration_time_seconds      int64
level_type                    object
points                       float64
tags                          object
dtype: object

In [65]:
user_id = total_data['user_id']
problem_id = total_data['problem_id']
total_data = total_data.loc[:, total_data.columns != 'user_id']
total_data = total_data.loc[:, total_data.columns != 'problem_id']
total_data = total_data.loc[:, total_data.columns != 'tags']
total_data = total_data.loc[:, total_data.columns != 'points']

In [66]:
total_data['sub_to_prob'] = total_data['problem_solved'] / total_data['submission_count']
total_data['last_online_time_seconds'] = total_data['last_online_time_seconds']/3600

In [67]:
col_to_rem = ['submission_count', 'problem_solved', 'country', 'max_rating', 'rating', 'registration_time_seconds']
for c in col_to_rem:
    total_data = total_data.loc[:, total_data.columns != c]

In [68]:
total_data.dtypes
total_data.shape

(221850, 6)

In [69]:
col_to_encode = ['rank', 'level_type']

In [70]:
# total_data['country'] = total_data['country'].fillna('India')
total_data['rank'] = total_data['rank'].fillna('intermediate')
total_data['level_type'] = total_data['level_type'].fillna('C')

In [71]:
# for c in col_to_encode:
#     print c
#     print total_data[c].value_counts()

In [72]:
from sklearn.preprocessing import LabelEncoder
for c in col_to_encode:
    le = LabelEncoder()
    total_data[c] = le.fit_transform(total_data[c])
    print le.classes_

['advanced' 'beginner' 'expert' 'intermediate']
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N']


In [73]:
for c in total_data.columns:
    print total_data[c].isnull().any(), c

False contribution
False follower_count
False last_online_time_seconds
False rank
False level_type
False sub_to_prob


In [74]:
print total_data.shape
print total_data.columns

(221850, 6)
Index([u'contribution', u'follower_count', u'last_online_time_seconds',
       u'rank', u'level_type', u'sub_to_prob'],
      dtype='object')


In [75]:
train_data = total_data.iloc[:train_len, :]
submission_data = total_data.iloc[train_len:, :]
print train.shape
print submission_data.shape

(155295, 2)
(66555, 6)


In [76]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_data,train_target, test_size=0.2, random_state=2017)

In [77]:
print X_train.shape
print X_train.columns
print X_test.shape
print X_test.columns
print y_train.shape
# print y_train.columns
print pd.DataFrame(y_test)['attempts_range'].value_counts()

(124236, 6)
Index([u'contribution', u'follower_count', u'last_online_time_seconds',
       u'rank', u'level_type', u'sub_to_prob'],
      dtype='object')
(31059, 6)
Index([u'contribution', u'follower_count', u'last_online_time_seconds',
       u'rank', u'level_type', u'sub_to_prob'],
      dtype='object')
(124236L,)
1    16393
2     9564
3     2880
4     1118
6      627
5      477
Name: attempts_range, dtype: int64


In [78]:
X_train_array = X_train.as_matrix()
X_test_array = X_test.as_matrix()
# y_train = np.array()

In [79]:
y_test = y_test-1
y_train = y_train-1
# y_test

In [80]:
dtrain = lgb.Dataset(X_train_array, label=y_train)
dtest = lgb.Dataset(X_test_array, label=y_test)

In [86]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_leaves': 32,
    'learning_rate': 0.8,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1,
    'max_bin': 64,
    'subsample_for_bin': 200,
    'subsample': 1, 
    'subsample_freq': 1, 
    'colsample_bytree': 0.8, 
    'reg_alpha': 5, 
    'reg_lambda': 10,
    'min_split_gain': 0.5, 
    'min_child_weight': 1, 
    'min_child_samples': 5, 
    'scale_pos_weight': 1,
    'num_class' : 6
    }
gbm = lgb.train(params, dtrain, num_boost_round=200, valid_sets=dtest, early_stopping_rounds=20)

[1]	valid_0's multi_logloss: 1.44025
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's multi_logloss: 1.3967
[3]	valid_0's multi_logloss: 1.36753
[4]	valid_0's multi_logloss: 1.36422
[5]	valid_0's multi_logloss: 1.36279
[6]	valid_0's multi_logloss: 1.3624
[7]	valid_0's multi_logloss: 1.36216
[8]	valid_0's multi_logloss: 1.36223
[9]	valid_0's multi_logloss: 1.36238
[10]	valid_0's multi_logloss: 1.36248
[11]	valid_0's multi_logloss: 1.36236
[12]	valid_0's multi_logloss: 1.36235
[13]	valid_0's multi_logloss: 1.36235
[14]	valid_0's multi_logloss: 1.36237
[15]	valid_0's multi_logloss: 1.36236
[16]	valid_0's multi_logloss: 1.36245
[17]	valid_0's multi_logloss: 1.36258
[18]	valid_0's multi_logloss: 1.36258
[19]	valid_0's multi_logloss: 1.36258
[20]	valid_0's multi_logloss: 1.36258
[21]	valid_0's multi_logloss: 1.36258
[22]	valid_0's multi_logloss: 1.36258
[23]	valid_0's multi_logloss: 1.36258
[24]	valid_0's multi_logloss: 1.36258
[25]	valid_0's multi_logloss: 1.36258

In [82]:
# print submission_data.columns
# dsubmission = xgb.DMatrix(submission_data.as_matrix())
# pred = clf_xgb.predict(dsubmission)
# pred = pred+1
# pred
pred = gbm.predict(submission_data.as_matrix())

In [83]:
pred_ = np.argmax(pred, axis=1)
pred_ = pred_ + 1
# pred_

In [84]:
submission_df = pd.concat([test_submission['ID'], pd.DataFrame(pred_)], axis=1)
submission_df.columns = ['ID', 'attempts_range']
submission_df.to_csv("test_predictions.csv", index=False)