In [72]:
import pandas as pd
import numpy as np
import lightgbm as lgb

In [73]:
user = pd.read_csv(".\\data\\user_data.csv")
problem = pd.read_csv(".\\data\\problem_data.csv")
train_submission= pd.read_csv(".\\data\\train_submissions.csv")
test_submission = pd.read_csv(".\\data\\test.csv")
test_submission.head(5)

Unnamed: 0,ID,user_id,problem_id
0,user_856_prob_5822,user_856,prob_5822
1,user_2642_prob_2334,user_2642,prob_2334
2,user_2557_prob_2920,user_2557,prob_2920
3,user_1572_prob_4598,user_1572,prob_4598
4,user_295_prob_6139,user_295,prob_6139


In [74]:
train_target = train_submission['attempts_range']
# print train_submission.shape
train = train_submission.loc[:, train_submission.columns != 'attempts_range']
test = test_submission[['user_id', 'problem_id']]
print train.columns
print test.columns
print train.shape
print test.shape

Index([u'user_id', u'problem_id'], dtype='object')
Index([u'user_id', u'problem_id'], dtype='object')
(155295, 2)
(66555, 2)


In [75]:
train_len = train.shape[0]
train_len

155295

In [76]:
total_data = train.append(test)
total_data.reset_index(drop=True, inplace=True)
total_data.shape

(221850, 2)

In [77]:
total_data = pd.merge(total_data, user, on='user_id')
total_data = pd.merge(total_data, problem, on='problem_id')
# total_data.to_csv("total_data.csv")

In [78]:
total_data.dtypes

user_id                       object
problem_id                    object
submission_count               int64
problem_solved                 int64
contribution                   int64
country                       object
follower_count                 int64
last_online_time_seconds       int64
max_rating                   float64
rating                       float64
rank                          object
registration_time_seconds      int64
level_type                    object
points                       float64
tags                          object
dtype: object

In [79]:
user_id = total_data['user_id']
problem_id = total_data['problem_id']
total_data = total_data.loc[:, total_data.columns != 'user_id']
total_data = total_data.loc[:, total_data.columns != 'problem_id']
total_data = total_data.loc[:, total_data.columns != 'tags']
total_data = total_data.loc[:, total_data.columns != 'points']

In [80]:
total_data.dtypes
total_data.shape

(221850, 11)

In [81]:
col_to_encode = ['country', 'rank', 'level_type']

In [82]:
total_data['country'] = total_data['country'].fillna('India')
total_data['rank'] = total_data['rank'].fillna('beginner')
total_data['level_type'] = total_data['level_type'].fillna('C')

In [83]:
# for c in col_to_encode:
#     print c
#     print total_data[c].value_counts()

In [84]:
from sklearn.preprocessing import LabelEncoder
for c in col_to_encode:
    le = LabelEncoder()
    total_data[c] = le.fit_transform(total_data[c])
    print le.classes_

['Argentina' 'Armenia' 'Australia' 'Austria' 'Azerbaijan' 'Bangladesh'
 'Belarus' 'Belgium' 'Bolivia' 'Bosnia and Herzegovina' 'Brazil' 'Bulgaria'
 'Canada' 'Chile' 'China' 'Christmas Island' 'Colombia' 'Costa Rica'
 'Croatia' 'Cuba' 'Czechia' 'Egypt' 'Estonia' 'Finland' 'France' 'Georgia'
 'Germany' 'Haiti' 'Hong Kong' 'Hungary' 'Iceland' 'India' 'Indonesia'
 'Iran' 'Israel' 'Italy' 'Japan' 'Jordan' 'Kazakhstan' 'Kyrgyzstan' 'Laos'
 'Latvia' 'Lebanon' 'Lithuania' 'Macedonia' 'Malaysia' 'Mexico' 'Moldova'
 'Mongolia' 'Morocco' 'Netherlands' 'North Korea' 'Norway' 'Peru'
 'Philippines' 'Poland' 'Romania' 'Russia' 'Serbia' 'Singapore' 'Slovakia'
 'South Africa' 'South Korea' 'Spain' 'Swaziland' 'Switzerland' 'Syria'
 'Taiwan' 'Tajikistan' 'Thailand' 'Trinidad and Tobago' 'Tunisia'
 'Turkmenistan' 'Ukraine' 'United Kingdom' 'United States' 'Uzbekistan'
 'Venezuela' 'Vietnam']
['advanced' 'beginner' 'expert' 'intermediate']
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N']


In [85]:
for c in total_data.columns:
    print total_data[c].isnull().any(), c

False submission_count
False problem_solved
False contribution
False country
False follower_count
False last_online_time_seconds
False max_rating
False rating
False rank
False registration_time_seconds
False level_type


In [86]:
print total_data.shape
print total_data.columns

(221850, 11)
Index([u'submission_count', u'problem_solved', u'contribution', u'country',
       u'follower_count', u'last_online_time_seconds', u'max_rating',
       u'rating', u'rank', u'registration_time_seconds', u'level_type'],
      dtype='object')


In [87]:
train_data = total_data.iloc[:train_len, :]
submission_data = total_data.iloc[train_len:, :]
print train.shape
print submission_data.shape

(155295, 2)
(66555, 11)


In [88]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_data,train_target, test_size=0.2, random_state=2017)

In [89]:
print X_train.shape
print X_train.columns
print X_test.shape
print X_test.columns
print y_train.shape
# print y_train.columns
print pd.DataFrame(y_test)['attempts_range'].value_counts()

(124236, 11)
Index([u'submission_count', u'problem_solved', u'contribution', u'country',
       u'follower_count', u'last_online_time_seconds', u'max_rating',
       u'rating', u'rank', u'registration_time_seconds', u'level_type'],
      dtype='object')
(31059, 11)
Index([u'submission_count', u'problem_solved', u'contribution', u'country',
       u'follower_count', u'last_online_time_seconds', u'max_rating',
       u'rating', u'rank', u'registration_time_seconds', u'level_type'],
      dtype='object')
(124236L,)
1    16393
2     9564
3     2880
4     1118
6      627
5      477
Name: attempts_range, dtype: int64


In [90]:
X_train_array = X_train.as_matrix()
X_test_array = X_test.as_matrix()
# y_train = np.array()

In [91]:
y_test = y_test-1
y_train = y_train-1
# y_test

In [92]:
dtrain = lgb.Dataset(X_train_array, label=y_train)
dtest = lgb.Dataset(X_test_array, label=y_test)

In [93]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_leaves': 64,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1,
    'max_bin': 512,
    'subsample_for_bin': 200,
    'subsample': 1, 
    'subsample_freq': 1, 
    'colsample_bytree': 0.8, 
    'reg_alpha': 5, 
    'reg_lambda': 10,
    'min_split_gain': 0.5, 
    'min_child_weight': 1, 
    'min_child_samples': 5, 
    'scale_pos_weight': 1,
    'num_class' : 6
    }
gbm = lgb.train(params, dtrain, num_boost_round=200, valid_sets=dtest, early_stopping_rounds=10)

[1]	valid_0's multi_logloss: 1.78609
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's multi_logloss: 1.78122
[3]	valid_0's multi_logloss: 1.78102
[4]	valid_0's multi_logloss: 1.78013
[5]	valid_0's multi_logloss: 1.77994
[6]	valid_0's multi_logloss: 1.77975
[7]	valid_0's multi_logloss: 1.77956
[8]	valid_0's multi_logloss: 1.77937
[9]	valid_0's multi_logloss: 1.77918
[10]	valid_0's multi_logloss: 1.779
[11]	valid_0's multi_logloss: 1.77882
[12]	valid_0's multi_logloss: 1.77882
[13]	valid_0's multi_logloss: 1.77864
[14]	valid_0's multi_logloss: 1.77846
[15]	valid_0's multi_logloss: 1.77828
[16]	valid_0's multi_logloss: 1.77811
[17]	valid_0's multi_logloss: 1.77793
[18]	valid_0's multi_logloss: 1.77776
[19]	valid_0's multi_logloss: 1.77759
[20]	valid_0's multi_logloss: 1.77742
[21]	valid_0's multi_logloss: 1.77726
[22]	valid_0's multi_logloss: 1.77273
[23]	valid_0's multi_logloss: 1.76828
[24]	valid_0's multi_logloss: 1.76392
[25]	valid_0's multi_logloss: 1.76376

In [94]:
# print submission_data.columns
# dsubmission = xgb.DMatrix(submission_data.as_matrix())
# pred = clf_xgb.predict(dsubmission)
# pred = pred+1
# pred
pred = gbm.predict(submission_data.as_matrix())

In [95]:
pred_ = np.argmax(pred, axis=1)
pred_ = pred_ + 1
pred_

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [96]:
submission_df = pd.concat([test_submission['ID'], pd.DataFrame(pred_)], axis=1)
submission_df.columns = ['ID', 'attempts_range']
submission_df.to_csv("test_predictions.csv", index=False)