In [105]:
import pandas as pd
import numpy as np

In [106]:
user = pd.read_csv(".\\data\\user_data.csv")
problem = pd.read_csv(".\\data\\problem_data.csv")
train_submission= pd.read_csv(".\\data\\train_submissions.csv")
test_submission = pd.read_csv(".\\data\\test.csv")
test_submission.head(5)

Unnamed: 0,ID,user_id,problem_id
0,user_856_prob_5822,user_856,prob_5822
1,user_2642_prob_2334,user_2642,prob_2334
2,user_2557_prob_2920,user_2557,prob_2920
3,user_1572_prob_4598,user_1572,prob_4598
4,user_295_prob_6139,user_295,prob_6139


In [107]:
train_target = train_submission['attempts_range']
# print train_submission.shape
train = train_submission.loc[:, train_submission.columns != 'attempts_range']
test = test_submission[['user_id', 'problem_id']]
print train.columns
print test.columns
print train.shape
print test.shape

Index([u'user_id', u'problem_id'], dtype='object')
Index([u'user_id', u'problem_id'], dtype='object')
(155295, 2)
(66555, 2)


In [108]:
train_len = train.shape[0]
train_len

155295

In [109]:
total_data = train.append(test)
total_data.reset_index(drop=True, inplace=True)
total_data.shape

(221850, 2)

In [110]:
total_data = pd.merge(total_data, user, on='user_id')
total_data = pd.merge(total_data, problem, on='problem_id')
# total_data.to_csv("total_data.csv")

In [111]:
total_data.dtypes

user_id                       object
problem_id                    object
submission_count               int64
problem_solved                 int64
contribution                   int64
country                       object
follower_count                 int64
last_online_time_seconds       int64
max_rating                   float64
rating                       float64
rank                          object
registration_time_seconds      int64
level_type                    object
points                       float64
tags                          object
dtype: object

In [112]:
user_id = total_data['user_id']
problem_id = total_data['problem_id']

#Remove columns with ids and incomplete/sparse data
total_data = total_data.loc[:, total_data.columns != 'user_id']
total_data = total_data.loc[:, total_data.columns != 'problem_id']
total_data = total_data.loc[:, total_data.columns != 'tags']
total_data = total_data.loc[:, total_data.columns != 'points']

In [113]:
total_data.dtypes
total_data.shape

(221850, 11)

In [114]:
col_to_encode = ['country', 'rank', 'level_type']

In [115]:
total_data['country'] = total_data['country'].fillna('India')
total_data['rank'] = total_data['rank'].fillna('beginner')
total_data['level_type'] = total_data['level_type'].fillna('C')

In [116]:
#Encode categorical data
from sklearn.preprocessing import LabelEncoder
for c in col_to_encode:
    le = LabelEncoder()
    total_data[c] = le.fit_transform(total_data[c])
    print le.classes_

['Argentina' 'Armenia' 'Australia' 'Austria' 'Azerbaijan' 'Bangladesh'
 'Belarus' 'Belgium' 'Bolivia' 'Bosnia and Herzegovina' 'Brazil' 'Bulgaria'
 'Canada' 'Chile' 'China' 'Christmas Island' 'Colombia' 'Costa Rica'
 'Croatia' 'Cuba' 'Czechia' 'Egypt' 'Estonia' 'Finland' 'France' 'Georgia'
 'Germany' 'Haiti' 'Hong Kong' 'Hungary' 'Iceland' 'India' 'Indonesia'
 'Iran' 'Israel' 'Italy' 'Japan' 'Jordan' 'Kazakhstan' 'Kyrgyzstan' 'Laos'
 'Latvia' 'Lebanon' 'Lithuania' 'Macedonia' 'Malaysia' 'Mexico' 'Moldova'
 'Mongolia' 'Morocco' 'Netherlands' 'North Korea' 'Norway' 'Peru'
 'Philippines' 'Poland' 'Romania' 'Russia' 'Serbia' 'Singapore' 'Slovakia'
 'South Africa' 'South Korea' 'Spain' 'Swaziland' 'Switzerland' 'Syria'
 'Taiwan' 'Tajikistan' 'Thailand' 'Trinidad and Tobago' 'Tunisia'
 'Turkmenistan' 'Ukraine' 'United Kingdom' 'United States' 'Uzbekistan'
 'Venezuela' 'Vietnam']
['advanced' 'beginner' 'expert' 'intermediate']
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N']


In [117]:
# for c in total_data.columns:
#     print total_data[c].isnull().any(), c

False submission_count
False problem_solved
False contribution
False country
False follower_count
False last_online_time_seconds
False max_rating
False rating
False rank
False registration_time_seconds
False level_type


In [118]:
print total_data.shape
print total_data.columns

(221850, 11)
Index([u'submission_count', u'problem_solved', u'contribution', u'country',
       u'follower_count', u'last_online_time_seconds', u'max_rating',
       u'rating', u'rank', u'registration_time_seconds', u'level_type'],
      dtype='object')


In [119]:
#split data into training set and submission set after preprocessing 
train_data = total_data.iloc[:train_len, :]
submission_data = total_data.iloc[train_len:, :]
print train.shape
print submission_data.shape

(155295, 2)
(66555, 11)


In [120]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_data,train_target, test_size=0.33, random_state=2017)

In [121]:
print X_train.shape
print X_train.columns
print X_test.shape
print X_test.columns
print y_train.shape
# print y_train.columns
print pd.DataFrame(y_test)['attempts_range'].value_counts()

(104047, 11)
Index([u'submission_count', u'problem_solved', u'contribution', u'country',
       u'follower_count', u'last_online_time_seconds', u'max_rating',
       u'rating', u'rank', u'registration_time_seconds', u'level_type'],
      dtype='object')
(51248, 11)
Index([u'submission_count', u'problem_solved', u'contribution', u'country',
       u'follower_count', u'last_online_time_seconds', u'max_rating',
       u'rating', u'rank', u'registration_time_seconds', u'level_type'],
      dtype='object')
(104047L,)
1    27170
2    15663
3     4701
4     1869
6     1039
5      806
Name: attempts_range, dtype: int64


In [122]:
X_train_array = X_train.as_matrix()
X_test_array = X_test.as_matrix()
# y_train = np.array()

In [123]:
y_test = y_test-1
y_train = y_train-1
# y_test

In [124]:
dtrain = xgb.DMatrix(X_train_array, label=y_train)
dtest = xgb.DMatrix(X_test_array, label=y_test)

In [125]:
params = {'objective': 'multi:softmax',
          'eval_metric': 'merror',
          'num_class' : 6,
          'nthread': 4,
          'silent': 1,
          'max_depth': 3,
          'subsample': 0.9,
          'min_child_weight': 5,
          "colsample_bytree": 0.9,
          'eta': 0.001,
          'seed': 2017}
watchlist = [(dtrain, 'train'), (dtest, 'test')]
num_rounds = 100
clf_xgb = xgb.train(params, dtrain, num_rounds, verbose_eval=5, evals=watchlist, early_stopping_rounds=20)

[0]	train-merror:0.465222	test-merror:0.469872
Multiple eval metrics have been passed: 'test-merror' will be used for early stopping.

Will train until test-merror hasn't improved in 20 rounds.
[5]	train-merror:0.465299	test-merror:0.469833
[10]	train-merror:0.465299	test-merror:0.469833
[15]	train-merror:0.465299	test-merror:0.469833
[20]	train-merror:0.465299	test-merror:0.469833
Stopping. Best iteration:
[1]	train-merror:0.465299	test-merror:0.469833



In [126]:
print submission_data.columns
dsubmission = xgb.DMatrix(submission_data.as_matrix())
pred = clf_xgb.predict(dsubmission)
pred = pred+1
pred

Index([u'submission_count', u'problem_solved', u'contribution', u'country',
       u'follower_count', u'last_online_time_seconds', u'max_rating',
       u'rating', u'rank', u'registration_time_seconds', u'level_type'],
      dtype='object')


array([ 1.,  1.,  1., ...,  1.,  1.,  1.], dtype=float32)

In [127]:
submission_df = pd.concat([test_submission['ID'], pd.DataFrame(pred)], axis=1)
submission_df.columns = ['ID', 'attempts_range']
submission_df.to_csv("test_predictions.csv", index=False)