In [None]:
import os
import time

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
#load data
# Directory holding the competition CSVs. Set the CASCADE_CUP_DATA_DIR
# environment variable to run the notebook on another machine; the default
# keeps the original location.
DATA_DIR = os.environ.get('CASCADE_CUP_DATA_DIR', '/home/tharun/Downloads/cascade_cup')

train_csv = pd.read_csv(os.path.join(DATA_DIR, 'train_age_dataset.csv'))
test_csv = pd.read_csv(os.path.join(DATA_DIR, 'test_age_dataset.csv'))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

In [None]:
# Per-column variance of the training frame (shown as the cell output).
# The figures below are presumably copied from an earlier run of this cell.
train_csv.var()
# Lowest-variance columns:
# num_of_hashtags_per_action         1.493600e-04
# emoji_count_per_action             4.154012e-04
# num_of_comments                    9.339641e-04

# Highest-variance columns:
# Unnamed: 0                         2.458879e+10
# userId                             3.318461e+14
# avgTimeSpent                       3.066408e+09
# avgDuration                        5.895105e+03
# avgComments                        5.851986e+01
# avgt2                              9.643022e+05
# NOTE(review): "Unnamed: 0" and userId look like row/user identifiers rather
# than features - confirm whether they should be fed to the model.

In [None]:
# Per-column variance of the test frame (shown as the cell output).
# The figures below are presumably copied from an earlier run of this cell.
test_csv.var()
# Lowest-variance columns:
# num_of_hashtags_per_action         2.411646e-04
# emoji_count_per_action             3.047195e-04
# num_of_comments                    9.694261e-04

# Highest-variance columns:
# Unnamed: 0                         2.458759e+10
# userId                             3.305619e+14
# avgTimeSpent                       6.013404e+04
# avgDuration                        5.463003e+03
# avgComments                        2.121815e+01
# avgt2                              8.467883e+05
# NOTE(review): avgTimeSpent variance is ~6.0e+04 here versus ~3.1e+09 recorded
# for the training frame - possibly outliers in train; verify.

In [None]:
#upsampling
# SMOTE oversampler, seeded so repeated runs generate the same synthetic rows.
smote = SMOTE(random_state=42)

In [None]:
#data prep
# Grab the target column, then display the train and test column indexes
# together so the two schemas can be compared.
y = train_csv.age_group
(train_csv.columns, test_csv.columns)

In [None]:
#data prep
# Target labels as a NumPy array.
y = np.array(train_csv.age_group.tolist())
# Drop the target by keyword: passing `axis` positionally to DataFrame.drop
# was deprecated in pandas 1.3 and removed in pandas 2.0.
train_csv = train_csv.drop(columns=['age_group'])
# Feature matrix: every remaining column, in frame order.
X = np.array(train_csv.values)

In [None]:
#smote
# Oversample with SMOTE. `fit_resample` is the supported method name;
# the older `fit_sample` alias was deprecated in imbalanced-learn 0.4 and
# later removed.
X_res, y_res = smote.fit_resample(X, y)

In [None]:
# Row counts after and before oversampling.
tuple(len(rows) for rows in (X_res, X))

In [None]:
#train val split
# Split the ORIGINAL data first and oversample only the training fold.
# Splitting the already-resampled X_res/y_res would let SMOTE interpolate
# synthetic training rows from rows that end up in the validation fold, which
# leaks validation information and inflates the validation score. X_res/y_res
# from the earlier cell are therefore intentionally not used here.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=0, stratify=y, shuffle=True)
X_train, y_train = smote.fit_resample(X_train, y_train)

#normalizing - scaling
# Fit the scaler on the training fold only, then reuse its statistics for the
# validation and test features.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)
test = sc.transform(test_csv.values)

In [None]:
# (training rows, feature count, validation rows)
(*X_train.shape, len(X_val))

## Model training 

In [None]:
#xgb params
# Booster configuration: softmax objective over 5 classes, with row/column
# subsampling and a small L1 penalty.
params_final = {
    'learning_rate': 0.1,
    'objective': 'multi:softmax',
    'max_depth': 6,
    'min_child_weight': 6,
    'subsample': 0.8,
    'num_class': 5,
    'colsample_bytree': 0.4,
    'reg_alpha': 0.05,
}

Build the XGBoost `DMatrix` objects and train the booster.

In [None]:
start = time.time()
#train val dmat
xgdmat_train = xgb.DMatrix(X_train, label=y_train)
xgdmat_val = xgb.DMatrix(X_val, label=y_val)
# Training parameters: the booster config plus the evaluation metric.
# eval_metric must be given as a training parameter; as a key of the
# evals_result dict it is never read as configuration.
train_params = dict(params_final, eval_metric='mlogloss')
#dict
# Output container: xgb.train fills it with the per-round metric history of
# every eval set, so it starts empty.
dct = {}
#eval list
# The validation set is listed last; per the xgboost.train docs the last
# entry is the one used for early stopping.
evals = [(xgdmat_train, 'train'), (xgdmat_val, 'valid')]
#training
xgb_final = xgb.train(train_params, xgdmat_train, num_boost_round=3000,
                      evals=evals, early_stopping_rounds=100,
                      evals_result=dct, verbose_eval=5)
# Wall-clock training time in seconds.
print(time.time() - start)

## Run predictions and build the submission

In [None]:
#test dmat
# Wrap the scaled test features (no labels) for prediction.
xgdmat_test = xgb.DMatrix(data=test)

In [None]:
#predictions on train and val
# Predict on both sets so the training and validation scores can be compared.
y_train_pred = xgb_final.predict(xgdmat_train)
y_val_pred = xgb_final.predict(xgdmat_val)

In [None]:
#prediction on test
# Predictions for the unlabeled test set, later written to the submission.
prediction = xgb_final.predict(data=xgdmat_test)

In [None]:
#print score
# Weighted F1 for the validation and training predictions.
f1sc = f1_score(y_true=y_val, y_pred=y_val_pred, average='weighted')
f1sc_train = f1_score(y_true=y_train, y_pred=y_train_pred, average='weighted')

print('f1 validation score ', f1sc)
print('f1 train score ', f1sc_train)

In [None]:
#make submission csv file
# Cast the predictions to integers. The builtin `int` replaces `np.int`,
# which was only an alias for it (deprecated in NumPy 1.20, removed in 1.24).
prediction = np.array(prediction, dtype=int)
# Written with the default row index as the first CSV column.
pd.DataFrame(prediction, columns=['prediction']).to_csv('prediction80988.csv')

In [None]:
#save model
# Persist the trained booster to disk.
# NOTE(review): the file name has no extension - newer XGBoost releases pick
# the serialization format from the extension; confirm the intended format.
xgb_final.save_model(fname="xgbmodel__fe2d")