In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import metrics   # Additional scikit-learn functions
from sklearn.model_selection import GridSearchCV  # Performing grid search

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

# Load the aggregated metrics table.
data = pd.read_csv('./data/aggregate_201019.csv')
target = 'reuse_7'   # name of the label column
IDcol = 'project'    # identifier column, excluded from the predictors
# Drop incomplete rows first (while the two release columns are still present,
# so NaNs in them count as well), then discard those two columns. Using
# drop() instead of pop() avoids echoing a whole column as the cell output.
data = data.dropna().drop(columns=['maven_release', 'release'])

In [None]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50,
             min_importance=10):
    """Fit ``alg`` on ``dtrain`` and return the predictors worth keeping.

    Steps:
      1. If ``useTrainCV`` is true, run ``xgb.cv`` with early stopping and set
         ``alg``'s ``n_estimators`` to the number of rounds CV retained.
      2. Fit ``alg`` on ``dtrain[predictors]`` against the module-level
         ``target`` column.
      3. Print the training-set accuracy and plot the booster's feature scores.
      4. Drop every feature whose score is below ``min_importance``.

    Parameters
    ----------
    alg : XGBClassifier
        Estimator to tune and fit; it is modified in place.
    dtrain : pandas.DataFrame
        Training frame containing the predictor columns and ``target``.
    predictors : list of str
        Column names used as model inputs. The list is not modified.
    useTrainCV : bool
        Whether to pick ``n_estimators`` through cross-validation first.
    cv_folds : int
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int
        Early-stopping patience passed to ``xgb.cv``.
    min_importance : int or float
        Features scoring below this value are reported and dropped.

    Returns
    -------
    list of str
        Names of the retained features, ordered by descending score. Only
        features reported by ``Booster.get_fscore()`` can appear here.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='mlogloss',
            early_stopping_rounds=early_stopping_rounds,
        )
        print(cvresult.shape[0])
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data. No eval_metric is passed: without an
    # eval_set nothing is evaluated during fit, and the keyword was deprecated
    # and later removed from the sklearn wrapper's fit() signature.
    alg.fit(dtrain[predictors], dtrain[target])

    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])

    # Print model report:
    print ("\nModel Report")
    print ("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))

    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    if not feat_imp.empty:
        # Plotting an empty Series raises, so only draw when there are scores.
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')

    # Build a new list instead of removing from the list being iterated:
    # removing in place skips the element that follows each removal, which let
    # low-importance features slip through.
    kept = []
    for name, score in feat_imp.items():
        if score < min_importance:
            print("Removed")
            print(name)
        else:
            kept.append(name)
    return kept

In [None]:
def f(x):
    """Map a row's ``maven_reuse`` value to an ordinal bucket from 0 to 6.

    The bucket is the position of the first upper bound the value is strictly
    below; a value that is not below any bound lands in the last bucket (6).
    """
    upper_bounds = (29, 50.6, 160, 600.2, 964.12, 3066.38)
    reuse = x['maven_reuse']
    bucket = 0
    # Walk past every bound the value is not strictly below.
    while bucket < len(upper_bounds) and not reuse < upper_bounds[bucket]:
        bucket += 1
    return int(bucket)
# Derive the 7-class label (0-6) by bucketing each row's maven_reuse with f.
data['reuse_7'] = data.apply(f, axis=1)

In [None]:
# The label was derived from maven_reuse, so the raw column must not remain
# among the features. drop() with errors='ignore' keeps this cell re-runnable
# and avoids echoing the whole column as output, unlike pop().
data = data.drop(columns=['maven_reuse'], errors='ignore')

In [None]:
# Every column except the label and the identifier is a candidate predictor.
non_feature_cols = (target, IDcol)
predictors = [col for col in data.columns if col not in non_feature_cols]

In [None]:
# Sanity check that the top bucket (class 6) is populated. Show the number of
# matching rows rather than printing one "True" line per row.
(data['reuse_7'] == 6).sum()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The fixed random_state makes the split,
# and every score tuned on it, reproducible across kernel restarts.
train, test = train_test_split(data, test_size=0.2, random_state=27)

In [None]:
# Baseline multi-class booster. n_estimators=1000 is only an upper bound:
# modelfit replaces it with the round count kept by cross-validated early stopping.
xgb1_params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'max_depth': 5,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'multi:softprob',
    'num_class': 7,
    'nthread': 4,
    'scale_pos_weight': 1,
    'seed': 27,
}
xgb1 = XGBClassifier(**xgb1_params)
# modelfit returns the retained feature names, which become the new predictor set.
predictors = modelfit(xgb1, train, predictors)

In [None]:
# Coarse grid over tree depth and minimum child weight, scored by negative log-loss.
param_test1 = {
    'max_depth': range(3, 10, 1),
    'min_child_weight': range(1, 6, 1),
}
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=21, max_depth=5, min_child_weight=1,
        gamma=0.0, subsample=0.8, colsample_bytree=0.8,
        objective='multi:softprob', num_class=7, nthread=4,
        scale_pos_weight=1, seed=27),
    param_grid=param_test1,
    scoring='neg_log_loss',
    n_jobs=4,
    # `iid` is intentionally not passed: scikit-learn removed the argument in
    # 0.24 (passing it raises TypeError), and plain per-fold averaging, which
    # iid=False requested, is the default behaviour.
    cv=5,
)
gsearch1.fit(train[predictors], train[target])
gsearch1.best_params_, gsearch1.best_score_

In [None]:
# Second grid over tree depth and minimum child weight, with shallower trees
# and a wider child-weight range.
param_test2 = {
    'max_depth': [2, 3, 4, 5],
    'min_child_weight': [1, 2, 3, 4, 5, 6, 7, 8, 9],
}
gsearch2 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=21, max_depth=5, min_child_weight=2,
        gamma=0.0, subsample=0.8, colsample_bytree=0.8,
        objective='multi:softprob', nthread=4, scale_pos_weight=1,
        seed=27, num_class=7),
    param_grid=param_test2,
    scoring='neg_log_loss',
    n_jobs=4,
    # `iid` is intentionally not passed: scikit-learn removed the argument in
    # 0.24 (passing it raises TypeError), and plain per-fold averaging, which
    # iid=False requested, is the default behaviour.
    cv=5,
)
gsearch2.fit(train[predictors], train[target])
gsearch2.best_params_, gsearch2.best_score_

In [None]:
# Search over minimum child weight only, with max_depth fixed at 2.
param_test2b = {
    'min_child_weight': [1, 2, 4, 6, 8, 10, 12, 14, 16, 21, 22, 23, 24, 25],
}
gsearch2b = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=21, max_depth=2, min_child_weight=2,
        gamma=0.0, subsample=0.8, colsample_bytree=0.8,
        objective='multi:softprob', nthread=4, scale_pos_weight=1,
        seed=27, num_class=7),
    param_grid=param_test2b,
    scoring='neg_log_loss',
    n_jobs=4,
    # `iid` is intentionally not passed: scikit-learn removed the argument in
    # 0.24 (passing it raises TypeError), and plain per-fold averaging, which
    # iid=False requested, is the default behaviour.
    cv=5,
)
gsearch2b.fit(train[predictors], train[target])

In [None]:
# Best parameter setting and its mean cross-validated score for the gsearch2b grid.
gsearch2b.best_params_, gsearch2b.best_score_

In [None]:
# Search over gamma in {0.0, 0.1, 0.2, 0.3, 0.4} with depth 3 and child weight 5.
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 5)],
}
gsearch3 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=21, max_depth=3, min_child_weight=5,
        gamma=0.0, subsample=0.8, colsample_bytree=0.8,
        objective='multi:softprob', nthread=4, scale_pos_weight=1,
        seed=27, num_class=7),
    param_grid=param_test3,
    scoring='neg_log_loss',
    n_jobs=4,
    # `iid` is intentionally not passed: scikit-learn removed the argument in
    # 0.24 (passing it raises TypeError), and plain per-fold averaging, which
    # iid=False requested, is the default behaviour.
    cv=5,
)
gsearch3.fit(train[predictors], train[target])

In [None]:
# Best gamma and its mean cross-validated score for the gsearch3 grid.
gsearch3.best_params_, gsearch3.best_score_

In [None]:
# Booster with max_depth=3, min_child_weight=5 and gamma=0.1. n_estimators=1000
# is only an upper bound: modelfit replaces it with the round count kept by
# cross-validated early stopping.
xgb2_params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'max_depth': 3,
    'min_child_weight': 5,
    'gamma': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'multi:softprob',
    'nthread': 4,
    'scale_pos_weight': 1,
    'num_class': 7,
    'seed': 27,
}
xgb2 = XGBClassifier(**xgb2_params)
# modelfit returns the retained feature names, which become the new predictor set.
predictors = modelfit(xgb2, train, predictors)

In [None]:
# Number of predictors returned by the modelfit call on xgb2.
print(len(predictors))

In [None]:
# Coarse grid over row and column subsampling ratios (0.6 to 0.9 in steps of 0.1).
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
}
gsearch4 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=22, max_depth=3, min_child_weight=5,
        gamma=0.1, subsample=0.8, colsample_bytree=0.8,
        objective='multi:softprob', nthread=4, scale_pos_weight=1,
        seed=27, num_class=7),
    param_grid=param_test4,
    scoring='neg_log_loss',
    n_jobs=4,
    # `iid` is intentionally not passed: scikit-learn removed the argument in
    # 0.24 (passing it raises TypeError), and plain per-fold averaging, which
    # iid=False requested, is the default behaviour.
    cv=5,
)
gsearch4.fit(train[predictors], train[target])
gsearch4.best_params_, gsearch4.best_score_

In [None]:
# Finer grid in steps of 0.05: subsample 0.75 to 0.85, colsample_bytree 0.55 to 0.70.
param_test5 = {
    'subsample': [i / 100.0 for i in range(75, 90, 5)],
    'colsample_bytree': [i / 100.0 for i in range(55, 75, 5)],
}
gsearch5 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=22, max_depth=3, min_child_weight=5,
        gamma=0.1, subsample=0.9, colsample_bytree=0.6,
        objective='multi:softprob', nthread=4, scale_pos_weight=1,
        seed=27, num_class=7),
    param_grid=param_test5,
    scoring='neg_log_loss',
    n_jobs=4,
    # `iid` is intentionally not passed: scikit-learn removed the argument in
    # 0.24 (passing it raises TypeError), and plain per-fold averaging, which
    # iid=False requested, is the default behaviour.
    cv=5,
)
gsearch5.fit(train[predictors], train[target])
gsearch5.best_score_, gsearch5.best_params_

In [None]:
# Coarse search over the reg_alpha regularisation value.
param_test6 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
}
gsearch6 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=22, max_depth=3, min_child_weight=5,
        gamma=0.1, subsample=0.85, colsample_bytree=0.7,
        objective='multi:softprob', nthread=4, scale_pos_weight=1,
        seed=27, num_class=7),
    param_grid=param_test6,
    scoring='neg_log_loss',
    n_jobs=4,
    # `iid` is intentionally not passed: scikit-learn removed the argument in
    # 0.24 (passing it raises TypeError), and plain per-fold averaging, which
    # iid=False requested, is the default behaviour.
    cv=5,
)
gsearch6.fit(train[predictors], train[target])
gsearch6.best_params_, gsearch6.best_score_

In [None]:
# Finer search over small reg_alpha values.
param_test7 = {
    'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05],
}
gsearch7 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=22, max_depth=3, min_child_weight=5,
        gamma=0.1, subsample=0.85, colsample_bytree=0.7,
        objective='multi:softprob', nthread=4, scale_pos_weight=1,
        seed=27, num_class=7),
    param_grid=param_test7,
    scoring='neg_log_loss',
    n_jobs=4,
    # `iid` is intentionally not passed: scikit-learn removed the argument in
    # 0.24 (passing it raises TypeError), and plain per-fold averaging, which
    # iid=False requested, is the default behaviour.
    cv=5,
)
gsearch7.fit(train[predictors], train[target])
gsearch7.best_params_, gsearch7.best_score_

In [None]:
# Booster with subsample=0.85, colsample_bytree=0.7 and reg_alpha=0.01.
# n_estimators=1000 is only an upper bound: modelfit replaces it with the round
# count kept by cross-validated early stopping.
xgb3_params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'max_depth': 3,
    'min_child_weight': 5,
    'gamma': 0.1,
    'subsample': 0.85,
    'colsample_bytree': 0.7,
    'reg_alpha': 0.01,
    'objective': 'multi:softprob',
    'num_class': 7,
    'nthread': 4,
    'scale_pos_weight': 1,
    'seed': 27,
}
xgb3 = XGBClassifier(**xgb3_params)
# modelfit returns the retained feature names, which become the new predictor set.
predictors = modelfit(xgb3, train, predictors)

In [None]:
# Number of predictors returned by the modelfit call on xgb3.
len(predictors)

In [None]:
# Sort the predictor names in place so later fits and frames share one column order.
predictors.sort()

In [None]:
# Refit xgb3 on the sorted predictor list.
# NOTE(review): modelfit returns the list of retained predictor names, not an
# estimator, so `xgb4` holds a list; the fitted model is still `xgb3`.
xgb4 = modelfit(xgb3, train, predictors)

Tuned hyperparameters: n_estimators = 22, max_depth = 3, min_child_weight = 5, gamma = 0.1, subsample = 0.85, colsample_bytree = 0.7

In [None]:
# Print the estimator's text representation to inspect its configuration.
print(xgb3)

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
# Structural summary of the training split (columns and dtypes).
train.info()

In [None]:
# Descriptive statistics for the training split's columns.
train.describe()

In [None]:
# Remove, in place, every training column that is not a selected predictor.
# The target and id columns are not predictors, so they are removed as well.
for unused_col in [col for col in train.columns if col not in predictors]:
    del train[unused_col]

In [None]:
# Re-inspect the training frame after the non-predictor columns were removed.
train.info()

In [None]:
# Copy of the training frame with its columns in sorted order.
sorted_train_cols = sorted(train.columns)
train3 = train.reindex(columns=sorted_train_cols)

In [None]:
# Preview the column-sorted training frame.
train3.head()

In [None]:
# Display the current predictor list.
predictors

In [None]:
# Remove, in place, every test column that is not a selected predictor.
# NOTE: the target column is not a predictor, so it is removed here too and is
# no longer available on `test` afterwards.
for unused_col in [col for col in test.columns if col not in predictors]:
    del test[unused_col]

In [None]:
# Copy of the test frame with its columns in sorted order.
sorted_test_cols = sorted(test.columns)
test2 = test.reindex(columns=sorted_test_cols)

In [None]:
# Preview the column-sorted test frame.
test2.head()

In [None]:
# Predict on the held-out rows. Select the columns through `predictors` so they
# arrive in exactly the order xgb3 was last fitted with (the sorted predictor
# list); `test` itself still has its columns in their original file order.
y_pred = xgb3.predict(test[predictors])

In [None]:
# Hard-coded list of feature names, compared against `c` further down.
# NOTE(review): presumably pasted from a model's feature-name listing — confirm the source.
d = ['synchronizedMethodsQty_average', 'synchronizedMethodsQty_max', 'lcom_stdev', 'publicMethodsQty_sum', 'publicMethodsQty_average', 'publicMethodsQty_stdev', 'publicMethodsQty_max', 'staticFieldsQty_average', 'staticFieldsQty_median', 'lambdasQty_stdev', 'logStatementsQty_average', 'logStatementsQty_stdev', 'tryCatchQty_average', 'wmc_stdev', 'rfc_average', 'defaultMethodsQty_average', 'nosi_average', 'uniqueWordsQty_stdev', 'uniqueWordsQty_median', 'stringLiteralsQty_average', 'anonymousClassesQty_average', 'mathOperationsQty_average', 'modifiers_average', 'lcc_sum', 'lcc_average', 'protectedFieldsQty_average', 'protectedFieldsQty_stdev', 'numbersQty_average', 'staticMethodsQty_average', 'staticMethodsQty_stdev', 'dit_average', 'loopQty_sum', 'loopQty_average', 'loc_stdev', 'loc_max', 'totalFieldsQty_sum', 'cbo_average', 'protectedMethodsQty_sum', 'privateFieldsQty_stdev', 'finalMethodsQty_stdev', 'innerClassesQty_sum', 'innerClassesQty_average', 'innerClassesQty_stdev']

In [None]:
# Second hard-coded list of feature names, compared against `d` further down.
# The entries appear to be in sorted order already.
c = ['anonymousClassesQty_average', 'cbo_average', 'defaultMethodsQty_average', 'dit_average', 'finalMethodsQty_stdev', 'innerClassesQty_average', 'innerClassesQty_stdev', 'innerClassesQty_sum', 'lambdasQty_stdev', 'lcc_average', 'lcc_sum', 'lcom_stdev', 'loc_max', 'loc_stdev', 'logStatementsQty_average', 'logStatementsQty_stdev', 'loopQty_average', 'loopQty_sum', 'mathOperationsQty_average', 'modifiers_average', 'nosi_average', 'numbersQty_average', 'privateFieldsQty_stdev', 'protectedFieldsQty_average', 'protectedFieldsQty_stdev', 'protectedMethodsQty_sum', 'publicMethodsQty_average', 'publicMethodsQty_max', 'publicMethodsQty_stdev', 'publicMethodsQty_sum', 'rfc_average', 'staticFieldsQty_average', 'staticFieldsQty_median', 'staticMethodsQty_average', 'staticMethodsQty_stdev', 'stringLiteralsQty_average', 'synchronizedMethodsQty_average', 'synchronizedMethodsQty_max', 'totalFieldsQty_sum', 'tryCatchQty_average', 'uniqueWordsQty_median', 'uniqueWordsQty_stdev', 'wmc_stdev']

In [None]:
# Sort `d` in place so it can be compared element-wise with `c`.
d.sort()

In [None]:
# Sort `c` in place so it can be compared element-wise with `d`.
c.sort()

In [None]:
# True when the two lists hold the same names in the same order.
d == c