In [1]:
import pandas as pd

In [2]:
X_train = pd.read_csv('data/X_train.csv')
y_train = pd.read_csv('data/y_train.csv')

In [3]:
X_train.shape

(2021448, 24)

In [4]:
import category_encoders as ce

In [5]:
target_encoder = ce.TargetEncoder()
X = target_encoder.fit_transform(X_train, y_train)
X.head()

Unnamed: 0,index,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,C14,C15,C16,C17,C18,C19,C21,dayofweek,day,hour
0,18201910,1005,0,0.11859,0.122438,0.128065,0.188735,0.182843,0.09302,0.147823,...,16615,320,50,1863,3,39,23,5,25,12
1,35141968,1005,1,0.298257,0.260324,0.180248,0.19897,0.195158,0.199443,0.174547,...,17264,320,50,1872,3,39,23,2,29,15
2,11358550,1005,0,0.11859,0.122438,0.128065,0.010814,0.023289,0.09302,0.174547,...,21191,320,50,2424,1,161,71,3,23,10
3,36241996,1005,0,0.11859,0.122438,0.128065,0.147823,0.111282,0.107986,0.174547,...,20751,320,50,1895,0,681,101,3,30,0
4,2987382,1005,0,0.20598,0.20598,0.208215,0.19897,0.195158,0.199443,0.174547,...,21724,320,50,2502,0,35,221,1,21,15


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y_train, stratify=y_train, test_size= 0.3, random_state= 42)

In [8]:
from sklearn import metrics

In [9]:
def model_evaluation(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)

    cnf_matrix = metrics.confusion_matrix(y_test, y_pred)

    return accuracy, cnf_matrix

In [10]:
from sklearn.model_selection import cross_val_score

In [11]:
def cross_val(model, x, y, folds):
    crossvalidation = cross_val_score(model, x, y, cv=folds, n_jobs=-1)
    return crossvalidation

In [20]:
import matplotlib.pyplot as plt
def draw_roc(model, X_test, actual):
    probs = model.predict(X_test)
    fpr, tpr, thresholds = metrics.roc_curve( actual, probs,
                                              drop_intermediate = False )
    auc_score = metrics.roc_auc_score( actual, probs )
    plt.figure(figsize=(6, 6))
    plt.plot( fpr, tpr, label='ROC curve (area = %0.2f)' % auc_score )
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

    return fpr, tpr, thresholds

In [24]:
# this stores the results in dataframe for evaluating the final result.

def store_results(name, ytrain, xtrain, ytest, xtest, model, folds):

    crossvalidation = cross_val(model, xtrain, ytrain, folds)

    accuracy_test, cm1 = model_evaluation(model, xtrain, ytrain)
    TP = cm1[1,1] # true positive
    TN = cm1[0,0] # true negatives
    FP = cm1[0,1] # false positives
    FN = cm1[1,0] # false negatives
    print(f'TP value: {TP}')
    print(f'FP value: {FP}')
    recall_test = TP / (TP+FP)
    precision_test = TP / (TP+FN)

    accuracy_train, cm1 = model_evaluation(model, xtest, ytest)
    TP = cm1[1,1] # true positive
    TN = cm1[0,0] # true negatives
    FP = cm1[0,1] # false positives
    FN = cm1[1,0] # false negatives
    recall_train = TP / (TP+FP)
    precision_train = TP / (TP+FN)

    entry = {'Model': [name],
          'Accuracy_train': [accuracy_train],
          'recall_train': [recall_train],
          'precision_train': [precision_train],
          'Accuracy_test': [accuracy_train],
          'recall_test': [recall_test],
          'precision_test': [precision_test],
          'CrossVal_Mean': [crossvalidation.mean()],
          'CrossVal1': [crossvalidation[0]],
          'CrossVal2': [crossvalidation[1]],
          'CrossVal3': [crossvalidation[2]],
          'CrossVal4': [crossvalidation[3]],
          'CrossVal5': [crossvalidation[4]],
          }
    result = pd.DataFrame(entry)
    return result



In [25]:
from sklearn.dummy import DummyClassifier

In [30]:
outcome = pd.DataFrame()
dummy_clf = DummyClassifier(strategy= "most_frequent")
dummy_clf.fit(X_train, y_train)
accuracy, cnf_matrix  = model_evaluation(dummy_clf, X_train, y_train)
print(accuracy)
print(cnf_matrix)
temp = store_results("Dummy classifier", y_train, X_train, y_test, X_test, dummy_clf, 5)
outcome = pd.concat([outcome,temp])
outcome

0.8300679923081978
[[1174557       0]
 [ 240456       0]]
TP value: 0
FP value: 0


  recall_test = TP / (TP+FP)
  recall_train = TP / (TP+FP)


Unnamed: 0,Model,Accuracy_train,recall_train,precision_train,Accuracy_test,recall_test,precision_test,CrossVal_Mean,CrossVal1,CrossVal2,CrossVal3,CrossVal4,CrossVal5
0,Dummy classifier,0.830068,,0.0,0.830068,,0.0,0.830068,0.830069,0.830069,0.830065,0.830068,0.830068


AttributeError: 'DataFrame' object has no attribute 'values_count'