In [1]:
import numpy as np

%matplotlib inline

import matplotlib.pyplot as plt

from IPython.display import Markdown, display
def printmd(string):
    '''
    helper print function that prints markdown
    '''
    display(Markdown(string))

In [2]:
import shared

X, Y, X_labels, Y_labels = shared.fetch_data()

Connected to DB


In [6]:
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score


def pipeline_with_classifier(classifier):
    estimators = []
    estimators.append(('standardize', sklearn.preprocessing.StandardScaler(copy=True)))
    estimators.append(('forest', classifier))
    pipeline = Pipeline(estimators)

    return pipeline

seed = 7
np.random.seed(seed)

kfold = sklearn.cross_validation.KFold(n=len(X), n_folds=10, random_state=seed)

classifiers = [
    ('DecisionTreeClassifier', DecisionTreeClassifier()),
]

In [4]:
def test_target_binary_split(index):
    Y_target = np.copy(Y[:, index])
    half = np.percentile(Y_target, 50)
    printmd("**Half is splitted by value**: {}".format(half))

    sum = 0
    for i, e in enumerate(Y_target):
        if e >= half:
            sum += 1
            Y_target[i] = 1
        else:
            Y_target[i] = 0

    print('there are two classes: good with {} memebers and bad with {} members'.format(sum, Y.shape[0] - sum))
    print("Classes ratio: %.2f%%" % (sum / Y.shape[0] * 100))

    for clf in classifiers:
        printmd('## Calculating - {}'.format(clf[0]))
        pipeline = pipeline_with_classifier(clf[1])
        results = sklearn.cross_validation.cross_val_score(pipeline, X, Y_target, cv=kfold)
        printmd("Accuracy: %.2f%%" % (results.mean() * 100))
    
    del Y_target
        
for i, e in enumerate(Y_labels):
    printmd("# Using {} as target".format(e))
    test_target_binary_split(i)

# Using stars as target

**Half is splitted by value**: 1.0

there are two classes: good with 80369 memebers and bad with 59119 members
Classes ratio: 57.62%


## Calculating - DecisionTreeClassifier

Accuracy: 62.42%

# Using forks as target

**Half is splitted by value**: 0.0

there are two classes: good with 139488 memebers and bad with 0 members
Classes ratio: 100.00%


## Calculating - DecisionTreeClassifier

Accuracy: 100.00%

# Using subscribers as target

**Half is splitted by value**: 1.0

there are two classes: good with 132780 memebers and bad with 6708 members
Classes ratio: 95.19%


## Calculating - DecisionTreeClassifier

Accuracy: 93.53%

# Using downloads as target

**Half is splitted by value**: 911.0

there are two classes: good with 69751 memebers and bad with 69737 members
Classes ratio: 50.01%


## Calculating - DecisionTreeClassifier

Accuracy: 64.41%

# Using avg_per_month as target

**Half is splitted by value**: 73.0

there are two classes: good with 69789 memebers and bad with 69699 members
Classes ratio: 50.03%


## Calculating - DecisionTreeClassifier

Accuracy: 63.76%

# Using last_month_downloads as target

**Half is splitted by value**: 42.0

there are two classes: good with 70628 memebers and bad with 68860 members
Classes ratio: 50.63%


## Calculating - DecisionTreeClassifier

Accuracy: 62.92%

# Using last_week_downloads as target

**Half is splitted by value**: 10.0

there are two classes: good with 72320 memebers and bad with 67168 members
Classes ratio: 51.85%


## Calculating - DecisionTreeClassifier

Accuracy: 61.75%

In [7]:

def test_target_quaternary_split(index):
    Y_target = np.copy(Y[:, index])
    one_fourth = np.percentile(Y_target, 25)
    one_half = np.percentile(Y_target, 50)
    three_forths = np.percentile(Y_target, 75)

    print("1/4: {}, 1/2: {}, 3/4: {}".format(one_fourth, one_half, three_forths))
    sum = [0,0,0,0]
    for i, e in enumerate(Y_target):
        if e >= three_forths:
            e = 3
            sum[3] += 1
            continue

        if e > one_half:
            e = 2
            sum[2] += 1
            continue

        if e > one_fourth:
            e = 1
            sum[1] += 1
            continue


        e = 0
        sum[0] += 1

    print("Classes 1 ratio: %.2f%%" % (sum[0] / Y.shape[0] * 100))
    print("Classes 2 ratio: %.2f%%" % (sum[1] / Y.shape[0] * 100))
    print("Classes 3 ratio: %.2f%%" % (sum[2] / Y.shape[0] * 100))
    print("Classes 4 ratio: %.2f%%" % (sum[3] / Y.shape[0] * 100))

    for clf in classifiers:
        printmd('## Calculating - {}'.format(clf[0]))
        pipeline = pipeline_with_classifier(clf[1])
        results = sklearn.cross_validation.cross_val_score(pipeline, X, Y_target, cv=kfold)
        printmd("Accuracy: %.2f%%" % (results.mean() * 100))
    
    del Y_target

    
for i, e in enumerate(Y_labels):
    printmd("# Using {} as target".format(e))
    test_target_quaternary_split(i)

# Using stars as target

1/4: 0.0, 1/2: 1.0, 3/4: 5.0
Classes 1 ratio: 42.38%
Classes 2 ratio: 15.78%
Classes 3 ratio: 16.80%
Classes 4 ratio: 25.04%


## Calculating - DecisionTreeClassifier

Accuracy: 37.00%

# Using forks as target

1/4: 0.0, 1/2: 0.0, 3/4: 1.0
Classes 1 ratio: 62.40%
Classes 2 ratio: 0.00%
Classes 3 ratio: 0.00%
Classes 4 ratio: 37.60%


## Calculating - DecisionTreeClassifier

Accuracy: 57.44%

# Using subscribers as target

1/4: 1.0, 1/2: 1.0, 3/4: 2.0
Classes 1 ratio: 62.07%
Classes 2 ratio: 0.00%
Classes 3 ratio: 0.00%
Classes 4 ratio: 37.93%


## Calculating - DecisionTreeClassifier

Accuracy: 51.04%

# Using downloads as target

1/4: 353.0, 1/2: 911.0, 3/4: 2806.0
Classes 1 ratio: 25.01%
Classes 2 ratio: 25.01%
Classes 3 ratio: 24.97%
Classes 4 ratio: 25.01%


## Calculating - DecisionTreeClassifier

Accuracy: 0.44%

# Using avg_per_month as target

1/4: 32.45, 1/2: 73.0, 3/4: 202.5513888888889
Classes 1 ratio: 25.01%
Classes 2 ratio: 25.01%
Classes 3 ratio: 24.99%
Classes 4 ratio: 25.00%


## Calculating - DecisionTreeClassifier

ValueError: Unknown label type: array([[   34.5       ],
       [   55.        ],
       [   59.625     ],
       ..., 
       [   78.73684211],
       [ 2403.07142857],
       [ 5748.7       ]])

In [8]:
# Forks were not correctly split because quartiles didn't really find even split.
# We are going to attempt to split the classes using forks manually

Y_target = np.copy(Y[:, FORKS])
half = 2
printmd("**Half is splitted by value**: {}".format(half))

sum = 0
for i, e in enumerate(Y_target):
    if e >= half:
        sum += 1
        Y_target[i] = 1
    else:
        Y_target[i] = 0

print('there are two classes: good with {} memebers and bad with {} members'.format(sum, Y.shape[0] - sum))
print("Classes ratio: %.2f%%" % (sum / Y.shape[0] * 100))

for clf in classifiers:
    printmd('## Calculating - {}'.format(clf[0]))
    pipeline = pipeline_with_classifier(clf[1])
    results = sklearn.cross_validation.cross_val_score(pipeline, X, Y_target, cv=kfold)
    printmd("Accuracy: %.2f%%" % (results.mean() * 100))



NameError: name 'FORKS' is not defined