In [1]:
import csv

# Feature rows (every column except the last) and the matching class labels
# (the last column), appended in file order by the loop below.
DATASET = []
CLASSES = []

with open('transfusion.data.txt', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader, None)  # discard the first row (presumably the column header)
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines (e.g. a trailing newline);
            # indexing values[-1] on such a row would raise IndexError.
            continue
        values = [float(val) for val in row]
        DATASET.append(values[:-1])
        CLASSES.append(values[-1])
        

In [2]:
# Load libs

import numpy as np

In [3]:
# Discretise every retained feature column into CATEGORY_SIZE equal-width bins.
# A plain ndarray is enough here: only the transposed nested list is used.
categories_dataset = np.asarray(DATASET, dtype=float)
categories_dataset = np.delete(categories_dataset, 2, 1) # exclude monetary column
categories_dataset = categories_dataset.T.tolist()  # one inner list per feature column
column_count = len(categories_dataset)

# CATEGORIES[k][i] is the bin index (stored as a float) of row k in column i.
CATEGORIES = [[-1]*column_count for _ in DATASET]
CATEGORY_SIZE = 10

for i, column in enumerate(categories_dataset):
    min_val = min(column)
    max_val = max(column)
    interval_step = (max_val - min_val)/CATEGORY_SIZE

    if interval_step == 0:
        # Constant column: every value belongs to the single bin 0.  Without
        # this guard the floor-division below raises ZeroDivisionError.
        for k in range(len(column)):
            CATEGORIES[k][i] = 0.0
        continue

    last_bin = float(CATEGORY_SIZE - 1)
    for k, value in enumerate(column):
        # The column maximum lands exactly on CATEGORY_SIZE; fold it into the
        # last bin so indices stay within 0 .. CATEGORY_SIZE-1.
        CATEGORIES[k][i] = min((value - min_val)//interval_step, last_bin)
    

In [4]:
from collections import defaultdict

# Per-row marker attributes for the 3-D scatter: SIZES holds how many rows
# share a row's category combination, COLORS the fraction of those rows whose
# class index is 1.
COLORS = []
SIZES = []

# Maps a category-combination key to [count with class 0, count with class 1].
CATEGORY_CLASS_MAP = defaultdict(lambda: [0, 0])

# One key per row: the row's integer category indices concatenated as text.
row_keys = [''.join(str(int(category)) for category in cats) for cats in CATEGORIES]

for position, key in enumerate(row_keys):
    label = int(CLASSES[position])
    CATEGORY_CLASS_MAP[key][label] += 1

for key in row_keys:
    class_counts = CATEGORY_CLASS_MAP[key]
    total = sum(class_counts)
    SIZES.append(total)
    COLORS.append(class_counts[1]/total)

In [5]:
# Plotly offline helpers plus the graph-object classes used by the plot below.
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# NOTE(review): this wildcard import is presumably where Scatter3d, Data,
# Layout, Scene, XAxis, YAxis, ZAxis and Figure come from — confirm against
# the installed plotly version.
from plotly.graph_objs import *
# Must run before iplot() is called in this notebook session.
init_notebook_mode(connected=True)

def fig_3d(x, y, z, sizes, colors, legend):
    """Render an interactive 3-D bubble scatter in the notebook.

    x, y, z -- sequences of indexable items; item[0] of each is the coordinate.
    sizes   -- per-point marker sizes, area-scaled against the largest one.
    colors  -- per-point values mapped onto the 'Jet' colour scale.
    legend  -- axis titles: legend[0], legend[1], legend[2] label x, y and z.
    """
    xs, ys, zs = ([point[0] for point in axis] for axis in (x, y, z))

    marker_style = dict(
        size=sizes,
        sizemode='area',
        sizeref=2.*max(sizes)/(40.**2),
        sizemin=20,
        color=colors,
        colorscale='Jet',
        opacity=0.8
    )
    points = Scatter3d(x=xs, y=ys, z=zs, mode='markers', marker=marker_style)

    # All three axes share one look; only the title differs.
    axis_style = dict(
        gridcolor='rgb(255, 255, 255)',
        zerolinecolor='rgb(255, 255, 255)',
        showbackground=True,
        backgroundcolor='rgb(230, 230,230)',
    )
    scene = Scene(
        xaxis=XAxis(title=legend[0], **axis_style),
        yaxis=YAxis(title=legend[1], **axis_style),
        zaxis=ZAxis(title=legend[2], **axis_style),
    )
    layout = Layout(autosize=True, title='Parametric Plot', scene=scene)

    iplot(Figure(data=Data([points]), layout=layout))
    
# np.matrix keeps column slices 2-D, so each column becomes a list of
# one-element lists, which is the shape fig_3d unpacks with item[0].
temp_cats = np.matrix(CATEGORIES)
category_columns = [temp_cats[:, col].tolist() for col in range(3)]
fig_3d(*category_columns, SIZES, COLORS, legend=['Recency', 'Frequency', 'Time'])

In [6]:
# Calculate components

# Centre every feature on zero.  np.matrix is kept on purpose: the plotting
# cell slices projected_data[:, i] and relies on that slice staying 2-D.
data_matrix = np.matrix(DATASET)
data_matrix = data_matrix.T                 # features x samples
means = data_matrix.mean(axis=1)            # one mean per feature (column vector)
data_matrix = (data_matrix - means).T       # back to samples x features

COMPONENTS_COUNT = 2
cov_matrix = np.cov(data_matrix.T)
# The covariance matrix is symmetric, so use the symmetric solver.  It returns
# eigenvalues in ascending order; reorder so the largest variance comes first.
values, vecs = np.linalg.eigh(cov_matrix)
order = np.argsort(values)[::-1]
values = values[order]
# Eigenvectors are the COLUMNS of the returned matrix.  Select the columns of
# the leading eigenvalues and store them as rows, so vecs keeps the
# (COMPONENTS_COUNT x features) layout the projection below expects.
vecs = vecs[:, order[:COMPONENTS_COUNT]].T

# Create projections
projected_data = np.dot(data_matrix, vecs.T)    # samples x COMPONENTS_COUNT
projected_data = projected_data.T
# Min-max scale each component independently into [0, 1].
max_values = projected_data.max(axis=1).reshape(len(projected_data), 1)
min_values = projected_data.min(axis=1).reshape(len(projected_data), 1)
projected_data = (projected_data - min_values) / (max_values - min_values)
projected_data = projected_data.T

In [7]:
# Same plotly imports and notebook initialisation as the 3-D plotting cell;
# presumably repeated so this cell can be run on its own.
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# NOTE(review): this wildcard import is presumably where Scatter, Data, Layout,
# Scene, XAxis, YAxis, ZAxis and Figure come from — confirm.
from plotly.graph_objs import *
init_notebook_mode(connected=True)

def fig_2d(x, y):
    """Render a 2-D marker scatter in the notebook.

    x, y -- sequences of indexable items; item[0] of each is the coordinate.
    """
    xs = [point[0] for point in x]
    ys = [point[0] for point in y]

    points = Scatter(
        x=xs,
        y=ys,
        mode='markers',
        marker=dict(size=6, opacity=0.8)
    )

    # One shared look for every axis object handed to the layout.
    axis_style = dict(
        gridcolor='rgb(255, 255, 255)',
        zerolinecolor='rgb(255, 255, 255)',
        showbackground=True,
        backgroundcolor='rgb(230, 230,230)'
    )
    # NOTE(review): the axes are attached under `scene`, exactly as in fig_3d;
    # presumably that only affects 3-D traces — confirm it styles this plot.
    layout = Layout(
        autosize=True,
        title='Parametric Plot',
        scene=Scene(
            xaxis=XAxis(**axis_style),
            yaxis=YAxis(**axis_style),
            zaxis=ZAxis(**axis_style)
        )
    )

    iplot(Figure(data=Data([points]), layout=layout))
    


# Plot the two projected coordinates against each other.
projection_x = projected_data[:, 0].tolist()
projection_y = projected_data[:, 1].tolist()
fig_2d(projection_x, projection_y)


In [8]:
# Cross validation

from random import random, shuffle
from math import floor

# Bounds of the randomly drawn fraction of each class that goes to training.
TRAINSET_SIZE_MIN = 0.4
TRAINSET_SIZE_MAX = 0.7

def generate_test_and_train_sets(classed_dataset):
    """Make one random per-class train/test split.

    classed_dataset -- mapping from class label to the list of samples that
                       carry that label.  The lists are not modified.

    A single training fraction is drawn uniformly between TRAINSET_SIZE_MIN and
    TRAINSET_SIZE_MAX and applied to every class, so both sets keep roughly the
    original class proportions.

    Returns (train_dataset, test_dataset): two lists that together contain
    every input sample exactly once.
    """
    train_dataset = []
    test_dataset = []
    trainset_percent_size = TRAINSET_SIZE_MIN + (TRAINSET_SIZE_MAX-TRAINSET_SIZE_MIN)*random()

    for dataset in classed_dataset.values():
        # Shuffle a copy so the caller's lists keep their order.
        shuffled = list(dataset)
        shuffle(shuffled)

        trainset_size = floor(len(shuffled)*trainset_percent_size)
        train_dataset.extend(shuffled[:trainset_size])
        # The test slice starts exactly where the train slice ends; starting
        # at trainset_size+1 would drop one sample per class from both sets.
        test_dataset.extend(shuffled[trainset_size:])

    return train_dataset, test_dataset
    
    
    


In [9]:
# Classifier processor

from collections import defaultdict
from sklearn import metrics

def _unzip_pairs(pairs):
    """Split (features, label) pairs into a feature list and a label list."""
    return [pair[0] for pair in pairs], [pair[1] for pair in pairs]


def train_classifier(classifier, data, classes, epoch_count=1000):
    """Repeatedly fit and score ``classifier`` on random train/test splits.

    classifier  -- estimator exposing fit(X, y) and predict(X); if it also has
                   predict_proba, roc_auc_score is computed from column 1 of
                   its output.
    data        -- sequence of feature rows.
    classes     -- sequence of class labels, parallel to ``data``.
    epoch_count -- number of random splits to average over (default 1000).

    Samples are grouped by label and split with generate_test_and_train_sets
    on every epoch.  Each metric is averaged over all epochs, printed as
    'Metric <name>: <value>' and returned in a dict keyed by metric name.  When
    the classifier has no predict_proba, roc_auc_score is not printed and
    stays 0 in the returned dict.

    The classifier is left fitted on the last training split.

    Raises ValueError if epoch_count is smaller than 1.
    """
    if epoch_count < 1:
        raise ValueError('epoch_count must be at least 1')

    # Group (features, label) pairs by label so every split is made per class.
    classified_dataset = defaultdict(list)
    for pair in zip(data, classes):
        classified_dataset[pair[1]].append(pair)

    metrics_names = ['f1_score', 'average_precision_score', 'accuracy_score', 'recall_score']
    proba_metrics_names = ['roc_auc_score',]
    metrics_values = {name: 0 for name in metrics_names + proba_metrics_names}

    has_proba = hasattr(classifier, 'predict_proba')

    for _ in range(epoch_count):
        train_dataset, test_dataset = generate_test_and_train_sets(classified_dataset)
        train_x, train_y = _unzip_pairs(train_dataset)
        test_x, test_y = _unzip_pairs(test_dataset)

        classifier.fit(train_x, train_y)
        predict = classifier.predict(test_x)

        # NOTE(review): average_precision_score receives hard predictions here,
        # as it always has; its second argument is documented as scores —
        # confirm whether predict_proba output was intended instead.
        for name in metrics_names:
            metrics_values[name] += getattr(metrics, name)(test_y, predict)

        if has_proba:
            # Column 1 is taken as the score of the positive class.
            predict_proba = classifier.predict_proba(test_x)[:, 1]
            for name in proba_metrics_names:
                metrics_values[name] += getattr(metrics, name)(test_y, predict_proba)

    # Average over the number of epochs actually run.  Dividing by the loop
    # variable (epoch_count - 1 after the last iteration) would inflate every
    # metric.
    reported_names = metrics_names + (proba_metrics_names if has_proba else [])
    for name in reported_names:
        metrics_values[name] /= epoch_count
        print('Metric %s: %s'%(name, metrics_values[name]))

    return metrics_values


In [10]:
from collections import defaultdict

# Marker colour per predicted class index.  Deliberately not named COLORS:
# that name already holds the per-point colour values built for the 3-D
# scatter, and rebinding it here would break re-running that cell.
CLASS_COLORS = ['red', 'green', 'blue', 'black', 'orange']

def classificate_and_display(classifier, dataset, class_count):
    """Plot the first two columns of ``dataset`` coloured by predicted class.

    classifier  -- fitted estimator exposing predict(X).
    dataset     -- 2-D array-like (np.matrix or ndarray) with at least two
                   columns; column 0 is plotted on x and column 1 on y.
    class_count -- unused; kept so existing calls keep working.

    One scatter trace is drawn per predicted class.  Colours come from
    CLASS_COLORS and wrap around when there are more classes than colours.
    """
    # Work on a plain ndarray: column slices are then flat lists for both
    # np.matrix and ndarray inputs, and recent scikit-learn releases reject
    # np.matrix input to predict().
    points = np.asarray(dataset)
    x = points[:, 0].tolist()
    y = points[:, 1].tolist()

    predict_result = classifier.predict(points)

    # Bucket the coordinates by predicted class.
    classified_results = defaultdict(lambda: {'x':[], 'y':[]})
    for class_number, x_val, y_val in zip(predict_result, x, y):
        bucket = classified_results[int(class_number)]
        bucket['x'].append(x_val)
        bucket['y'].append(y_val)

    scatters_list = [
        Scatter(
            x=coords['x'],
            y=coords['y'],
            mode='markers',
            marker=dict(
                size=6,
                opacity=0.8,
                color=CLASS_COLORS[class_number % len(CLASS_COLORS)]
            )
        )
        for class_number, coords in classified_results.items()
    ]

    data = Data(scatters_list)
    # One shared look for every axis object handed to the layout.
    axis_style = dict(
        gridcolor='rgb(255, 255, 255)',
        zerolinecolor='rgb(255, 255, 255)',
        showbackground=True,
        backgroundcolor='rgb(230, 230,230)'
    )
    layout = Layout(
        autosize=True,
        title='Parametric Plot',
        scene=Scene(
            xaxis=XAxis(**axis_style),
            yaxis=YAxis(**axis_style),
            zaxis=ZAxis(**axis_style)
        )
    )

    fig = Figure(data=data, layout=layout)
    iplot(fig)

In [30]:
# Feature matrix used by the classifier cells: every column of DATASET.
dataset = np.matrix(DATASET)
# Disabled experiment: would append the binned CATEGORIES columns as extra
# features next to the raw ones.
#dataset = np.concatenate((dataset, np.matrix(CATEGORIES)), axis=1)

In [22]:
# KNN classifier
# Scores a 5-nearest-neighbours model over the repeated random splits run by
# train_classifier, then plots its predictions for every row of `dataset`.
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)

train_classifier(classifier, dataset.tolist(), CLASSES)
# The classifier is left fitted on the last random training split; the final
# argument (2) is accepted by classificate_and_display but not used by it.
classificate_and_display(classifier, dataset, 2)
    

Metric f1_score: 0.329206917032
Metric average_precision_score: 0.296330536241
Metric accuracy_score: 0.75114337022
Metric recall_score: 0.260385851259
Metric roc_auc_score: 0.661738747275


In [24]:
# SVM classifier
# Scores a linear SVM with default settings over the repeated random splits
# run by train_classifier, then plots its predictions for all of `dataset`.
from sklearn.svm import LinearSVC
classifier = LinearSVC()

# train_classifier only reports roc_auc_score when the estimator has
# predict_proba; presumably LinearSVC lacks it, which would explain why that
# line is missing from this cell's output.
train_classifier(classifier, dataset.tolist(), CLASSES)
classificate_and_display(classifier, dataset, 2)

Metric f1_score: 0.128195734655
Metric average_precision_score: 0.253159661086
Metric accuracy_score: 0.686937431219
Metric recall_score: 0.231603915974


In [31]:
# Logistic regression classifier
# Scores a logistic-regression model over the repeated random splits run by
# train_classifier, then plots its predictions for every row of `dataset`.
from sklearn import linear_model
# C=1e5: in scikit-learn C is the inverse regularisation strength, so this
# large value leaves the model only weakly regularised.
classifier = linear_model.LogisticRegression(C=1e5)

train_classifier(classifier, dataset.tolist(), CLASSES)
classificate_and_display(classifier, dataset, 2)

Metric f1_score: 0.21839941393
Metric average_precision_score: 0.290153547675
Metric accuracy_score: 0.775462914377
Metric recall_score: 0.135001084739
Metric roc_auc_score: 0.750730680551
