In [130]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import mixture
from sklearn.neighbors.kde import KernelDensity
from sklearn.cluster import KMeans
from sklearn import svm

def kMeans(X, classes):  
    numClasses = len(classes)
    kmeans = KMeans(n_clusters=numClasses).fit(X)
    
    return kmeans


def svm(X, y):
    clf = svm.SVC()
    clf.fit(X, y)
    
    svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
        decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
        max_iter=-1, probability=False, random_state=None, shrinking=True,
        tol=0.001, verbose=False)
    
    return clf
    

def gmm(points):
    clf = mixture.GaussianMixture(n_components=6, covariance_type='full')
    pdfs = clf.fit(points).score_samples(points)
        
    return pdfs

def loadDensitiesByClass(instances, indexesByClass):
    pdfs = [None] * len(instances)
    for c, indexes in indexesByClass.items():
        points = instances[indexes]
        pdfsByClass = gmm(points)
        a = 0
        for i in indexes:
            pdfs[i]=pdfsByClass[a]
            a+=1
        
    return pdfs  


def kde(points):
    kernel = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(points)
    pdfs = kernel.score_samples(points)
    
    return pdfs


def baseClassifier(instancesToPredict, classifier):
    return classifier.predict(instancesToPredict)


#Slicing instances according to their inferred clusters
def slicingClusteredData(clusters, classes):
    indexes = {}
    for c in range(numClasses):
        indexes[classes[c]]=[i for i in range(len(clusters)) if clusters[i] == c]
    
    return indexes


#Cutting data for next iteration
def compactingDataDensityBased(instances, densities, criteria):
    maxPDF = max(densities)*criteria
    selectedInstances = [instances[i] for i in range(len(densities)) if densities[i] >= maxPDF]
    return selectedInstances
    
    
def main():
    #current directory
    path = os.getcwd() 

    '''
    Reading NOAA dataset:
    Eight  features  (average temperature, minimum temperature, maximum temperature, dew
    point,  sea  level  pressure,  visibility,  average wind speed, maximum  wind  speed)
    are  used  to  determine  whether  each  day  experienced  rain  or no rain.
    '''
    dataValues = pd.read_csv(path+'\\noaa_data.csv',sep = ",")
    dataLabels = pd.read_csv(path+'\\noaa_label.csv',sep = ",")


    #Test 0: Predicting 10 instances. Starting labeled data with 5%
    initialDataLength = round((0.001)*len(dataValues))
    U = dataValues.loc[initialDataLength:(initialDataLength+10)].copy()
    U = U.values

    # ***** Box 0 *****
    X = dataValues.loc[:initialDataLength].copy()
    X = X.values
    
    #Starting the process
    for t in range(len(U)):
        print("Step ",t)
        print("Length: ", len(X))
        print("Selected data: ", X)
       
        
        # ***** Box 1 *****
        Ut = U[t]
        print("Selected unlabeled data: ", Ut)
        classes=[0, 1]

        # ***** Box 2 *****
        kmeans = kMeans(X, classes)
        clusters = kmeans.labels_
        predicted = baseClassifier(Ut, kmeans)

        indexesByClass = slicingClusteredData(np.hstack([clusters, predicted]), classes)
        instances = np.vstack([X, Ut])
        
        # ***** Box 3 *****
        #Testing with two different methods
        #pdfGmm = loadDensitiesByClass(instances, indexesByClass)
        pdfKde = loadDensitiesByClass(instances, indexesByClass)
        
        # ***** Box 4 *****
        #instancesGMM = compactingDataDensityBased(instances, pdfGmm, 0.8)
        instancesKDE = compactingDataDensityBased(instances, pdfKde, 0.8)
        
        # ***** Box 5 *****
        #X = instancesGMM
        X = instancesKDE
        
        
        
main()

Step  0
Length:  19
Selected data:  [[  1.98000000e+01   1.40000000e+01   1.01960000e+03   8.40000000e+00
    9.90000000e+00   1.59000000e+01   2.89000000e+01   1.40000000e+01]
 [  2.68000000e+01   2.22000000e+01   1.00620000e+03   8.10000000e+00
    1.09000000e+01   1.90000000e+01   3.40000000e+01   2.10000000e+01]
 [  3.46000000e+01   3.29000000e+01   1.00460000e+03   3.90000000e+00
    1.38000000e+01   2.20000000e+01   3.60000000e+01   3.31000000e+01]
 [  2.64000000e+01   2.15000000e+01   1.00690000e+03   8.00000000e+00
    1.87000000e+01   3.01000000e+01   3.99000000e+01   1.60000000e+01]
 [  1.47000000e+01   7.90000000e+00   1.00990000e+03   8.10000000e+00
    1.41000000e+01   2.20000000e+01   2.10000000e+01   9.00000000e+00]
 [  3.06000000e+01   2.27000000e+01   1.01510000e+03   1.26000000e+01
    9.30000000e+00   1.71000000e+01   4.21000000e+01   1.90000000e+01]
 [  3.72000000e+01   3.02000000e+01   1.01220000e+03   1.07000000e+01
    8.10000000e+00   1.71000000e+01   5.00000000



 19
Selected data:  [array([   26.8,    22.2,  1006.2,     8.1,    10.9,    19. ,    34. ,
          21. ]), array([   34.6,    32.9,  1004.6,     3.9,    13.8,    22. ,    36. ,
          33.1]), array([   26.4,    21.5,  1006.9,     8. ,    18.7,    30.1,    39.9,
          16. ]), array([   14.7,     7.9,  1009.9,     8.1,    14.1,    22. ,    21. ,
           9. ]), array([   30.6,    22.7,  1015.1,    12.6,     9.3,    17.1,    42.1,
          19. ]), array([   37.2,    30.2,  1012.2,    10.7,     8.1,    17.1,    50. ,
          30.9]), array([   36.3,    30.1,  1002.8,     9.7,     6. ,    17.1,    43. ,
          30.9]), array([   14.9,    10.8,  1025.6,     8. ,    19.8,    26. ,    32. ,
           7. ]), array([   10.4,     5.3,  1039.9,     4.8,    14.7,    19. ,    12.9,
           9. ]), array([   15.3,    12. ,  1038.9,     2.6,     9.2,    10.1,    19.9,
           9. ]), array([   17.8,    11.9,  1030.4,     9.2,     6.6,    10.1,    24.1,
          10. ]), array([   2



C:\Users\Convidado\Documents\Python Scripts
