In [None]:
import os
import wget
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_validate as cv
import sklearn.preprocessing as pp
from sklearn.metrics import f1_score, precision_score, recall_score, roc_curve, auc, accuracy_score, confusion_matrix
from sklearn.cluster import KMeans, SpectralClustering
from statistics import mode
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# Banknote Authentication
## Passive / Active Learning

In [None]:
# Fetch the UCI banknote-authentication data once and cache it as bank.csv in the
# working directory.
# The previous guard, `'bank.csv' in os.listdir() == False`, is a chained
# comparison: Python evaluates it as
# `('bank.csv' in os.listdir()) and (os.listdir() == False)`, which is always
# False, so the download branch could never run.
if not os.path.exists('bank.csv'):
    url2 = "https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt"
    wget.download(url2, "bank.csv")

# The raw file has no header row; name the four features and the label column.
bank_data = pd.read_csv("bank.csv", header = None)
cols_bank = ['var', 'skew', 'curtosis', 'entropy', 'class']
bank_data.columns = cols_bank

# Fixed-seed split: 900 rows for training, every remaining row for testing.
bank_train = bank_data.sample(n = 900, random_state = 78)
bank_test = bank_data.drop(bank_train.index)

In [None]:
# Passive learning: in each Monte Carlo run, grow the training set by random
# batches of 10 rows drawn from the training pool. After every batch, pick the
# L1-penalised LinearSVC penalty C by 5-fold cross-validation, refit on all rows
# drawn so far, and record the error on the held-out test set.
M2 = 50        # number of Monte Carlo runs

passive = []   # one DataFrame (training_instances, test_error, c) per run

# A training pool with a single class could never satisfy the two-class
# requirement below, so fail fast instead of looping forever.
if bank_train['class'].nunique() < 2:
    raise ValueError("bank_train must contain both classes")

for i in range(M2):

    # Stateful per-run generator: the run is reproducible, yet every call to
    # .sample() still produces a fresh draw (a plain int seed would return the
    # same batch on every retry).
    rng_passive = np.random.RandomState(i)

    bank_pool_passive = bank_train.copy()
    bank_use_passive = None
    records_passive = []

    while len(bank_pool_passive) > 0:
        n_draw = min(10, len(bank_pool_passive))

        # Make sure the accumulated training set has at least one instance of
        # each class. Only the first batch can fail this check; requiring an
        # exact 5/5 split per batch would never terminate once the pool's
        # minority class is used up.
        while True:
            bank_curr_passive = bank_pool_passive.sample(n = n_draw, random_state = rng_passive)
            if bank_use_passive is None:
                candidate = bank_curr_passive
            else:
                candidate = pd.concat([bank_use_passive, bank_curr_passive])
            if candidate['class'].nunique() >= 2:
                break

        bank_pool_passive = bank_pool_passive.drop(index = bank_curr_passive.index)
        bank_use_passive = candidate

        X_use = bank_use_passive.iloc[:, :-1]
        y_use = bank_use_passive.iloc[:, -1]

        # Cross-validation for the penalty parameter: C = 10**c for c in -3..2.
        best_c = None
        best_score = float('-inf')

        for c in range(-3, 3):
            svc_passive = LinearSVC(penalty = 'l1', dual = False, C = 10**c, random_state = i)
            # nanmean: a fold whose training part holds a single class cannot be
            # fitted and scores NaN; ignore such folds rather than letting one
            # NaN disqualify every candidate C.
            score = np.nanmean(cross_val_score(cv = 5, estimator = svc_passive, X = X_use, y = y_use, scoring = 'accuracy'))
            if score > best_score:
                best_score = score
                best_c = c

        best_svc_passive = LinearSVC(penalty = 'l1', dual = False, C = 10**best_c, random_state = i)
        best_svc_passive.fit(X_use, y_use)
        test_error_passive = 1 - best_svc_passive.score(bank_test.iloc[:, :-1], bank_test.iloc[:, -1])

        # 'c' stores the selected exponent, i.e. the model used C = 10**c.
        records_passive.append({'training_instances' : bank_use_passive.shape[0],
                                'test_error' : test_error_passive,
                                'c' : best_c})

    # Build the per-run frame once; DataFrame.append was removed in pandas 2.0.
    df = pd.DataFrame(records_passive, columns = ['training_instances', 'test_error', 'c'])
    passive.append(df)

In [None]:
# Active learning: start each Monte Carlo run from 10 random rows containing
# both classes, then repeatedly (1) pick C by 5-fold cross-validation, (2) refit
# and record the test error, and (3) move the 10 pool rows closest to the
# separating hyperplane into the training set, until the pool is exhausted.
active = []   # one DataFrame (training_instances, test_error, c) per run

for j in range(M2):

    # Stateful per-run generator so the initial draw is reproducible while
    # retries still yield different samples.
    rng_active = np.random.RandomState(j)

    bank_pool_active = bank_train.copy()

    # Initial batch: resample until both classes are present.
    while True:
        bank_use_active = bank_pool_active.sample(n = 10, random_state = rng_active)
        if bank_use_active['class'].nunique() >= 2:
            break
    bank_pool_active = bank_pool_active.drop(index = bank_use_active.index)

    records_active = []

    while True:
        X_use_active = bank_use_active.iloc[:, :-1]
        y_use_active = bank_use_active.iloc[:, -1]

        # Reset the search on every step so C is tuned for the current training
        # set rather than carried over from an earlier, smaller one.
        best_c_active = None
        best_score_active = float('-inf')

        for c_active in range(-3, 3):
            svc_active = LinearSVC(penalty = 'l1', dual = False, C = 10**c_active, random_state = j)
            # nanmean: ignore folds that could not be fitted (single-class
            # training part) instead of letting them turn the mean into NaN.
            score_active = np.nanmean(cross_val_score(cv = 5, estimator = svc_active, X = X_use_active, y = y_use_active, scoring = 'accuracy'))
            if score_active > best_score_active:
                best_score_active = score_active
                # Record this loop's own exponent (previously an unrelated
                # variable `c` from another cell was stored here).
                best_c_active = c_active

        best_svc_active = LinearSVC(penalty = 'l1', dual = False, C = 10**best_c_active, random_state = j)
        best_svc_active.fit(X_use_active, y_use_active)
        test_error_active = 1 - best_svc_active.score(bank_test.iloc[:, :-1], bank_test.iloc[:, -1])

        # 'c' stores the selected exponent, i.e. the model used C = 10**c.
        records_active.append({'training_instances' : bank_use_active.shape[0],
                               'test_error' : test_error_active,
                               'c' : best_c_active})

        # Stop only after the model trained on the final (full) training set has
        # been evaluated, so the last step is not dropped from the results.
        if len(bank_pool_active) == 0:
            break

        # Distance of every remaining pool row to the hyperplane.
        hyperplane = pd.Series(best_svc_active.decision_function(bank_pool_active.iloc[:, :-1]), index = bank_pool_active.index)
        w_norm = np.linalg.norm(best_svc_active.coef_)
        # The L1 penalty can zero out every coefficient; dividing by the norm
        # only rescales the ranking, so skip it when the norm is 0.
        distance = hyperplane / w_norm if w_norm > 0 else hyperplane
        closest_idx = distance.abs().nsmallest(10).index

        bank_use_active = pd.concat([bank_use_active, bank_pool_active.loc[closest_idx]])
        bank_pool_active = bank_pool_active.drop(index = closest_idx)

    # Build the per-run frame once; DataFrame.append was removed in pandas 2.0.
    df_active = pd.DataFrame(records_active, columns = ['training_instances', 'test_error', 'c'])
    active.append(df_active)

In [None]:
# Collect the test-error curve of every active-learning run into one frame:
# one column per Monte Carlo run, one row per training-set size step.
# `for a in len(active)` raised TypeError (an int is not iterable) and the loop
# body discarded its value, so `monte` stayed empty.
# NOTE(review): assumes `monte` is meant to hold these per-run errors; confirm
# against the downstream analysis.
monte = pd.DataFrame({a: active[a]['test_error'] for a in range(len(active))})

