In [61]:
from string import punctuation

import numpy as np

# !!! MAKE SURE TO USE SVC.decision_function(X), NOT SVC.predict(X) !!!
# (this makes ``continuous-valued'' predictions)
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

In [2]:
######################################################################
# functions -- input/output
######################################################################

def read_vector_file(fname):
    """
    Reads and returns a vector from a file.
    
    Parameters
    --------------------
        fname  -- string, filename
        
    Returns
    --------------------
        labels -- numpy array of shape (n,)
                    n is the number of non-blank lines in the text file
    """
    return np.genfromtxt(fname)


def write_label_answer(vec, outfile):
    """
    Writes your label vector to the given file.
    
    Parameters
    --------------------
        vec     -- numpy array of shape (n,) or (n,1), predicted scores
        outfile -- string, output filename
    """
    
    # for this project, you should predict 70 labels
    if(vec.shape[0] != 70):
        print("Error - output vector should have 70 rows.")
        print("Aborting write.")
        return
    
    np.savetxt(outfile, vec)    



In [18]:
######################################################################
# functions -- feature extraction
######################################################################

def extract_words(input_string):
    """
    Processes the input_string, separating it into "words" based on the presence
    of spaces, and separating punctuation marks into their own words.
    
    Parameters
    --------------------
        input_string -- string of characters
    
    Returns
    --------------------
        words        -- list of lowercase "words"
    """
    
    for c in punctuation :
        input_string = input_string.replace(c, ' ' + c + ' ')
    return input_string.lower().split()


def extract_dictionary(infile):
    """
    Given a filename, reads the text file and builds a dictionary of unique
    words/punctuations.
    
    Parameters
    --------------------
        infile    -- string, filename
    
    Returns
    --------------------
        word_list -- dictionary, (key, value) pairs are (word, index)
    """
    
    word_list = {}
    with open(infile, 'r') as fid:
        ### ========== TODO : START ========== ###
        # part 1a: process each line to populate word_list
        i = 0
        for line in fid:
            new_words = extract_words(line)
            for words in new_words:
                if words in word_list:
                    continue
                else:
                    word_list[words] = i
                    i += 1
        ### ========== TODO : END ========== ###

    return word_list


In [65]:
def extract_feature_vectors(infile, word_list):
    """
    Produces a bag-of-words representation of a text file specified by the
    filename infile based on the dictionary word_list.
    
    Parameters
    --------------------
        infile         -- string, filename
        word_list      -- dictionary, (key, value) pairs are (word, index)
    
    Returns
    --------------------
        feature_matrix -- numpy array of shape (n,d)
                          boolean (0,1) array indicating word presence in a string
                            n is the number of non-blank lines in the text file
                            d is the number of unique words in the text file
    """
    
    num_lines = sum(1 for line in open(infile,'U'))
    num_words = len(word_list)
    feature_matrix = np.zeros((num_lines, num_words))
    
    with open(infile, 'r') as fid:
        ### ========== TODO : START ========== ###
        # part 1b: process each line to populate feature_matrix
        row = 0
        for line in fid:
            col = 0
            new_words = extract_words(line)
            for words in new_words:
                if words in word_list:
                    feature_matrix[row,word_list[words]] = 1
                else:
                    pass
            row += 1
        ### ========== TODO : END ========== ###
        
    return feature_matrix


In [66]:
######################################################################
# functions -- evaluation
######################################################################

def performance(y_true, y_pred, metric="accuracy"):
    """
    Calculates the performance metric based on the agreement between the 
    true labels and the predicted labels.
    
    Parameters
    --------------------
        y_true -- numpy array of shape (n,), known labels
        y_pred -- numpy array of shape (n,), (continuous-valued) predictions
        metric -- string, option used to select the performance measure
                  options: 'accuracy', 'f1-score', 'auroc', 'precision',
                           'sensitivity', 'specificity'        
    
    Returns
    --------------------
        score  -- float, performance score
    """
    # map continuous-valued predictions to binary labels
    y_label = np.sign(y_pred)
    y_label[y_label==0] = 1
    
    ### ========== TODO : START ========== ###
    # part 2a: compute classifier performance
    if metric == 'accuracy':
        return metrics.accuracy_score(y_true, y_label)
    if metric == 'f1_score':
        return metrics.f1_score(y_true, y_label)
    if metric == 'auroc':
        return metrics.roc_auc_score(y_true, y_label)
    if metric == 'precision':
        return metrics.precision_score(y_true, y_label)
    else:
        tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_label).ravel()
        if metric == 'sensitivity':
            sensitivity = tp/(tp + fn)
            return sensitivity
        if metric == 'specificity':
            specificity = tn/(fp + tn)
            return specificity
    ### ========== TODO : END ========== ###


def cv_performance(clf, X, y, kf, metric="accuracy"):
    """
    Splits the data, X and y, into k-folds and runs k-fold cross-validation.
    Trains classifier on k-1 folds and tests on the remaining fold.
    Calculates the k-fold cross-validation performance metric for classifier
    by averaging the performance across folds.
    
    Parameters
    --------------------
        clf    -- classifier (instance of SVC)
        X      -- numpy array of shape (n,d), feature vectors
                    n = number of examples
                    d = number of features
        y      -- numpy array of shape (n,), binary labels {1,-1}
        kf     -- cross_validation.KFold or cross_validation.StratifiedKFold
        metric -- string, option used to select performance measure
    
    Returns
    --------------------
        score   -- float, average cross-validation performance across k folds
    """
    
    ### ========== TODO : START ========== ###
    # part 2b: compute average cross-validation performance    
    mean = 0
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        y_pred = clf.decision_function(X_test)
        mean += performance(y_test,y_pred,metric)
    
    return mean/5
    ### ========== TODO : END ========== ###


def select_param_linear(X, y, kf, metric="accuracy"):
    """
    Sweeps different settings for the hyperparameter of a linear-kernel SVM,
    calculating the k-fold CV performance for each setting, then selecting the
    hyperparameter that 'maximize' the average k-fold CV performance.
    
    Parameters
    --------------------
        X      -- numpy array of shape (n,d), feature vectors
                    n = number of examples
                    d = number of features
        y      -- numpy array of shape (n,), binary labels {1,-1}
        kf     -- cross_validation.KFold or cross_validation.StratifiedKFold
        metric -- string, option used to select performance measure
    
    Returns
    --------------------
        C -- float, optimal parameter value for linear-kernel SVM
    """
    
    print('Linear SVM Hyperparameter Selection based on ', str(metric), ':')
    max_score = 0
    
    C_range = 10.0 ** np.arange(-3, 3)
    c = 0
    for i in np.nditer(C_range):
        clf = SVC(kernel='linear', C=i)
        score = cv_performance(clf, X, y, kf, metric)
        print('C = ', i, ' score = ', round(score, 4))
        if score > max_score:
            c = i
            max_score = score
    ### ========== TODO : START ========== ###
    # part 2c: select optimal hyperparameter using cross-validation
    return c
    ### ========== TODO : END ========== ###


def select_param_rbf(X, y, kf, metric="accuracy"):
    """
    Sweeps different settings for the hyperparameters of an RBF-kernel SVM,
    calculating the k-fold CV performance for each setting, then selecting the
    hyperparameters that 'maximize' the average k-fold CV performance.
    
    Parameters
    --------------------
        X       -- numpy array of shape (n,d), feature vectors
                     n = number of examples
                     d = number of features
        y       -- numpy array of shape (n,), binary labels {1,-1}
        kf     -- cross_validation.KFold or cross_validation.StratifiedKFold
        metric  -- string, option used to select performance measure
    
    Returns
    --------------------
        gamma, C -- tuple of floats, optimal parameter values for an RBF-kernel SVM
    """
    
    print('RBF SVM Hyperparameter Selection based on ' + str(metric) + ':')
    
    ### ========== TODO : START ========== ###
    # part 3b: create grid, then select optimal hyperparameters using cross-validation
    C_range = 10.0 ** np.arange(-3, 3)
    g_range = 10.0 ** np.arange(-3, 3)
    c = 0
    g = 0
    for i in np.nditer(C_range):
        for j in np.nditer(g_range):
            clf = SVC(kernel='rbf', C=i, gamma=j)
            score = cv_performance(clf, X, y, kf, metric)
            print('C = ', i, ' score = ', round(score, 4))
            if score > max_score:
                c = i
                g = j
                max_score = score
    return c,g
    ### ========== TODO : END ========== ###


def performance_test(clf, X, y, metric="accuracy"):
    """
    Estimates the performance of the classifier using the 95% CI.
    
    Parameters
    --------------------
        clf          -- classifier (instance of SVC)
                          [already fit to data]
        X            -- numpy array of shape (n,d), feature vectors of test set
                          n = number of examples
                          d = number of features
        y            -- numpy array of shape (n,), binary labels {1,-1} of test set
        metric       -- string, option used to select performance measure
    
    Returns
    --------------------
        score        -- float, classifier performance
    """

    ### ========== TODO : START ========== ###
    # part 4b: return performance on test data by first computing predictions and then calling performance

    score = 0        
    return score
    ### ========== TODO : END ========== ###


In [70]:
def main() :
    np.random.seed(1234)
    
    # read the tweets and its labels   
    dictionary = extract_dictionary('./tweets.txt')
    X = extract_feature_vectors('./tweets.txt', dictionary)
    y = read_vector_file('./labels.txt')
    
    metric_list = ["accuracy", "f1_score", "auroc", "precision", "sensitivity", "specificity"]
    
    ### ========== TODO : START ========== ###
    # part 1c: split data into training (training + cross-validation) and testing set
    train_features = X[:560,:]
    train_labels = y[:560]
    test_features = X[560:, :]
    test_labels = y[560:]
    c = [0]*6
    i = 0
    # part 2b: create stratified folds (5-fold CV)
    kf = StratifiedKFold(n_splits=5)
    # part 2d: for each metric, select optimal hyperparameter for linear-kernel SVM using CV
    for metric in metric_list:
        c[i] = select_param_linear(train_features, train_labels, kf, metric)
        i += 1
    # part 3c: for each metric, select optimal hyperparameter for RBF-SVM using CV
    for j in range(len(c)):
        print(c[j])
    # part 4a: train linear- and RBF-kernel SVMs with selected hyperparameters
    
    # part 4c: report performance on test data
    
    ### ========== TODO : END ========== ###
    
    
if __name__ == "__main__" :
    main()




Linear SVM Hyperparameter Selection based on  accuracy :
C =  0.001  score =  0.7089
C =  0.01  score =  0.7107
C =  0.1  score =  0.806
C =  1.0  score =  0.8146
C =  10.0  score =  0.8182
C =  100.0  score =  0.8182
Linear SVM Hyperparameter Selection based on  f1_score :
C =  0.001  score =  0.8297
C =  0.01  score =  0.8306
C =  0.1  score =  0.8755
C =  1.0  score =  0.8749
C =  10.0  score =  0.8766
C =  100.0  score =  0.8766
Linear SVM Hyperparameter Selection based on  auroc :
C =  0.001  score =  0.5
C =  0.01  score =  0.5031
C =  0.1  score =  0.7188
C =  1.0  score =  0.7531
C =  10.0  score =  0.7592
C =  100.0  score =  0.7592
Linear SVM Hyperparameter Selection based on  precision :
C =  0.001  score =  0.7089
C =  0.01  score =  0.7102
C =  0.1  score =  0.8357
C =  1.0  score =  0.8562
C =  10.0  score =  0.8595
C =  100.0  score =  0.8595
Linear SVM Hyperparameter Selection based on  sensitivity :
C =  0.001  score =  1.0
C =  0.01  score =  1.0
C =  0.1  score =  0.

In [28]:
word_list = extract_dictionary('./tweets.txt')
features = extract_feature_vectors('./tweets.txt', word_list)



In [32]:
z = features[:560, :]

In [52]:
z = np.zeros([3,3])

In [53]:
z[2,1] = 1 

NameError: name 'c' is not defined