In [20]:
import csv
import numpy as np
import matplotlib.pyplot as plt
from string import punctuation, digits
import numpy as np
import random


# Classification

In [None]:
    """
    A classification function that uses theta and theta_0 to classify a set of
    data points.

    Args:
        feature_matrix - A numpy matrix describing the given data. Each row
            represents a single data point.
                theta - A numpy array describing the linear classifier.
        theta - A numpy array describing the linear classifier.
        theta_0 - A real valued number representing the offset parameter.

    Returns: A numpy array of 1s and -1s where the kth element of the array is
    the predicted classification of the kth row of the feature matrix using the
    given theta and theta_0. If a prediction is GREATER THAN zero, it should
    be considered a positive classification.
    ""

In [6]:
def classify(feature_matrix, theta, theta_0):
    
    X=feature_matrix
    w=theta
    w0=theta_0
    pred = []
        
    for i in range(len(feature_matrix)):
        
        if (np.dot(X[i], w)+w0)>0:
            pred.append(1)
        else:
            pred.append(-1)       
    return np.array(pred)    


In [4]:
def classify(feature_matrix, theta, theta_0):
    (nsamples, nfeatures) = feature_matrix.shape
    predictions = np.zeros(nsamples)
    for i in range(nsamples):
        feature_vector = feature_matrix[i]
        prediction = np.dot(theta, feature_vector) + theta_0
        if (prediction > 0):
            predictions[i] = 1
        else:
            predictions[i] = -1
    return predictions


In [3]:
def classify(feature_matrix, theta, theta_0):
    
    predictions=[]
    for i in range(len(feature_matrix)):
        
       if(np.dot(theta,feature_matrix[i].transpose())+theta_0)>0:
           
           predictions.append(1)
       else:
           predictions.append(-1)
           
    return np.array(predictions)

# Accuracy

In [8]:
"""
Given length-N vectors containing predicted and target labels,
returns the percentage and number of correct predictions.
"""

'\nGiven length-N vectors containing predicted and target labels,\nreturns the percentage and number of correct predictions.\n'

In [9]:
def accuracy(preds, targets):
    return (preds == targets).mean()
    
    

In [None]:
    """
    Trains a linear classifier and computes accuracy.
    The classifier is trained on the train data. The classifier's
    accuracy on the train and validation data is then returned.

    Args:
        classifier - A classifier function that takes arguments
            (feature matrix, labels, **kwargs) and returns (theta, theta_0)
        train_feature_matrix - A numpy matrix describing the training
            data. Each row represents a single data point.
        val_feature_matrix - A numpy matrix describing the validation
            data. Each row represents a single data point.
        train_labels - A numpy array where the kth element of the array
            is the correct classification of the kth row of the training
            feature matrix.
        val_labels - A numpy array where the kth element of the array
            is the correct classification of the kth row of the validation
            feature matrix.
        **kwargs - Additional named arguments to pass to the classifier
            (e.g. T or L)

    Returns: A tuple in which the first element is the (scalar) accuracy of the
    trained classifier on the training data and the second element is the
    accuracy of the trained classifier on the validation data.
    """

In [13]:
def perceptron_single_step_update(
        feature_vector,
        label,
        current_theta,
        current_theta_0):
    X=feature_vector
    Y=label
    w=current_theta
    w0=current_theta_0


    if (np.dot(X, w)+w0)*Y <= 0:
                w = w + X*Y
                w0 = w0 + Y
              

    return (w,w0)

In [14]:
def classifier(feature_matrix, labels, **kwargs):

    
    # Your code here
   
    X=feature_matrix
    Y=labels
    w = np.zeros(len(X[0]))
    
    #w = np.transpose([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
    w0 = 0
    for t in range(**kwargs):
        for i in get_order(feature_matrix.shape[0]):
             w,w0=perceptron_single_step_update(X[i,:], Y[i], w, w0)
    return (w,w0)

In [16]:
def classifier_accuracy(
        classifier,
        train_feature_matrix,
        val_feature_matrix,
        train_labels,
        val_labels,
        **kwargs):
    
    X_train=train_feature_matrix
    X_test=val_feature_matrix
    y_train=train_labels
    y_test=val_labels
    
    w, w0 = classifier(X_train, y_train, **kwargs)
    pred_train=classify(X_train,w,w0)
    pred_val = classify(X_test, w, w0)
    
    acc_train=accuracy(pred_train,y_train)
    acc_test=accuracy(pred_val,y_test)
    
    return (acc_train,acc_test)

In [None]:
#ej1
#https://www.kaggle.com/code/jvdahemad/building-linear-classifier-models-from-scratch/notebook
def classifier_accuracy(classifier,train_feature_matrix,val_feature_matrix,train_labels,val_labels,**kwargs):

    thetas=classifier(train_feature_matrix,train_labels,**kwargs)
        
    predictions_train=classify(train_feature_matrix,thetas[0],thetas[1])
    predictions_val=classify(val_feature_matrix,thetas[0],thetas[1])
    
    train_acc=accuracy(predictions_train,train_labels)
    val_acc=accuracy(predictions_val,val_labels)
    
    return (train_acc,val_acc)

In [None]:
#ej2
#https://github.com/mmbajo/Machine-Learning-Perceptrons/blob/master/project1.py
def classifier_accuracy(classifier,train_feature_matrix,val_feature_matrix,train_labels,val_labels,**kwargs):

    theta, theta_0 = classifier(train_feature_matrix, train_labels, **kwargs)
    train_pred = classify(train_feature_matrix, theta, theta_0)
    train_accuracy = accuracy(train_pred, train_labels)
    val_pred = classify(val_feature_matrix, theta, theta_0)
    val_accuracy = accuracy(val_pred, val_labels)
    return (train_accuracy, val_accuracy)
    #raise NotImplementedError
#pragma: coderesponse end

# Baseline Accuracy


In [29]:
from string import punctuation, digits
import numpy as np
import random

def get_order(n_samples):
    try:
        with open(str(n_samples) + '.txt') as fp:
            line = fp.readline()
            return list(map(int, line.split(',')))
    except FileNotFoundError:
        random.seed(1)
        indices = list(range(n_samples))
        random.shuffle(indices)
        return indices


# def hinge_loss_single(feature_vector, label, theta, theta_0):

#     z = label*(np.sum(np.multiply(feature_vector,theta))+theta_0)
#     ret = max(0,1-z) 
#     return ret
#     raise NotImplementedError


# def hinge_loss_full(feature_matrix, labels, theta, theta_0):
#     return np.maximum(0, 1 - labels*(np.sum(feature_matrix * theta, axis = 1) + theta_0)).mean()


def perceptron_single_step_update(
        feature_vector,
        label,
        current_theta,
        current_theta_0):
    X=feature_vector
    Y=label
    w=current_theta
    w0=current_theta_0

    if (np.dot(X, w)+w0)*Y <= 0:
        w = w + X*Y
        w0 = w0 + Y              
    return (w,w0)



def perceptron(feature_matrix, labels, T):

    
    # Your code here
    X=feature_matrix
    Y=labels
    w = np.zeros(len(X[0]))
    w0 = 0
    for t in range(T):
        for i in get_order(feature_matrix.shape[0]):
             w,w0=perceptron_single_step_update(X[i,:], Y[i], w, w0)
    return (w,w0)

#             pass
#     raise NotImplementedError


def average_perceptron(feature_matrix, labels, T):

    X=feature_matrix
    Y=labels
    w = np.zeros(len(X[0]))
    w0 = 0
    wnew= np.zeros(len(X[0]))
    w0new=0
    counter=0
    for t in range(T):
        for i in get_order(feature_matrix.shape[0]):
            w,w0=perceptron_single_step_update(X[i,:], Y[i], w, w0)
            wnew= wnew + w
            w0new= w0new + w0
            counter= counter + 1           
    return (wnew/counter,w0new/counter)


def pegasos_single_step_update(
        feature_vector,
        label,
        L,
        eta,
        current_theta,
        current_theta_0):

    X=feature_vector
    Y=label
    w=current_theta
    w0=current_theta_0


    if (np.dot(X, w)+w0)*Y <= 1:
        w = w*(1-L*eta) + X*Y*eta
        w0 = w0 + Y*eta
    else:
        w = w*(1-L*eta)
        w0 = w0
    return (w,w0)


def pegasos(feature_matrix, labels, T, L):
    X=feature_matrix
    Y=labels
    w = np.zeros(len(X[0]))
    w0 = 0
    counter= 0
    (nvectors, dimensions) = feature_matrix.shape
    for t in range(T):

        for i in get_order(nvectors):        
            counter=counter+1
            eta = 1/np.sqrt(counter)
            w,w0=pegasos_single_step_update(X[i,:], Y[i],L, eta, w, w0)

    return (w,w0)    



In [39]:

def bag_of_words(texts):
    """
    Inputs a list of string reviews
    Returns a dictionary of unique unigrams occurring over the input

    Feel free to change this code as guided by Problem 9
    """
    # Your code here
    dictionary = {} # maps word to unique index
    for text in texts:
        word_list = extract_words(text)
        for word in word_list:
            if word not in dictionary:
                dictionary[word] = len(dictionary)
    return dictionary


def extract_bow_feature_vectors(reviews, dictionary):
    """
    Inputs a list of string reviews
    Inputs the dictionary of words as given by bag_of_words
    Returns the bag-of-words feature matrix representation of the data.
    The returned matrix is of shape (n, m), where n is the number of reviews
    and m the total number of entries in the dictionary.

    Feel free to change this code as guided by Problem 9
    """
    # Your code here

    num_reviews = len(reviews)
    feature_matrix = np.zeros([num_reviews, len(dictionary)])

    for i, text in enumerate(reviews):
        word_list = extract_words(text)
        for word in word_list:
            if word in dictionary:
                feature_matrix[i, dictionary[word]] = 1
    return feature_matrix



In [45]:
def load_data(path_data, extras=False):
    """
    Returns a list of dict with keys:
    * sentiment: +1 or -1 if the review was positive or negative, respectively
    * text: the text of the review

    Additionally, if the `extras` argument is True, each dict will also include the
    following information:
    * productId: a string that uniquely identifies each product
    * userId: a string that uniquely identifies each user
    * summary: the title of the review
    * helpfulY: the number of users who thought this review was helpful
    * helpfulN: the number of users who thought this review was NOT helpful
    """

    global PYTHON3

    basic_fields = {'sentiment', 'text'}
    numeric_fields = {'sentiment', 'helpfulY', 'helpfulN'}

    data = []
    if PYTHON3:
        f_data = open(path_data, encoding="latin1")
    else:
        f_data = open(path_data)

    for datum in csv.DictReader(f_data, delimiter='\t'):
        for field in list(datum.keys()):
            if not extras and field not in basic_fields:
                del datum[field]
            elif field in numeric_fields and datum[field]:
                datum[field] = int(datum[field])

        data.append(datum)

    f_data.close()

    return data

In [48]:
# import utils
# import numpy as np
# import project1 as p1

#-------------------------------------------------------------------------------
# Data loading. There is no need to edit code in this section.
#-------------------------------------------------------------------------------

# train_data = utils.load_data('reviews_train.tsv')
# val_data = utils.load_data('reviews_val.tsv')
# test_data = utils.load_data('reviews_test.tsv')

train_data = load_data('reviews_train.tsv')
val_data = load_data('reviews_val.tsv')
test_data = load_data('reviews_test.tsv')

train_texts, train_labels = zip(*((sample['text'], sample['sentiment']) for sample in train_data))
val_texts, val_labels = zip(*((sample['text'], sample['sentiment']) for sample in val_data))
test_texts, test_labels = zip(*((sample['text'], sample['sentiment']) for sample in test_data))

dictionary = bag_of_words(train_texts)

train_bow_features = extract_bow_feature_vectors(train_texts, dictionary)
val_bow_features = extract_bow_feature_vectors(val_texts, dictionary)
test_bow_features = extract_bow_feature_vectors(test_texts, dictionary)

#-------------------------------------------------------------------------------
# Problem 5
#-------------------------------------------------------------------------------

# toy_features, toy_labels = toy_data = utils.load_toy_data('toy_data.tsv')

# T = 10
# L = 0.2

# thetas_perceptron = p1.perceptron(toy_features, toy_labels, T)
# thetas_avg_perceptron = p1.average_perceptron(toy_features, toy_labels, T)
# thetas_pegasos = p1.pegasos(toy_features, toy_labels, T, L)

# def plot_toy_results(algo_name, thetas):
#     print('theta for', algo_name, 'is', ', '.join(map(str,list(thetas[0]))))
#     print('theta_0 for', algo_name, 'is', str(thetas[1]))
#     utils.plot_toy_data(algo_name, toy_features, toy_labels, thetas)

# plot_toy_results('Perceptron', thetas_perceptron)
# plot_toy_results('Average Perceptron', thetas_avg_perceptron)
# plot_toy_results('Pegasos', thetas_pegasos)

#-------------------------------------------------------------------------------
# Problem 7
#-------------------------------------------------------------------------------

T = 10
L = 0.01

pct_train_accuracy, pct_val_accuracy = \
   classifier_accuracy(perceptron, train_bow_features,val_bow_features,train_labels,val_labels,T=T)
print("{:35} {:.4f}".format("Training accuracy for perceptron:", pct_train_accuracy))
print("{:35} {:.4f}".format("Validation accuracy for perceptron:", pct_val_accuracy))

# avg_pct_train_accuracy, avg_pct_val_accuracy = \
#    classifier_accuracy(average_perceptron, train_bow_features,val_bow_features,train_labels,val_labels,T=T)
# print("{:43} {:.4f}".format("Training accuracy for average perceptron:", avg_pct_train_accuracy))
# print("{:43} {:.4f}".format("Validation accuracy for average perceptron:", avg_pct_val_accuracy))

# avg_peg_train_accuracy, avg_peg_val_accuracy = \
#    classifier_accuracy(pegasos, train_bow_features,val_bow_features,train_labels,val_labels,T=T,L=L)
# print("{:50} {:.4f}".format("Training accuracy for Pegasos:", avg_peg_train_accuracy))
# print("{:50} {:.4f}".format("Validation accuracy for Pegasos:", avg_peg_val_accuracy))

#-------------------------------------------------------------------------------
# Problem 8
#-------------------------------------------------------------------------------

# data = (train_bow_features, train_labels, val_bow_features, val_labels)
#
# # values of T and lambda to try
# Ts = [1, 5, 10, 15, 25, 50]
# Ls = [0.001, 0.01, 0.1, 1, 10]
#
# pct_tune_results = utils.tune_perceptron(Ts, *data)
# print('perceptron valid:', list(zip(Ts, pct_tune_results[1])))
# print('best = {:.4f}, T={:.4f}'.format(np.max(pct_tune_results[1]), Ts[np.argmax(pct_tune_results[1])]))
#
# avg_pct_tune_results = utils.tune_avg_perceptron(Ts, *data)
# print('avg perceptron valid:', list(zip(Ts, avg_pct_tune_results[1])))
# print('best = {:.4f}, T={:.4f}'.format(np.max(avg_pct_tune_results[1]), Ts[np.argmax(avg_pct_tune_results[1])]))
#
# # fix values for L and T while tuning Pegasos T and L, respective
# fix_L = 0.01
# peg_tune_results_T = utils.tune_pegasos_T(fix_L, Ts, *data)
# print('Pegasos valid: tune T', list(zip(Ts, peg_tune_results_T[1])))
# print('best = {:.4f}, T={:.4f}'.format(np.max(peg_tune_results_T[1]), Ts[np.argmax(peg_tune_results_T[1])]))
#
# fix_T = Ts[np.argmax(peg_tune_results_T[1])]
# peg_tune_results_L = utils.tune_pegasos_L(fix_T, Ls, *data)
# print('Pegasos valid: tune L', list(zip(Ls, peg_tune_results_L[1])))
# print('best = {:.4f}, L={:.4f}'.format(np.max(peg_tune_results_L[1]), Ls[np.argmax(peg_tune_results_L[1])]))
#
# utils.plot_tune_results('Perceptron', 'T', Ts, *pct_tune_results)
# utils.plot_tune_results('Avg Perceptron', 'T', Ts, *avg_pct_tune_results)
# utils.plot_tune_results('Pegasos', 'T', Ts, *peg_tune_results_T)
# utils.plot_tune_results('Pegasos', 'L', Ls, *peg_tune_results_L)

#-------------------------------------------------------------------------------
# Use the best method (perceptron, average perceptron or Pegasos) along with
# the optimal hyperparameters according to validation accuracies to test
# against the test dataset. The test data has been provided as
# test_bow_features and test_labels.
#-------------------------------------------------------------------------------

# Your code here

#-------------------------------------------------------------------------------
# Assign to best_theta, the weights (and not the bias!) learned by your most
# accurate algorithm with the optimal choice of hyperparameters.
#-------------------------------------------------------------------------------

# best_theta = None # Your code here
# wordlist   = [word for (idx, word) in sorted(zip(dictionary.values(), dictionary.keys()))]
# sorted_word_features = utils.most_explanatory_word(best_theta, wordlist)
# print("Most Explanatory Word Features")
# print(sorted_word_features[:10])


NameError: name 'PYTHON3' is not defined