In [13]:
import numpy as np
import matplotlib.pyplot as plt


In [136]:
# Candidate Bernoulli parameters: 101 evenly spaced grid points from 0 to 1.
theta = np.linspace(0, 1, num=101)
# Observed sample: six successes (1) followed by four failures (0).
x = [1] * 6 + [0] * 4

In [137]:
def likelihood(theta, x, n=None):
    """
    Bernoulli likelihood of the first n observations of x at each candidate theta.

    Parameters
    --------------------
        theta -- sequence of length m, candidate success probabilities
        x     -- sequence of observations (1 = success, 0 = failure)
        n     -- integer, number of leading observations of x to use;
                 defaults to len(x)

    Returns
    --------------------
        l     -- numpy array of shape (m,1) with
                 l[i] = prod over j < n of theta[i]**x[j] * (1-theta[i])**(1-x[j])

    Raises
    --------------------
        IndexError -- if n is larger than len(x)
    """
    if n is None:
        n = len(x)
    elif n > len(x):
        raise IndexError("n (%d) exceeds the number of observations (%d)" % (n, len(x)))

    # A non-positive n uses no observations, which yields the empty product 1.
    used = np.asarray(x[:max(n, 0)], dtype=float)
    # Column vector so that broadcasting against `used` gives an (m, n) table
    # holding one likelihood term per (theta, observation) pair.
    probs = np.asarray(theta, dtype=float).reshape(-1, 1)
    terms = probs ** used * (1 - probs) ** (1 - used)
    # Multiply across observations; keepdims preserves the (m,1) result shape.
    return np.prod(terms, axis=1, keepdims=True)
    

In [138]:
# Likelihood of the whole sample x at every candidate theta; the sample size
# is taken from x itself so it cannot drift out of sync with the data.
l_theta = likelihood(theta, x, len(x))

In [141]:
# Plot the likelihood curve for sample x and save it to p1_1b.jpg.
plt.figure(1)
plt.plot(theta, l_theta)
# Vertical line at the grid value of theta with the largest likelihood.
plt.axvline(x=theta[l_theta.argmax()])
# Raw strings keep the LaTeX backslash without relying on the invalid "\T" escape.
plt.xlabel(r'$\Theta$')
plt.ylabel(r'$L(\Theta)$')
plt.savefig('p1_1b.jpg')

plt.close()

In [186]:
# Three further samples, each listing its successes (1) before its failures (0).
x_d_1 = [1] * 3 + [0] * 2      # 5 observations, 3 successes
x_d_2 = [1] * 60 + [0] * 40    # 100 observations, 60 successes
x_d_3 = [1] * 5 + [0] * 5      # 10 observations, 5 successes

In [187]:
# Likelihood curve of each sample over the theta grid; every sample is used
# in full, so its size is read from the sample instead of being hard-coded.
l_theta_1 = likelihood(theta, x_d_1, len(x_d_1))
l_theta_2 = likelihood(theta, x_d_2, len(x_d_2))
l_theta_3 = likelihood(theta, x_d_3, len(x_d_3))


In [233]:
# Draw the three likelihood curves as panels of one 2x2 figure and save it
# to p1_1d.jpg.
plt.figure(2)

# (subplot position, likelihood curve, panel title) for each sample.
panels = [
    (221, l_theta_1, 'n = 5'),
    (222, l_theta_2, 'n = 100'),
    (223, l_theta_3, 'n = 10'),
]
for position, curve, panel_title in panels:
    plt.subplot(position)
    plt.plot(theta, curve)
    # Raw strings keep the LaTeX backslash without relying on the invalid "\T" escape.
    plt.xlabel(r'$\Theta$')
    plt.ylabel(r'$L(\Theta)$')
    plt.title(panel_title)

plt.savefig('p1_1d.jpg')
plt.close()

In [14]:
# Use only the provided packages!
import math
import csv
from util import *
from collections import Counter

from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn import metrics

In [3]:
######################################################################
# classes
######################################################################

class Classifier(object) :
    """
    Common interface for classifiers: subclasses override fit and predict.
    """

    def fit(self, X, y):
        """Train on samples X with targets y; not implemented in the base class."""
        raise NotImplementedError()

    def predict(self, X):
        """Predict targets for samples X; not implemented in the base class."""
        raise NotImplementedError()


class MajorityVoteClassifier(Classifier) :
    """
    A classifier that always predicts the majority class.

    Attributes
    --------------------
        prediction_ -- majority class found by fit, or None before fit is called
    """

    def __init__(self) :
        self.prediction_ = None

    def fit(self, X, y) :
        """
        Build a majority vote classifier from the training set (X, y).

        Parameters
        --------------------
            X    -- numpy array of shape (n,d), samples (not used)
            y    -- numpy array of shape (n,), target classes

        Returns
        --------------------
            self -- an instance of self

        Raises
        --------------------
            IndexError -- if y is empty (there is no most common class)
        """
        # most_common(1) returns [(class, count)] for the most frequent class.
        self.prediction_ = Counter(y).most_common(1)[0][0]
        return self

    def predict(self, X) :
        """
        Predict class values.

        Parameters
        --------------------
            X    -- numpy array whose first dimension is n, samples

        Returns
        --------------------
            y    -- numpy array of shape (n,), predicted classes

        Raises
        --------------------
            Exception -- if fit has not been called yet
        """
        if self.prediction_ is None :
            raise Exception("Classifier not initialized. Perform a fit first.")

        # Only the number of samples matters; return a numpy array, as
        # documented, rather than a Python list.
        n = X.shape[0]
        return np.repeat(self.prediction_, n)


In [85]:
class RandomClassifier(Classifier) :
    """
    A classifier that predicts according to the distribution of the classes.

    Attributes
    --------------------
        probabilities_ -- class distribution dict (key = class, val = probability of class),
                          or None before fit is called
    """

    def __init__(self) :
        self.probabilities_ = None

    def fit(self, X, y) :
        """
        Build a random classifier from the training set (X, y).

        Parameters
        --------------------
            X    -- numpy array of shape (n,d), samples (not used)
            y    -- numpy array of shape (n,), target classes

        Returns
        --------------------
            self -- an instance of self
        """

        ### ========== TODO : START ========== ###
        # part b: set self.probabilities_ according to the training set

        classes, counts = np.unique(y, return_counts=True)
        # Divide by a float so the frequencies are fractions even where "/"
        # on integers would be integer division.
        self.probabilities_ = dict(zip(classes, counts / float(len(y))))

        ### ========== TODO : END ========== ###

        return self

    def predict(self, X, seed=1234) :
        """
        Predict class values by sampling from the training class distribution.

        Parameters
        --------------------
            X    -- numpy array whose first dimension is n, samples
            seed -- integer, random seed

        Returns
        --------------------
            y    -- numpy array of shape (n,), predicted classes

        Raises
        --------------------
            Exception -- if fit has not been called yet
        """
        if self.probabilities_ is None :
            raise Exception("Classifier not initialized. Perform a fit first.")
        # NOTE: this reseeds numpy's global random state, so repeated calls
        # with the same seed give the same predictions.
        np.random.seed(seed)

        ### ========== TODO : START ========== ###
        # part b: predict the class for each test example
        # hint: use np.random.choice (be careful of the parameters)

        # Sample from whatever classes were seen in fit instead of assuming
        # the labels are exactly 0 and 1; sorting fixes the class order so
        # the draw is reproducible for a given seed.
        classes = sorted(self.probabilities_)
        weights = [self.probabilities_[c] for c in classes]
        y = np.random.choice(classes, size=X.shape[0], replace=True, p=weights)

        ### ========== TODO : END ========== ###

        return y


In [143]:
######################################################################
# functions
######################################################################
def plot_histograms(X, y, Xnames, yname) :
    """
    Plots one histogram per feature, grouped by y, in a single figure and
    saves the figure to 'histograms.jpg'. The figure is left open.

    Parameters
    --------------------
        X      -- numpy array of shape (n,d), feature values
        y      -- numpy array of shape (n,), target classes
        Xnames -- sequence of d strings, names of features
        yname  -- string, name of target
    """
    d = X.shape[1]  # d = number of features

    # Three columns; at least three rows (the original fixed layout), with
    # more rows added when there are more than nine features.
    ncol = 3
    nrow = max(3, (d + ncol - 1) // ncol)

    fig = plt.figure(figsize=(20,15))
    for i in range(d) :
        ax = fig.add_subplot(nrow, ncol, i+1)
        # show=False: only compute the grouped data and binning; drawing is done here.
        data, bins, align, labels = plot_histogram(X[:,i], y, Xname=Xnames[i], yname=yname, show=False)
        ax.hist(data, bins=bins, align=align, alpha=0.5, label=labels)
        ax.set_xlabel(Xnames[i])
        ax.set_ylabel('Frequency')
        ax.legend()

    fig.savefig('histograms.jpg')


def plot_histogram(X, y, Xname, yname, show = True) :
    """
    Groups the values in X by y, chooses histogram bins, and optionally plots.

    Parameters
    --------------------
        X     -- sequence of shape (n,), values of a single feature
        y     -- sequence of shape (n,), target classes
        Xname -- string, name of feature
        yname -- string, name of target
        show  -- boolean, draw and show the histogram in a new figure if true

    Returns
    --------------------
        data   -- list with one list of feature values per target class
        bins   -- list of integer bin edges, or 10 (number of bins)
        align  -- 'left' when integer bin edges are used, otherwise 'mid'
        labels -- list of legend labels of the form '<yname> = <class>'
    """

    # set up data for plotting
    targets = sorted(set(y))
    data = []; labels = []
    for target in targets :
        features = [X[i] for i in range(0, len(y)) if y[i] == target]
        data.append(features)
        labels.append('%s = %s' % (yname, target))

    # set up histogram bins
    features = set(X)
    nfeatures = len(features)
    # Materialize the range as a list: on Python 3 a range object never
    # compares equal to a list and cannot be concatenated with one, which
    # silently disabled the integer-bin branch below.
    test_range = list(range(int(math.floor(min(features))), int(math.ceil(max(features)))+1))
    if nfeatures < 10 and sorted(features) == test_range:
        # few consecutive integer values: one bin per value
        bins = test_range + [test_range[-1] + 1] # add last bin
        align = 'left'
    else :
        bins = 10
        align = 'mid'

    # plot
    if show :
        plt.figure()
        plt.hist(data, bins=bins, align=align, alpha=0.5, label=labels)
        plt.xlabel(Xname)
        plt.ylabel('Frequency')
        plt.legend() #plt.legend(loc='upper left')
        plt.show()

    return data, bins, align, labels

In [126]:
def error(clf, X, y, ntrials=100, test_size=0.2) :
    """
    Computes the classifier error over a random split of the data,
    averaged over ntrials runs.

    Parameters
    --------------------
        clf         -- classifier
        X           -- numpy array of shape (n,d), features values
        y           -- numpy array of shape (n,), target classes
        ntrials     -- integer, number of trials (must be at least 1)
        test_size   -- passed to train_test_split as test_size for each split

    Returns
    --------------------
        train_error -- float, training error averaged over the trials
        test_error  -- float, test error averaged over the trials

    Raises
    --------------------
        ValueError  -- if ntrials is less than 1
    """
    if ntrials < 1 :
        raise ValueError("ntrials must be at least 1")

    ### ========== TODO : START ========== ###
    # compute cross-validation error over ntrials
    # hint: use train_test_split (be careful of the parameters)
    train_error = 0.0
    test_error = 0.0
    # Honor ntrials instead of a hard-coded 100; the trial index doubles as
    # the split's random_state so every call sees the same sequence of splits.
    for trial in range(ntrials) :
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=trial)
        clf.fit(X_train, y_train)
        y_pred_train = clf.predict(X_train)
        train_error += 1 - metrics.accuracy_score(y_train, y_pred_train, normalize=True)
        y_pred_test = clf.predict(X_test)
        test_error += 1 - metrics.accuracy_score(y_test, y_pred_test, normalize=True)

    train_error = train_error / ntrials
    test_error = test_error / ntrials

    ### ========== TODO : END ========== ###

    return train_error, test_error


In [88]:
def write_predictions(y_pred, filename, yname=None) :
    """
    Write out predictions to csv file, one prediction per row.

    Parameters
    --------------------
        y_pred   -- iterable of predicted values
        filename -- string, path of the csv file to create or overwrite
        yname    -- string, optional header written as the first row
    """
    # csv.writer produces text and its own line terminators, so on Python 3
    # the file must be opened in text mode with newline='' (binary mode
    # raises TypeError). The with-block closes the file even if a write fails.
    with open(filename, 'w', newline='') as out :
        writer = csv.writer(out)
        if yname :
            writer.writerow([yname])
        # one single-column row per prediction
        writer.writerows([value] for value in y_pred)

In [144]:
######################################################################
# main
######################################################################

def _report_training_error(label, clf, X, y) :
    """Fit clf on (X, y), print its training error under label, and return clf."""
    print('Classifying using %s...' % label)
    clf.fit(X, y)                  # fit training data using the classifier
    y_pred = clf.predict(X)        # take the classifier and run it on the training data
    train_error = 1 - metrics.accuracy_score(y, y_pred, normalize=True)
    print('\t-- training error: %.3f' % train_error)
    return clf


def _plot_error_curves(x_values, curves, xlabel, filename) :
    """Plot (values, format, label) curves on a 0-1 error axis and save to filename."""
    plt.figure()
    for values, fmt, label in curves :
        plt.plot(x_values, values, fmt, label=label)
    plt.xlabel(xlabel)
    plt.ylabel('Error')
    plt.legend(loc=2)
    # keep the automatic x limits but pin the error axis to [0, 1]
    x1, x2, y1, y2 = plt.axis()
    plt.axis((x1, x2, 0, 1.0))
    plt.savefig(filename)
    plt.close()


def _compare_classifiers(X, y) :
    """Part d: print the average train/test error of each classifier."""
    print('Investigating various classifiers...')
    classifiers = [
        ('Majority Vote', MajorityVoteClassifier()),
        ('Random', RandomClassifier()),
        ('Decision Tree', DecisionTreeClassifier(criterion='entropy')),
    ]
    for label, clf in classifiers :
        train_error, test_error = error(clf, X, y)
        print('%s Classifier Train Error: %.3f' % (label, train_error))
        print('%s Classifier Test Error: %.3f' % (label, test_error))


def _investigate_depths(X, y, max_depth=20) :
    """Part e: plot decision tree error for depths 1..max_depth; saves 'p1_4e.jpg'."""
    print('Investigating depths...')

    # baseline test errors, drawn as horizontal reference lines
    _, test_error_maj = error(MajorityVoteClassifier(), X, y)
    _, test_error_rand = error(RandomClassifier(), X, y)

    depths = list(range(1, max_depth + 1))
    train_errors = []
    test_errors = []
    for depth in depths :
        clf_dec = DecisionTreeClassifier(criterion='entropy', max_depth=depth)
        train_error, test_error = error(clf_dec, X, y)
        train_errors.append(train_error)
        test_errors.append(test_error)

    curves = [
        (train_errors, 'r', 'Decision Tree Training Error'),
        (test_errors, '-', 'Decision Tree Testing Error'),
        ([test_error_maj] * len(depths), 'b', 'Majority Vote Classifier Test Error'),
        ([test_error_rand] * len(depths), 'g', 'Random Classifier Test Error'),
    ]
    _plot_error_curves(depths, curves, 'Depth Level', 'p1_4e.jpg')


def _investigate_training_sizes(X, y) :
    """Part f: plot error against training fraction 0.05..0.95; saves 'p1_4f.jpg'."""
    print('Investigating training set sizes...')

    train_fractions = [percent / 100.0 for percent in range(5, 100, 5)]
    train_error_dec = []
    test_error_dec = []
    test_error_maj = []
    test_error_rand = []
    for fraction in train_fractions :
        # error() takes the held-out share, i.e. the complement of the training share
        size = 1 - fraction
        clf_maj = MajorityVoteClassifier()
        clf_rand = RandomClassifier()
        clf_dec = DecisionTreeClassifier(criterion='entropy', max_depth=3)
        test_error_maj.append(error(clf_maj, X, y, test_size=size)[1])
        test_error_rand.append(error(clf_rand, X, y, test_size=size)[1])
        train_error, test_error = error(clf_dec, X, y, test_size=size)
        train_error_dec.append(train_error)
        test_error_dec.append(test_error)

    curves = [
        (train_error_dec, 'r', 'Decision Tree Training Error'),
        (test_error_dec, '-', 'Decision Tree Testing Error'),
        (test_error_maj, 'b', 'Majority Vote Classifier Test Error'),
        (test_error_rand, 'g', 'Random Classifier Test Error'),
    ]
    _plot_error_curves(train_fractions, curves, 'Training Size', 'p1_4f.jpg')


def main(parts=('a',)) :
    """
    Runs the requested parts of the Titanic analysis on 'titanic_train.csv'.

    Parameters
    --------------------
        parts -- iterable of part letters to run, any of:
                 'a' histograms of each feature (saves 'histograms.jpg')
                 'b' training error of the Majority Vote and Random classifiers
                 'c' training error of the Decision Tree classifier
                 'd' average train/test error of all three classifiers
                 'e' decision tree error versus depth (saves 'p1_4e.jpg')
                 'f' error versus training set size (saves 'p1_4f.jpg')
                 The default runs only part 'a', which is what this function
                 did while the other parts were disabled.

    Raises
    --------------------
        ValueError -- if parts contains anything other than the letters above
    """
    requested = set(parts)
    unknown = requested - set('abcdef')
    if unknown :
        raise ValueError('unknown part(s): %s' % ', '.join(sorted(str(p) for p in unknown)))

    # load Titanic dataset
    titanic = load_data("titanic_train.csv", header=1, predict_col=0)
    X = titanic.X; Xnames = titanic.Xnames
    y = titanic.y; yname = titanic.yname

    # part a: plot histograms of each feature
    if 'a' in requested :
        plot_histograms(X, y, Xnames, yname=yname)

    # part b: evaluate training error of the Majority Vote and Random classifiers
    if 'b' in requested :
        _report_training_error('Majority Vote', MajorityVoteClassifier(), X, y)
        _report_training_error('Random', RandomClassifier(), X, y)

    # part c: evaluate training error of Decision Tree classifier
    # use criterion of "entropy" for Information gain
    if 'c' in requested :
        clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
        _report_training_error('Decision Tree', clf, X, y)

        # note: uncomment out the following lines to output the Decision Tree graph
        # (original scaffold; requires GraphViz and pydot and uses the Python 2
        # StringIO module)
        # import StringIO, pydot
        # from sklearn import tree
        # dot_data = StringIO.StringIO()
        # tree.export_graphviz(clf, out_file=dot_data,
        #                      feature_names=Xnames)
        # graph = pydot.graph_from_dot_data(dot_data.getvalue())
        # graph.write_pdf("dtree.pdf")

    # part d: use cross-validation to compute average training and test error of classifiers
    if 'd' in requested :
        _compare_classifiers(X, y)

    # part e: investigate decision tree classifier with various depths
    if 'e' in requested :
        _investigate_depths(X, y)

    # part f: investigate decision tree classifier with various training set sizes
    if 'f' in requested :
        _investigate_training_sizes(X, y)

# Run the analysis only when this code is executed as the main program
# (not when it is imported as a module).
if __name__ == "__main__":
    main()
