In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm, linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier

# Load the airline-sentiment tweet table from the local dataset directory.
TWEETS_CSV = './twitter-airline-sentiment/Tweets.csv'
data = pd.read_csv(TWEETS_CSV)

In [None]:
# Encode the sentiment label column as integer class ids.
# After fitting, le.classes_ holds the original labels, indexed by encoded id.
y_raw = data['airline_sentiment']
le = preprocessing.LabelEncoder()
y = le.fit_transform(y_raw)

In [None]:
# Raw tweet text column; it is vectorized into features in the cells below.
tweet = data['text']

In [None]:
# get word counts
# Bag-of-words features. The vocabulary is learned from every tweet, because
# this cell runs before the train/validation split below.
count_vect = CountVectorizer()
X = count_vect.fit_transform(tweet)

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin, clone

class OneVsAllClassifier(BaseEstimator, ClassifierMixin):
    """
    One-vs-all classifier.

    One copy of a binary base estimator is trained per class (class i versus
    everything else); prediction picks the class whose estimator returns the
    highest score.

    We assume that the classes will be the integers 0,..,(n_classes-1).
    We assume that the estimator provided to the class, after fitting, has a
    "decision_function" that returns the score for the positive class.
    """
    def __init__(self, estimator, n_classes):
        """
        Constructed with the number of classes and an estimator (e.g. an
        SVM estimator from sklearn)
        @param estimator : binary base classifier used
        @param n_classes : number of classes
        """
        # Keep the constructor argument as an attribute: BaseEstimator.get_params
        # (used by repr() and clone()) looks up every __init__ parameter by name.
        self.estimator = estimator
        self.n_classes = n_classes
        # One independent, unfitted copy of the base estimator per class.
        self.estimators = [clone(estimator) for _ in range(n_classes)]
        self.fitted = False

    def fit(self, X, y=None):
        """
        Fit one binary classifier per class.
        self.estimators[i] is fit on class i (label +1) vs. the rest (label -1).
        @param X: array-like, shape = [n_samples,n_features], input data
        @param y: array-like, shape = [n_samples,] class labels
        @return returns self
        @raise ValueError: if y is not provided
        """
        if y is None:
            raise ValueError("y is required to fit OneVsAllClassifier.")
        y = np.asarray(y)

        for i in range(self.n_classes):
            # +1 for the class of interest, -1 for all others, so the base
            # estimator's positive class is always class i.
            y_bi = np.where(y == i, 1, -1)
            self.estimators[i].fit(X, y_bi)
        self.fitted = True
        return self

    def decision_function(self, X):
        """
        Returns the score of each input for each class. Assumes
        that the given estimator also implements the decision_function method (which sklearn SVMs do),
        and that fit has been called.
        @param X : array-like, shape = [n_samples, n_features] input data
        @return array-like, shape = [n_samples, n_classes]
        @raise RuntimeError: if fit has not been called
        @raise AttributeError: if the base estimator has no decision_function
        """
        if not self.fitted:
            raise RuntimeError("You must train classifer before predicting data.")

        if not hasattr(self.estimators[0], "decision_function"):
            raise AttributeError(
                "Base estimator doesn't have a decision_function attribute.")

        # Column i holds the class-i-vs-rest score of every sample.
        score = np.zeros([X.shape[0], self.n_classes])
        for i in range(self.n_classes):
            score[:, i] = self.estimators[i].decision_function(X)
        return score

    def predict(self, X):
        """
        Predict the class with the highest score.
        @param X: array-like, shape = [n_samples,n_features] input data
        @returns array-like, shape = [n_samples,] the predicted classes for each input
                 (integer class indices)
        """
        score = self.decision_function(X)
        # Row-wise argmax; ties resolve to the lowest class index.
        return np.argmax(score, axis=1)

 

In [None]:
#Here we test the OneVsAllClassifier
from sklearn import metrics

svm_estimator = svm.LinearSVC(loss='hinge', fit_intercept=False, C=0.15)
clf_onevsall = OneVsAllClassifier(svm_estimator, n_classes=3)
clf_onevsall.fit(X_train,y_train)

for i in range(3):
    print("Coeffs %d"%i)
    print(clf_onevsall.estimators[i].coef_) #Will fail if you haven't implemented fit yet

# Predict the validation set once (not once per sample) and reuse the result
# for both the accuracy and the confusion matrix.
y_pred = clf_onevsall.predict(X_valid)
loss = (y_valid != np.asarray(y_pred).astype(int)).astype(int)  # 0/1 loss per sample
print(1-np.sum(loss)/len(y_valid))  # validation accuracy

metrics.confusion_matrix(y_valid, y_pred)

In [None]:
# Linear-kernel SVC on the word-count features.
clf = svm.SVC(C=100,kernel='linear')
clf.fit(X_train,y_train)

# Predict the validation set once (not once per sample) and reuse the result.
y_pred = clf.predict(X_valid)
loss = (y_valid != np.asarray(y_pred).astype(int)).astype(int)  # 0/1 loss per sample
print(1-np.sum(loss)/len(y_valid))  # validation accuracy

metrics.confusion_matrix(y_valid, y_pred)

In [None]:
# One-vs-one LinearSVC on the word-count features.
ovo = OneVsOneClassifier(svm.LinearSVC(C=0.25))
ovo.fit(X_train, y_train)

# Predict the validation set once (not once per sample) and reuse the result.
y_pred = ovo.predict(X_valid)
loss = (y_valid != np.asarray(y_pred).astype(int)).astype(int)  # 0/1 loss per sample
print(1-np.sum(loss)/len(y_valid))  # validation accuracy

metrics.confusion_matrix(y_valid, y_pred)

In [None]:
# gradient boosting
clf_gbm = GradientBoostingClassifier()
clf_gbm.fit(X_train,y_train)

# Densify and predict the validation set once; doing this inside a per-sample
# loop would rebuild the dense matrix and re-predict for every sample.
y_pred = clf_gbm.predict(X_valid.toarray())
loss = (y_valid != np.asarray(y_pred).astype(int)).astype(int)  # 0/1 loss per sample
print(1-np.sum(loss)/len(y_valid))  # validation accuracy

metrics.confusion_matrix(y_valid, y_pred)

In [None]:
# random forest
clf_rfc = RandomForestClassifier(n_estimators=50,criterion='entropy')
clf_rfc = clf_rfc.fit(X_train,y_train)

# Predict the validation set once (not once per sample) and reuse the result.
y_pred = clf_rfc.predict(X_valid)
loss = (y_valid != np.asarray(y_pred).astype(int)).astype(int)  # 0/1 loss per sample
print(1-np.sum(loss)/len(y_valid))  # validation accuracy

metrics.confusion_matrix(y_valid, y_pred)

In [None]:
# Logistic regression on the word-count features.
clf=linear_model.LogisticRegression(C=0.4)
clf.fit(X_train,y_train)

# Predict the validation set once (not once per sample) and reuse the result.
y_pred = clf.predict(X_valid)
loss = (y_valid != np.asarray(y_pred).astype(int)).astype(int)  # 0/1 loss per sample
print(np.sum(loss))  # number of misclassified validation samples

metrics.confusion_matrix(y_valid, y_pred)

In [None]:
# Split the raw tweet text with the same test_size and random_state as the
# earlier split of X, so the TF-IDF vectorizer below can be fit on training text only.
# NOTE(review): presumably this yields the same row partition as X_train/X_valid — confirm.
tweet_train, tweet_valid, ty_train, ty_valid = train_test_split(tweet, y, test_size=0.2, random_state=42)

In [None]:
# Sanity check: display the shape of the validation label array.
ty_valid.shape

In [None]:
# TF-IDF features: the vectorizer is fit on the training tweets only, and the
# fitted transform is then applied unchanged to the validation tweets.
vectorizer = TfidfVectorizer(min_df=5,max_df = 0.8,sublinear_tf=True,use_idf=True)
tX_train = vectorizer.fit_transform(tweet_train)
tX_valid = vectorizer.transform(tweet_valid)

In [None]:
# One-vs-all LinearSVC on the TF-IDF features.
svm_estimator = svm.LinearSVC(loss='hinge', fit_intercept=False, C=0.85)
clf_onevsall = OneVsAllClassifier(svm_estimator, n_classes=3)
clf_onevsall.fit(tX_train,ty_train)

for i in range(3):
    print("Coeffs %d"%i)
    print(clf_onevsall.estimators[i].coef_) #Will fail if you haven't implemented fit yet

# Predict the validation set once (not once per sample) and reuse the result.
ty_pred = clf_onevsall.predict(tX_valid)
loss = (ty_valid != np.asarray(ty_pred).astype(int)).astype(int)  # 0/1 loss per sample
print(1-np.sum(loss)/len(ty_valid))  # validation accuracy

metrics.confusion_matrix(ty_valid, ty_pred)

In [None]:
# Linear-kernel SVC on the TF-IDF features.
clf = svm.SVC(C=100,kernel='linear')
clf.fit(tX_train,ty_train)

# Predict the validation set once (not once per sample) and reuse the result.
ty_pred = clf.predict(tX_valid)
loss = (ty_valid != np.asarray(ty_pred).astype(int)).astype(int)  # 0/1 loss per sample
print(1-np.sum(loss)/len(ty_valid))  # validation accuracy

metrics.confusion_matrix(ty_valid, ty_pred)

In [None]:
# One-vs-one LinearSVC on the TF-IDF features.
ovo = OneVsOneClassifier(svm.LinearSVC(C=0.25))
ovo.fit(tX_train, ty_train)

# Predict the validation set once (not once per sample) and reuse the result.
ty_pred = ovo.predict(tX_valid)
loss = (ty_valid != np.asarray(ty_pred).astype(int)).astype(int)  # 0/1 loss per sample
print(1-np.sum(loss)/len(ty_valid))  # validation accuracy

metrics.confusion_matrix(ty_valid, ty_pred)

In [None]:
# Gradient boosting on the TF-IDF features.
clf_gbm = GradientBoostingClassifier()
clf_gbm.fit(tX_train,ty_train)

# Densify and predict the validation set once; doing this inside a per-sample
# loop would rebuild the dense matrix and re-predict for every sample.
ty_pred = clf_gbm.predict(tX_valid.toarray())
loss = (ty_valid != np.asarray(ty_pred).astype(int)).astype(int)  # 0/1 loss per sample
print(1-np.sum(loss)/len(ty_valid))  # validation accuracy

metrics.confusion_matrix(ty_valid, ty_pred)

In [None]:
# Random forest on the TF-IDF features.
clf_rfc = RandomForestClassifier(n_estimators=10,criterion='entropy')
clf_rfc = clf_rfc.fit(tX_train,ty_train)

# Predict the validation set once (not once per sample) and reuse the result.
ty_pred = clf_rfc.predict(tX_valid)
loss = (ty_valid != np.asarray(ty_pred).astype(int)).astype(int)  # 0/1 loss per sample
print(1-np.sum(loss)/len(ty_valid))  # validation accuracy

metrics.confusion_matrix(ty_valid, ty_pred)

In [None]:
# Logistic regression on the TF-IDF features. This cell previously trained and
# evaluated on the word-count matrices, which only repeated the earlier
# count-based logistic-regression run.
clf=linear_model.LogisticRegression(C=0.4)
clf.fit(tX_train,ty_train)

# Predict the validation set once (not once per sample) and reuse the result.
ty_pred = clf.predict(tX_valid)
loss = (ty_valid != np.asarray(ty_pred).astype(int)).astype(int)  # 0/1 loss per sample
print(np.sum(loss))  # number of misclassified validation samples

metrics.confusion_matrix(ty_valid, ty_pred)

In [None]:
def zeroOne(y,a) :
    '''
    Zero-one loss between a target class and a predicted class.
    @param y: output class
    @param a: predicted class
    @return 0 when the two classes agree, 1 otherwise
    '''
    return 1 if y != a else 0

def featureMap(X,y,num_classes) :
    '''
    Computes the class-sensitive features.

    The output row for each sample is all zeros except for the block of
    n_inFeatures columns belonging to class y, which holds a copy of the
    sample's input features (n_outFeatures = n_inFeatures * num_classes).

    @param X: array-like or scipy sparse matrix, shape = [n_samples,n_inFeatures] or [n_inFeatures,], input features for input data
    @param y: a target class (in range 0,..,num_classes-1)
    @param num_classes: total number of classes
    @return array-like, shape = [n_samples,n_outFeatures], the class sensitive features for class y
            (a single 1d input is returned as one row)
    '''
    # Sparse matrices are densified so the block assignment below works.
    if hasattr(X, "toarray"):
        X = X.toarray()
    X = np.asarray(X)
    # Treat a 1d input as a single sample so the documented 1d case works.
    if X.ndim == 1:
        X = X.reshape(1, -1)
    num_samples, num_inFeatures = X.shape[0], X.shape[1]

    psi = np.zeros([num_samples, num_inFeatures * num_classes])
    # Copy all features of all samples into the column block of class y at once.
    start = y * num_inFeatures
    psi[:, start:start + num_inFeatures] = X
    return psi



class MulticlassSVM(BaseEstimator, ClassifierMixin):
    '''
    Implements a Multiclass SVM estimator.

    The weight vector is learned by stochastic subgradient descent, with
    per-sample update direction
        2 * lam * w + Psi(x, y_hat) - Psi(x, y)
    where y_hat maximizes Delta(y, y') + <w, Psi(x, y') - Psi(x, y)> over y'.
    '''
    def __init__(self, num_outFeatures, lam=1.0, num_classes=3, Delta=zeroOne, Psi=featureMap):
        '''
        Creates a MulticlassSVM estimator.
        @param num_outFeatures: number of class-sensitive features produced by Psi
        @param lam: l2 regularization parameter
        @param num_classes: number of classes (assumed numbered 0,..,num_classes-1)
        @param Delta: class-sensitive loss function taking two arguments (i.e., target margin)
        @param Psi: class-sensitive feature map taking two arguments
        '''
        self.num_outFeatures = num_outFeatures
        self.lam = lam
        self.num_classes = num_classes
        self.Delta = Delta
        # Bind num_classes so the rest of the class can call self.Psi(X, y).
        self.Psi = lambda X,y : Psi(X,y,num_classes)
        self.fitted = False

    def subgradient(self,X,y, eta = 0.1, T = 10000):
        '''
        Runs stochastic subgradient descent and returns the learned weights.

        Samples are visited in one random order (shuffled once with the global
        numpy random state) and that order is cycled until T updates are done.
        @param X: array-like or sparse matrix, shape = [num_samples,num_inFeatures], input data
        @param y: array-like, shape = [num_samples,], input classes
        @param eta: step size of each update
        @param T: number of single-sample updates to perform
        @return weight vector, shape = [num_outFeatures,]
        @raise ValueError: if X contains no samples
        '''
        num_samples = X.shape[0]
        if num_samples == 0:
            # Without samples no update can happen and the loop would never end.
            raise ValueError("Cannot fit MulticlassSVM on an empty data set.")
        # Positional indexing below must not depend on e.g. a pandas index.
        y = np.asarray(y)

        #np.random.seed(2)
        random_ind = np.arange(num_samples)
        np.random.shuffle(random_ind)
        t = 1
        w = np.zeros(self.num_outFeatures)
        while t <= T:
            for ind in random_ind:
                if t > T:
                    # Stop exactly at T updates, not at the end of the epoch.
                    break
                # Build class-sensitive features for this one sample only; the
                # 2d slice keeps Psi's [n_samples, n_inFeatures] input contract.
                x_row = X[ind:ind + 1]
                psi_true = self.Psi(x_row, y[ind])[0]

                # Loss-augmented inference: find the most violating class.
                best_val = None
                psi_hat = None
                for y_i in range(self.num_classes):
                    psi_i = self.Psi(x_row, y_i)[0]
                    val = self.Delta(y[ind], y_i) + np.dot(psi_i - psi_true, w)
                    # Strict '>' keeps the first maximizer, like np.argmax.
                    if best_val is None or val > best_val:
                        best_val = val
                        psi_hat = psi_i

                subgrad = 2 * self.lam * w + (psi_hat - psi_true)
                w = w - subgrad * eta
                t = t + 1
        return w

    def fit(self,X,y,eta=0.1,T=10000):
        '''
        Fits multiclass SVM
        @param X: array-like, shape = [num_samples,num_inFeatures], input data
        @param y: array-like, shape = [num_samples,], input classes
        @param eta: learning rate for SGD
        @param T: maximum number of iterations
        @return returns self
        '''
        # Forward the caller's settings instead of fixed values.
        self.coef_ = self.subgradient(X,y,eta = eta, T = T)
        self.fitted = True
        return self

    def decision_function(self, X):
        '''
        Returns the score on each input for each class. Assumes
        that fit has been called.
        @param X : array-like, shape = [n_samples, n_inFeatures]
        @return array-like, shape = [n_samples, n_classes] giving scores for each sample,class pairing
        @raise RuntimeError: if fit has not been called
        '''
        if not self.fitted:
            raise RuntimeError("You must train classifer before predicting data.")

        # Column j is <w, Psi(x, j)> for every sample x.
        score = np.zeros([X.shape[0],self.num_classes])
        for j in range(self.num_classes):
            score[:,j] = np.dot(self.Psi(X,j),self.coef_)
        return score

    def predict(self, X):
        '''
        Predict the class with the highest score.
        @param X: array-like, shape = [n_samples, n_inFeatures], input data to predict
        @return array-like, shape = [n_samples,], class labels predicted for each data point
                (integer class indices)
        '''
        score = self.decision_function(X)
        # Row-wise argmax; ties resolve to the lowest class index.
        return np.argmax(score, axis=1)

In [None]:
#the following code tests the MulticlassSVM and sgd
#will fail if MulticlassSVM is not implemented yet
from sklearn import metrics

# The class-sensitive feature map has one block of input features per class,
# so derive the weight-vector length from the data instead of hard-coding it.
NUM_CLASSES = 3
est = MulticlassSVM(NUM_CLASSES * X_train.shape[1], lam=1, num_classes=NUM_CLASSES)
est.fit(X_train,y_train,eta=0.01) # choose a smaller eta of 0.01
print("w:")
print(est.coef_)

# Evaluate on the held-out validation split (the notebook defines no *_test split).
metrics.confusion_matrix(y_valid, est.predict(X_valid))