# Figure out how to use maxent models to classify text
<a href="https://web.stanford.edu/class/cs124/lec/Maximum_Entropy_Classifiers.pdf">Inspiration from Chris Manning tutorial</a>

## Import libraries

### following these pages:<br>
<a href="http://www.nltk.org/api/nltk.classify.html#module-nltk.classify.scikitlearn">How to wrap a sklearn SGDClassifier into an NLTK classifier</a><br>
<a href="http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html">Details of the sklearn Stochastic Gradient Descent classifier</a>


In [6]:
from sklearn.linear_model import SGDClassifier
from nltk.classify.scikitlearn import SklearnClassifier

In [22]:
from nltk import word_tokenize

## Create training and test data
### for a two-way classification problem, Location or Drug

In [71]:
train = [('in Arcadia', 'LOCATION'),
         (u'in Qu\u00E9bec', 'LOCATION'),
        ('taking Zantac', 'DRUG'),
         ('outside Beynac', "LOCATION")]
test = [('buying aspirin', 'DRUG'),
       ('taking Prozac', 'DRUG'),
       ('in Capetown', 'LOCATION')]

In [74]:
print(train[1][0])
# Make sure we're putting in the accented characters properly

in Québec


## Define functions to extract features

In [75]:
def f1(previousWord, targetWord):
    #True iff C = Location and w-1 is 'in' and isCapitalized(w)
    if previousWord.lower() == 'in' and targetWord[0].isupper():
        return True
    else:
        return False

def f2(previousWord, targetWord):
    latin1_lowerbound = int("0080", 16) #i.e. 128
    latin1_upperbound =  int("00FF", 16) # i.e. 255
    #True iff C= Location and hasAccentedLatinChar(w)
    for char in targetWord:
        if ord(char) >= latin1_lowerbound and ord(char) <= latin1_upperbound:
            return True
    return False

def f3(previousWord, targetWord):
    #True iff C=Drug and endsWith(w, "c")
    if (targetWord.lower().endswith('c')):
        return True
    else:
        return False


## Create feature X doc matrix from training data

In [76]:
#Function to vectorize a list of two-word phrases with three features
def vectorize(labeledStrings):
    '''
    Pass it a list of tuples of strings and class labels
    get back a list of feature vectors and a list of class labels
    '''
    vectorMatrix = []
    class_labels = []
    vectorFuncs = [f1, f2, f3]
    for (phrase, category) in labeledStrings:
        class_labels.append(category)
        tokens = word_tokenize(phrase)
        vector = []
        assert(len(tokens) == 2)
        for i in [0,1,2]:
            vector.append(vectorFuncs[i](tokens[0], tokens[1]))
        vectorMatrix.append(vector)
    
    return (vectorMatrix, class_labels)

#Make sure it worked as expected
train_X, class_labels_train = vectorize(train)
print(train_X)
print(class_labels_train)
    

[[True, False, False], [True, True, True], [False, False, True], [False, False, True]]
['LOCATION', 'LOCATION', 'DRUG', 'LOCATION']


In [77]:
#Vectorize test data, check result makes sense
test_X, class_labels_test = vectorize(test)
print(test_X)
print(class_labels_test)

[[False, False, False], [False, False, True], [True, False, False]]
['DRUG', 'DRUG', 'LOCATION']


## Instantiate and train maxent classifier

In [78]:
clf = SGDClassifier(loss="log", max_iter = 1000)
#The loss="log" parameter specifies a logistic regression model, not an SVM classifier
# the max_iter supposedly is the number of iterations of training used

In [79]:
clf.fit(train_X, class_labels_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=1000, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

## Run on test data

In [80]:
clf.predict(test_X)

array(['LOCATION', 'LOCATION', 'LOCATION'],
      dtype='<U8')

In [81]:
clf.get_params()

{'alpha': 0.0001,
 'average': False,
 'class_weight': None,
 'epsilon': 0.1,
 'eta0': 0.0,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'optimal',
 'loss': 'log',
 'max_iter': 1000,
 'n_iter': None,
 'n_jobs': 1,
 'penalty': 'l2',
 'power_t': 0.5,
 'random_state': None,
 'shuffle': True,
 'tol': None,
 'verbose': 0,
 'warm_start': False}

## Extract the coefficients for the model

In [82]:
# These are the coefficients of the model
clf.coef_

array([[ 5.10058785,  2.66455433, -1.78222997]])

In [83]:
# This is the model intercept
clf.intercept_

array([ 3.40340975])

In [84]:
clf.predict_proba(test_X)

array([[  3.21890715e-02,   9.67810928e-01],
       [  1.65042229e-01,   8.34957771e-01],
       [  2.02615547e-04,   9.99797384e-01]])

## Can we replicate the predicted probabilities from the coefficients?

In [87]:
#prob(input = location) = 1 / 1 + e ^^ -z
#z = B0 + B1X1 + ... BnXn
from math import exp
ez = exp(-(clf.intercept_ + (clf.coef_[0][0] * test_X[0][0]) + (clf.coef_[0][1] * test_X[0][1]) + (clf.coef_[0][2] * test_X[0][2])))
predict_1 = (1 / (1 + ez))
print(predict_1)

0.9678109284919212


## Success