# Figure out how to use maxent models to classify text
<a href="https://web.stanford.edu/class/cs124/lec/Maximum_Entropy_Classifiers.pdf">Inspiration from Chris Manning tutorial</a>

## Import libraries

### Following these pages:<br>
<a href="http://www.nltk.org/api/nltk.classify.html#module-nltk.classify.scikitlearn">How to wrap a sklearn SGDClassifier into an NLTK classifier</a><br>
<a href="http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html">Details of the sklearn Stochastic Gradient Descent classifier</a>


In [1]:
from sklearn.linear_model import SGDClassifier
from nltk.classify.scikitlearn import SklearnClassifier

In [2]:
from nltk import word_tokenize

## Create training and test data
### for a two-way classification problem, Location or Drug

In [19]:
# Labelled two-word phrases as (phrase, class) pairs; class is 'LOCATION' or 'DRUG'.
train = [
    ('in Arcadia', 'LOCATION'),
    (u'in Qu\u00E9bec', 'LOCATION'),  # \u00E9 is an accented "e"
    ('taking Zantac', 'DRUG'),
    ('outside Beynac', 'LOCATION'),
]
# Held-out phrases in the same (phrase, class) format.
test = [
    ('buying aspirin', 'DRUG'),
    ('taking Prozac', 'DRUG'),
    ('in Capetown', 'LOCATION'),
]

In [20]:
# Sanity check: the \u00E9 escape in the second training phrase should print
# as an accented character.
print(train[1][0])

in Québec


## Define functions to extract features

In [23]:
def f1(previousWord, targetWord):
    '''
    Feature 1: the target word is capitalized and preceded by 'in'.

    Intended to fire for the LOCATION class (e.g. "in Arcadia").

    previousWord -- token immediately before the target; compared to 'in'
                    case-insensitively
    targetWord   -- token being classified; may be an empty string

    Returns True when previousWord is 'in' and targetWord starts with an
    uppercase character, otherwise False.
    '''
    # Slice with [:1] rather than index with [0] so an empty target yields
    # False (''.isupper() is False) instead of raising IndexError.
    return previousWord.lower() == 'in' and targetWord[:1].isupper()

def f2(previousWord, targetWord):
    '''
    Feature 2: the target word contains a character in U+0080..U+00FF.

    Intended to fire for the LOCATION class when the word has an accented
    Latin character (e.g. "Qu\u00e9bec").

    previousWord -- unused; present so all feature functions share a signature
    targetWord   -- token being classified

    Returns True if any character's code point lies in 128..255 inclusive,
    otherwise False.

    NOTE(review): the tested range also includes non-letter symbols in that
    block, not only accented letters.
    '''
    range_start, range_end = 0x0080, 0x00FF  # i.e. 128 and 255
    return any(range_start <= ord(ch) <= range_end for ch in targetWord)

def f3(previousWord, targetWord):
    '''
    Feature 3: the target word ends with the letter "c" (case-insensitive).

    Intended to fire for the DRUG class (e.g. "Zantac", "Prozac").

    previousWord -- unused; present so all feature functions share a signature
    targetWord   -- token being classified

    Returns True if the lowercased target ends with 'c', otherwise False.
    '''
    lowered = targetWord.lower()
    return lowered.endswith('c')
 
# Feature functions in column order; each is called as func(previousWord, targetWord)
# and returns a bool.
vectorFuncs = [f1, f2, f3]

## Create feature × document matrix from training data

In [24]:
#Function to vectorize a list of two-word phrases, one feature per supplied function
def vectorize(labeledStrings, myVectorFuncs):
    '''
    Turn labelled two-word phrases into feature vectors.

    labeledStrings -- iterable of (phrase, class_label) tuples; every phrase
                      must tokenize into exactly two tokens
    myVectorFuncs  -- sequence of feature functions, each called as
                      func(previousWord, targetWord)

    Returns (vectorMatrix, class_labels): a list holding one feature vector
    per phrase (one entry per function, in the order given) and the parallel
    list of class labels.

    Raises AssertionError if a phrase does not tokenize into two tokens.
    '''
    vectorMatrix = []
    class_labels = []

    for (phrase, category) in labeledStrings:
        tokens = word_tokenize(phrase)
        assert len(tokens) == 2, "expected a two-word phrase, got %r" % (phrase,)
        previousWord, targetWord = tokens
        # Apply every supplied feature function instead of a fixed three, so
        # the vector width always matches len(myVectorFuncs).
        vectorMatrix.append([func(previousWord, targetWord) for func in myVectorFuncs])
        class_labels.append(category)

    return (vectorMatrix, class_labels)

#Make sure it worked as expected: vectorize the training phrases, then print
#the feature matrix followed by the parallel list of class labels
train_X, class_labels_train = vectorize(train, vectorFuncs)
print(train_X)
print(class_labels_train)
    

[[True, False, False], [True, True, True], [False, False, True], [False, False, True]]
['LOCATION', 'LOCATION', 'DRUG', 'LOCATION']


In [7]:
#Vectorize test data, check result makes sense
# vectorize() requires the list of feature functions as its second argument;
# pass the same vectorFuncs used for the training data so the columns line up.
test_X, class_labels_test = vectorize(test, vectorFuncs)
print(test_X)
print(class_labels_test)

[[False, False, False], [False, False, True], [True, False, False]]
['DRUG', 'DRUG', 'LOCATION']


## Instantiate and train maxent classifier

In [26]:
clf = SGDClassifier(loss="log", max_iter = 1000)
# loss="log" makes SGDClassifier fit a logistic regression (maxent) model rather than an SVM.
# max_iter is, per the scikit-learn documentation, the maximum number of passes over the training data.
# NOTE(review): newer scikit-learn releases name this loss "log_loss" - confirm against the installed version.
# NOTE(review): no random_state is passed and shuffling is on by default, so fitted values may vary between runs.

In [27]:
# Train on the boolean feature vectors and their class labels
clf.fit(train_X, class_labels_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', max_iter=1000, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

## Run on test data

In [28]:
# Predicted class label for each test phrase
clf.predict(test_X)

array(['LOCATION', 'LOCATION', 'LOCATION'],
      dtype='<U8')

In [29]:
# Show every hyperparameter of the classifier, including the ones left at their defaults
clf.get_params()

{'alpha': 0.0001,
 'average': False,
 'class_weight': None,
 'epsilon': 0.1,
 'eta0': 0.0,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'optimal',
 'loss': 'log',
 'max_iter': 1000,
 'n_iter': None,
 'n_jobs': 1,
 'penalty': 'l2',
 'power_t': 0.5,
 'random_state': None,
 'shuffle': True,
 'tol': None,
 'verbose': 0,
 'warm_start': False}

## Extract the coefficients for the model

In [30]:
# These are the coefficients of the model: one learned weight per feature column (f1, f2, f3)
clf.coef_

array([[ 3.76170401,  1.4276586 , -1.53987417]])

In [31]:
# This is the model intercept (the B0 term of z = B0 + B1X1 + ... + BnXn)
clf.intercept_

array([ 3.23263415])

In [32]:
# Per-class probabilities, one row per test phrase
# NOTE(review): the second column appears to be P(LOCATION) - confirm via clf.classes_
clf.predict_proba(test_X)

array([[  3.79559432e-02,   9.62044057e-01],
       [  1.55413221e-01,   8.44586779e-01],
       [  9.16219317e-04,   9.99083781e-01]])

## Can we replicate the predicted probabilities from the coefficients?

In [33]:
# Recompute by hand the probability for the first test phrase:
#   prob(input = location) = 1 / (1 + e^(-z))
#   z = B0 + B1*X1 + ... + Bn*Xn
from math import exp
from functools import reduce
# clf.intercept_ is a one-element array; index it so math.exp() receives a
# plain scalar instead of relying on implicit array-to-scalar conversion.
z = clf.intercept_[0] + sum(weight * value for weight, value in zip(clf.coef_[0], test_X[0]))
ez = exp(-z)
predict_1 = (1 / (1 + ez))
print(predict_1)

0.9620440567724402


## Success

In [44]:
import statsmodels.api as sm
import statsmodels.discrete.discrete_model as sm2
import pandas as pd

In [53]:
# Feature matrix as a DataFrame with one named column per feature function.
train_df = pd.DataFrame.from_records(train_X, columns=['f1', 'f2', 'f3'])
# Class labels plus a numeric response column: 1 for LOCATION, 0 for anything else.
class_labels_df = pd.DataFrame(class_labels_train, columns=['label'])
class_labels_df['response'] = (class_labels_df['label'] == 'LOCATION').astype('int64')
print(class_labels_df)

      label  response
0  LOCATION         1
1  LOCATION         1
2      DRUG         0
3  LOCATION         1


In [57]:
# Logistic regression of the 0/1 response on the raw feature lists.
# NOTE(review): train_X (not train_df) is passed, so the summary labels the columns x1..x3,
# and no constant term is added here - confirm whether an intercept was intended.
logit = sm.Logit(class_labels_df['response'], train_X)

In [58]:
# Fit by maximum likelihood; the recorded summary for this run reports converged: False
result = logit.fit()

         Current function value: 0.346574
         Iterations: 35




In [60]:
# Print the fitted model's summary table (coefficients, standard errors, fit statistics)
print(result.summary())

                           Logit Regression Results                           
Dep. Variable:               response   No. Observations:                    4
Model:                          Logit   Df Residuals:                        1
Method:                           MLE   Df Model:                            2
Date:                Tue, 24 Apr 2018   Pseudo R-squ.:                  0.3837
Time:                        15:12:33   Log-Likelihood:                -1.3863
converged:                      False   LL-Null:                       -2.2493
                                        LLR p-value:                    0.4219
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            27.3061    8.5e+05   3.21e-05      1.000   -1.67e+06    1.67e+06
x2             3.0167   3.94e+06   7.67e-07      1.000   -7.71e+06    7.71e+06
x3          1.353e-13      1.414   9.56e-14      1.0