In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
import collections
from collections import Counter
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, classification_report
from sklearn.model_selection import GridSearchCV

In [2]:
# data

corpus = pd.read_csv('combined_corpus.csv', sep='\t', encoding='utf-8', header=0, index_col=0)
corpus = corpus.fillna("")
#corpus.head(10)

In [3]:
# baseline

corpus['class'].value_counts() / len(corpus['class'])

TYP    0.861256
ASD    0.138744
Name: class, dtype: float64

In [4]:
# encode labels

encoder = preprocessing.LabelEncoder()
encoder.fit(corpus['class'])
labels = encoder.transform(corpus['class'])

In [5]:
# train, test sets

features = corpus['text']
train_X, test_X, train_y, test_y = train_test_split(features, labels,
                                                    test_size=.33, 
                                                    random_state=55)  

In [6]:
class TokenCounts(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self
    
    def transform(self, corpus):
        token_counts = []
        word_tokenize = [nltk.word_tokenize(sentence) for sentence in corpus]
        for word in word_tokenize:
            counter = Counter(word)
            token_counts.append(counter)
        return token_counts

In [7]:
class TagCounts(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, corpus):
        tag_counts = []
        for line in corpus:
            tags = [tag[1] for tag in nltk.pos_tag(line.split())]
            counter = Counter(tags)
            tag_counts.append(counter)
        return tag_counts

In [40]:
# pipeline model

model = Pipeline([
        ('features', FeatureUnion(
            transformer_list=[ 
                #('unigram', Pipeline([
                    #('cvec', CountVectorizer(analyzer='word',
                                             #ngram_range=(1,1),
                                             #max_features=255, 
                                             #max_df=0.5
                                             #)),
                    #])),
                
                #('bigram', Pipeline([
                    #('cvec', CountVectorizer(analyzer='word',  
                                             #ngram_range=(1,2),
                                             #max_features=210, 
                                             #max_df=0.5
                                             #)),
                    #])),
                
                #('tfidf_features', Pipeline([
                    #('cvec', CountVectorizer(analyzer='word',
                                             #stop_words=None,
                                             #ngram_range=(1,2),
                                             #max_features=210, 
                                             #max_df=0.5
                                             #)),
                    #('tfidf', TfidfTransformer(smooth_idf=True,
                                               #use_idf=True)),
                    #])), 
                
                #('char_features', Pipeline([
                    #('tfidf', TfidfVectorizer(analyzer='char',
                                              #ngram_range=(1,2),
                                              #max_df=0.5, 
                                              #max_features=200,
                                              #use_idf=True
                                              #)),
                    #])),
                
                #('NP_features', Pipeline([
                    #('NPs', noun_phrases()),
                    #('dvec', DictVectorizer()),
                    #])),
                
                ('word_features', Pipeline([
                    ('token_counter', TokenCounts()),
                    ('dvec', DictVectorizer()),
                    ])),
                
                ('pos_features', Pipeline([
                    ('tag_counter', TagCounts()),
                    ('dvec', DictVectorizer()),
                    ])),
                
        ])),
        #('mnb', MultinomialNB())            
        ('gbc', GradientBoostingClassifier(random_state=250, 
                                           max_features='auto',
                                           loss="deviance",
                                           n_estimators=950,
                                           criterion='friedman_mse' 
                                           ))
])

In [41]:
# fit model and make predictions

model.fit(train_X, train_y)
predictions = model.predict(test_X)

In [39]:
print(' MNB classifier\n%s' %metrics.classification_report(test_y, predictions))
print('confusion matrix\n%s' % metrics.confusion_matrix(test_y, predictions))
print(f"Accuracy score:\t{format(metrics.accuracy_score(test_y, predictions), '.2f')}")

 MNB classifier
              precision    recall  f1-score   support

           0       0.57      0.18      0.27     12805
           1       0.88      0.98      0.93     79897

    accuracy                           0.87     92702
   macro avg       0.72      0.58      0.60     92702
weighted avg       0.84      0.87      0.84     92702

confusion matrix
[[ 2252 10553]
 [ 1707 78190]]
Accuracy score:	0.87


In [42]:
print('GBC classifier\n%s' %metrics.classification_report(test_y, predictions))
print('confusion matrix\n%s' % metrics.confusion_matrix(test_y, predictions))
print(f"Accuracy score:\t{format(metrics.accuracy_score(test_y, predictions), '.2f')}")

GBC classifier
              precision    recall  f1-score   support

           0       0.90      0.08      0.14     12805
           1       0.87      1.00      0.93     79897

    accuracy                           0.87     92702
   macro avg       0.89      0.54      0.54     92702
weighted avg       0.87      0.87      0.82     92702

confusion matrix
[[  972 11833]
 [  107 79790]]
Accuracy score:	0.87


### Grid Search

In [20]:
parameters = {
        'gbc__random_state': (150, 250, 450),
        'gbc__n_estimators': (550, 800, 950),
}

In [21]:
grid_search = GridSearchCV(model, 
                           parameters, 
                           cv=5, 
                           scoring='accuracy', 
                           refit=True, 
                           n_jobs=-1, 
                           verbose=1)

In [22]:
grid_search.fit(train_X, train_y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed: 107.4min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('features',
                                        FeatureUnion(n_jobs=None,
                                                     transformer_list=[('word_features',
                                                                        Pipeline(memory=None,
                                                                                 steps=[('token_counter',
                                                                                         TokenCounts()),
                                                                                        ('dvec',
                                                                                         DictVectorizer(dtype=<class 'numpy.float64'>,
                                                                                                        separator='=',
                                        

In [23]:
grid_search.best_params_

{'gbc__n_estimators': 950, 'gbc__random_state': 250}