In [12]:
import numpy as np
import pandas as pd
import pymongo as pm
import pickle
import re
import json
import lxml.etree as le
from nltk.tokenize import RegexpTokenizer
from sklearn.externals import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

item selector to grab feature I want to work with in pipe.

see http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html#sphx-glr-auto-examples-hetero-feature-union-py

In [2]:
class ItemSelector(BaseEstimator, TransformerMixin):
    """
    For data grouped by feature, select subset of data at a provided key.
    """
    def __init__(self, column):
        self.column = column

    def fit(self, x, y=None):
        return self

    def transform(self, df):
        return df[self.column]

title feature

In [31]:
class title_featurer(BaseEstimator):
    """
    Class for creating custom title feature
    """

    def __init__(self):
        """
        Init custom feature
        """

        with open('constant_lists/section_lists_custom_dict.json') as f:
            label_lists = json.load(f)

        self.label_list = label_lists['methods']


    def fit(self, df, y=None):
        """
        fit method just returns the data
        """

        return self

    def transform(self, df):
        """
        See if the title is in the list jim gave me and give it a weight if so
        """

        title_vector = []
        pattern = r'(?<=<title>)(.*?)(?=<\/title>)'
        #for index, row in df.iterrows():
        for row in df:
            regex_search = re.search(pattern, row)
            #if regex_search:
                #print(regex_search.group(0))
            if regex_search.group(0) in self.label_list:
                title_vector.append(1)
            else:
                title_vector.append(0)

        X = np.array([title_vector]).T # need the transpose to convert from [1,32000] to [32000,1]

        return X
        #return title_vector

location feature

In [8]:
class location_featurer(BaseEstimator):
    """
    Class for creating custom location feature
    """

    def fit(self, df, y=None):
        """
        fit method just returns the data
        """

        return self

    def transform(self, column):
        """
        See if the title is in the list jim gave me and give it a weight if so
        """
        
        location_vector = column.round(1)
        # round location
        X = np.array([location_vector]).T # need the transpose to convert from [1,32000] to [32000,1]
        
        return X


In [36]:
client = pm.MongoClient('localhost', 27017)
db = client['full_texts'] # creates database if not there
#documents = db['texts'] # creates new collection if not there
documents = db['train_val_v2']
#df = pd.DataFrame(list(documents.find()))
#df.fillna(value = 'none', inplace = True)

df_conclusion = pd.DataFrame(list(documents.find({'label': 'conclusion'}).limit(10000)))
df_result = pd.DataFrame(list(documents.find({'label': 'result'}).limit(10000)))
df_method = pd.DataFrame(list(documents.find({'label': 'method'}).limit(10000)))
df_intro = pd.DataFrame(list(documents.find({'label': 'introduction'}).limit(10000)))

dfs = [df_conclusion, df_result, df_method, df_intro]

# merge the dfs!
merged_df = pd.concat(dfs)
merged_df.sort_values('date', inplace = True)

# drop short rows
merged_df = merged_df.loc[merged_df['text'].map(len) > 400]
merged_df.reset_index(inplace = True, drop = True)
print("df shape", merged_df.shape)

# Make data binary
pattern = r'^(?!method).*$'
merged_df['label'].replace(pattern, 'other', regex = True, inplace = True)

# clean punctuation...
#punctuation = r'[\*\<\>\?\.\$\!\(\)\@\#\%\^\-\+\{\}\[\]\,\\\/:;"\'\|]'

# see what difference the namespaces make... 
# none apparently
# texts = []
# for i, row in merged_df.iterrows():
#     try:
#         root = le.fromstring(row['text'])
#         le.cleanup_namespaces(root)
#         texts.append(le.tostring(root, encoding='unicode', method='xml'))
#     except le.XMLSyntaxError as e:
#         print(e)
#         texts.append(row['text'])
# merged_df['text'] = texts

# reset indices so I can use the indices from X_test and y_predictions
X_train = merged_df.loc[:31999, ['text', 'location']].reset_index(drop=True)
y_train = merged_df.loc[:31999, 'label'].reset_index(drop=True)
X_test = merged_df.loc[32000:, ['text', 'location']].reset_index(drop=True)
y_test = merged_df.loc[32000:, 'label'].reset_index(drop=True)


df shape (38909, 7)
Extra content at the end of the document, line 1, column 6094 (<string>, line 1)
Extra content at the end of the document, line 1, column 5840 (<string>, line 1)
Extra content at the end of the document, line 27, column 52 (<string>, line 27)
Extra content at the end of the document, line 1, column 1269 (<string>, line 1)
Extra content at the end of the document, line 1, column 4993 (<string>, line 1)
Extra content at the end of the document, line 19, column 57 (<string>, line 19)
Extra content at the end of the document, line 1, column 2882 (<string>, line 1)
Extra content at the end of the document, line 1, column 2241 (<string>, line 1)
Extra content at the end of the document, line 1, column 571 (<string>, line 1)
Extra content at the end of the document, line 1, column 1255 (<string>, line 1)
Extra content at the end of the document, line 1, column 4641 (<string>, line 1)
Extra content at the end of the document, line 1, column 630 (<string>, line 1)
Extra cont

small dataset for fun

In [7]:
merged_df = merged_df.sample(200)
X = merged_df.loc[:, ['text', 'location']]
y = merged_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, stratify = y)

well I want to use a location feature but I need a more complex pipeline

In [37]:
pipeline = Pipeline([
    # Use FeatureUnion to combine the features from subject and body
    ('union', FeatureUnion(
        transformer_list=[

            ('location_pipe', Pipeline([
                ('selector', ItemSelector(column='location')),
                ('location', location_featurer()),
            ])),

            ('title_pipe', Pipeline([
                ('selector', ItemSelector(column='text')),
                ('title', title_featurer()),
            ])),
            
            ('text_pipe', Pipeline([
                ('selector', ItemSelector(column='text')),
                ('tfidf', TfidfVectorizer()),  
            ])),
            
        ],
#     transformer_weights={
#         'titles': .8,
#         'location': 1.2,
#         'tfidf': .7
#         }
    )),

    # Use a SVC classifier on the combined features
    #('svcl', SVC(kernel='linear')),
    ('lg', LogisticRegression(random_state=0)),
])

test this pipe!

In [38]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_pred, y_test))

AttributeError: 'NoneType' object has no attribute 'group'