# Imports Always First

In [7]:
import os
import re

import nltk
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

In [8]:
# List the data directory; os.path.join avoids the invalid '\d' string escape
# and works on any OS, not just Windows.
os.listdir(os.path.join('..', 'data'))

['.ipynb_checkpoints', 'Corporate-messaging-DFE.csv']

# Data Extract

In [9]:
# Default location of the corporate-messaging CSV, built portably (the previous
# '..\data\...' literal relied on invalid string escapes and Windows separators).
DATA_PATH = os.path.join('..', 'data', 'Corporate-messaging-DFE.csv')


def extract_data(path=DATA_PATH):
    """Load the messaging CSV and return the text and category arrays.

    Only rows whose ``category:confidence`` equals 1 and whose ``category``
    is not ``'Exclude'`` are kept.

    Parameters
    ----------
    path : str, optional
        CSV file to read (latin-1 encoded). Defaults to ``DATA_PATH``.

    Returns
    -------
    X : numpy.ndarray
        Values of the ``text`` column for the retained rows.
    y : numpy.ndarray
        Values of the ``category`` column for the retained rows.
    """
    df = pd.read_csv(path, encoding='latin-1')
    keep = (df["category:confidence"] == 1) & (df["category"] != 'Exclude')
    df = df[keep]
    X = df["text"].values
    y = df["category"].values
    return X, y

# Custom Tokenizer

In [10]:
# http/https URL matcher. Written as a raw string so the regex escapes
# (\( and \)) are not interpreted as (invalid) Python string escapes.
URL_REGEX = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)


def tokenize(text):
    """Turn raw message text into a list of normalized tokens.

    Every URL is replaced with the literal token ``urlplaceholder``; the text
    is then split with ``word_tokenize`` and each token is lemmatized,
    lower-cased and stripped of surrounding whitespace.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Substitute in a single regex pass. The earlier findall + str.replace loop
    # mangled the text when one URL was a prefix of another (the shorter URL
    # was replaced inside the longer one, leaving e.g. "urlplaceholder/b").
    text = URL_REGEX.sub("urlplaceholder", text)

    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(tok).lower().strip()
        for tok in word_tokenize(text)
    ]

# Pipeline + Feature Union

For a `FeatureUnion` to work, the machine learning algorithm (the final estimator) must be a step at the same pipeline level as the `FeatureUnion` itself, not nested inside it. Any custom transformers need to be added to the `FeatureUnion`'s `transformer_list`.

In [11]:
class SentLen(BaseEstimator, TransformerMixin):
    """Stateless transformer producing the length of each input element.

    The length is ``len(element)``, i.e. the character count when the
    elements are strings.
    """

    def transform(self, X):
        """Return a one-column DataFrame holding ``len(x)`` for every x in X."""
        lengths = pd.Series(X).apply(len)
        return pd.DataFrame(lengths)

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

In [13]:
def main(random_state=42):
    """Train, tune and evaluate the message-category classifier.

    Steps: load the data with ``extract_data``, split it into train and test
    sets, grid-search a pipeline that unions TF-IDF text features with the
    ``SentLen`` length feature ahead of a random forest, then display the
    test-set confusion matrix, the accuracy and the best parameters found.

    Parameters
    ----------
    random_state : int or None, optional
        Seed passed to both the train/test split and the random forest so a
        re-run reproduces the same numbers. Pass ``None` ` for unseeded runs.
    """
    X, y = extract_data()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state
    )

    # NOTE(review): the custom `tokenize` function defined in this notebook is
    # not passed to CountVectorizer, so the default tokenization is used.
    # Pass tokenizer=tokenize if the custom tokenizer was meant to apply.
    text_pipeline = Pipeline(
        steps=[
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer())
        ])

    pipeline = Pipeline(
        steps=[
            ('features', FeatureUnion(
                transformer_list=[
                    ('text_pipeline', text_pipeline),
                    ('sentlen', SentLen())
                ])),
            ('clf', RandomForestClassifier(n_estimators=100,
                                           random_state=random_state))
        ])

    parameters = {
        'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        'features__text_pipeline__vect__max_df': (0.5, 0.75, 1.0),
        'features__text_pipeline__vect__max_features': (None, 5000, 10000),
        'features__text_pipeline__tfidf__use_idf': (True, False),
        'clf__n_estimators': [10, 100, 200],
        'clf__min_samples_split': [2, 3, 4],
        'features__transformer_weights': (
            {'text_pipeline': 1, 'sentlen': 0.5},
            {'text_pipeline': 0.5, 'sentlen': 1},
            {'text_pipeline': 0.8, 'sentlen': 1})
    }

    cv = GridSearchCV(pipeline, parameters)
    cv.fit(X_train, y_train)

    y_pred = cv.predict(X_test)

    # Take labels from both the true and the predicted values: deriving them
    # from y_pred alone drops any class the model never predicts, hiding its
    # misclassified rows from the matrix.
    labels = np.unique(np.concatenate([y_test, y_pred]))

    # Rows are the true categories, columns the predicted ones.
    confusion = pd.DataFrame(
        confusion_matrix(y_test, y_pred, labels=labels),
        index=labels,
        columns=labels,
    )
    display(confusion)
    print(f"Prediction Score: {(y_pred == y_test).mean():2.2%}")
    print("\nBest Parameters: ", cv.best_params_)

# Evaluation

In [14]:
# Run the whole workflow: load the data, grid-search the pipeline, report results.
main()



Unnamed: 0,Action,Dialogue,Information
Action,88,0,14
Dialogue,1,24,4
Information,5,0,465


Prediction Score: 96.01%

Best Parameters:  {'clf__min_samples_split': 2, 'clf__n_estimators': 100, 'features__text_pipeline__tfidf__use_idf': True, 'features__text_pipeline__vect__max_df': 0.5, 'features__text_pipeline__vect__max_features': None, 'features__text_pipeline__vect__ngram_range': (1, 1), 'features__transformer_weights': {'text_pipeline': 0.8, 'sentlen': 1}}
