### Machine Learning / NLP - Pipeline

In [1]:
import re
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV

In [2]:
# Fetch the NLTK resources used below: tokenizer models for word_tokenize and
# the WordNet corpus for WordNetLemmatizer. Recent NLTK releases load the
# tokenizer from 'punkt_tab'; 'punkt' is kept so older releases still work.
nltk.download(['punkt', 'punkt_tab', 'wordnet'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\preks\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\preks\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

**Corporate message data**

In [3]:
# Read the raw CSV, then preview the label and message columns of the first ten rows.
data = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
data.head(10)[['category', 'text']]

Unnamed: 0,category,text
0,Information,Barclays CEO stresses the importance of regula...
1,Information,Barclays announces result of Rights Issue http...
2,Information,Barclays publishes its prospectus for its å£5....
3,Information,Barclays Group Finance Director Chris Lucas is...
4,Information,Barclays announces that Irene McDermott Brown ...
5,Information,Barclays response to PRA capital shortfall exe...
6,Information,Barclays sponsors #Zamynforum BBC World Servic...
7,Information,Barclays has today published its response to T...
8,Action,Read statement on #Barclays CEO bonus award 20...
9,Action,59% of workers are either looking to change jo...


**Machine Learning Pipeline**

In [4]:
# Pattern used by tokenize() to find http/https URLs.
# Written as a raw string so the `\(` and `\)` escapes reach the regex engine
# without Python flagging them as invalid string escapes.
# Note: `$-_` inside the character class is a range (ASCII 0x24-0x5F), not
# three separate literals.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'


def load_data(data_file='corporate_messaging.csv'):
    '''Load the labelled messages and return the texts and their categories.

    Only rows whose ``category:confidence`` equals 1 and whose ``category``
    is not ``'Exclude'`` are kept.

    Args:
        data_file: Path of the CSV file to read (anything accepted by
            ``pandas.read_csv``). Defaults to ``'corporate_messaging.csv'``
            in the current working directory.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds the values of the ``text`` column
        and ``y`` the values of the ``category`` column for the kept rows.
    '''
    df = pd.read_csv(data_file, encoding='latin-1')

    # Keep fully confident labels and drop the 'Exclude' category.
    keep = (df['category:confidence'] == 1) & (df['category'] != 'Exclude')
    df = df[keep]

    X = df['text'].values
    y = df['category'].values
    return X, y


def tokenize(text):
    '''Replace URLs with a placeholder, then tokenize and normalize the text.

    Args:
        text: The message string to process.

    Returns:
        list of str: one entry per token produced by ``word_tokenize``, each
        lemmatized, lower-cased and stripped of surrounding whitespace.
    '''
    # A single re.sub pass replaces each match in place. Replacing the found
    # strings one at a time with str.replace would mangle a longer URL that
    # starts with a shorter one found earlier in the text.
    text = re.sub(url_regex, "urlplaceholder", text)

    lemmatizer = WordNetLemmatizer()

    # NOTE(review): lemmatization runs before lower-casing; presumably
    # capitalised tokens are therefore looked up as-is -- confirm intended.
    return [
        lemmatizer.lemmatize(tok).lower().strip()
        for tok in word_tokenize(text)
    ]
    

def display_results(y_test, y_pred):
    '''Print the class labels, confusion matrix and accuracy of predictions.

    Args:
        y_test: True labels (array-like).
        y_pred: Predicted labels (array-like), aligned with ``y_test``.

    Returns:
        None. The results are printed.
    '''
    # Coerce to arrays so the element-wise comparison below also works when
    # plain lists are passed in.
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)

    # Take the labels from both arrays: a class that is present in y_test but
    # never predicted must still get its own row/column in the matrix.
    labels = np.union1d(y_test, y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)

In [5]:
def main(random_state=42):
    '''Train and evaluate the text classification pipeline end to end.

    Loads the data, splits it into train and test sets, fits a
    CountVectorizer -> TfidfTransformer -> RandomForestClassifier pipeline
    on the training split and prints the evaluation on the test split.

    Args:
        random_state: Seed passed to both ``train_test_split`` and
            ``RandomForestClassifier`` so that repeated runs give the same
            split and the same model. Pass ``None` to leave both unseeded.

    Returns:
        None. The results are printed by ``display_results``.
    '''
    # load data
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state
    )

    # build pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier(random_state=random_state)),
    ])

    # train classifier
    pipeline.fit(X_train, y_train)

    # predict on test data
    y_pred = pipeline.predict(X_test)

    # display results
    display_results(y_test, y_pred)

In [6]:
# Run the whole workflow defined in main(): load the data, train the pipeline
# and print the evaluation on the held-out split.
main()

Labels: ['Action' 'Dialogue' 'Information']
Confusion Matrix:
 [[ 83   0  28]
 [  0  22   4]
 [  2   1 461]]
Accuracy: 0.9417637271214643
