# Imports Always First

In [51]:
import os
import pandas as pd
import re
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [52]:
os.listdir('..\data')

['.ipynb_checkpoints', 'Corporate-messaging-DFE.csv']

# Data Extract

In [53]:
def extract_data():
    df = pd.read_csv('..\data\Corporate-messaging-DFE.csv', encoding='latin-1')
    df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
    X = df.text.values
    y = df.category.values
    return X, y

# Custom Tokenizer

In [54]:
def tokenize(text):
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

# Pipeline

In [70]:
def main():
    X, y = extract_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    pipeline = Pipeline(steps=[('vect', CountVectorizer(tokenizer=tokenize)), 
                               ('tfidf', TfidfTransformer()), 
                               ('clf', RandomForestClassifier(n_estimators=100))])
    
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    
    labels = np.unique(y_pred)
    index_labels = {i: x for i, x in enumerate(labels)}
    display(pd.DataFrame(confusion_matrix(y_test, y_pred, labels=labels), columns=labels).rename(index=index_labels))
    print(f"Prediction Score: {(y_pred == y_test).mean():2.2%}")

# Evaluation

In [71]:
main()

Unnamed: 0,Action,Dialogue,Information
Action,81,0,24
Dialogue,0,26,6
Information,3,0,461


Prediction Score: 94.51%
