In [1]:
# download necessary NLTK data
import nltk
nltk.download(['punkt', 'wordnet'])

# import statements
import re
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from custom_transformer import StartingVerbExtractor

[nltk_data] Downloading package punkt to /Users/ryanneal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ryanneal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


ModuleNotFoundError: No module named 'custom_transformer'

In [None]:
def load_data():
    df = pd.read_csv('corporate_messaging.csv', encoding='latin-1')
    df = df[(df["category:confidence"] == 1) & (df['category'] != 'Exclude')]
    X = df.text.values
    y = df.category.values
    return X, y

In [None]:
url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

In [None]:
def tokenize(text):
    # get list of all urls using regex
    detected_urls = re.findall(url_regex, text)
    
    # replace each url in text string with placeholder
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    # tokenize text
    tokens = word_tokenize(text)
    
    # initiate lemmatizer
    lemmatizer = wordnet_lemmatizer = WordNetLemmatizer()

    # iterate through each token
    clean_tokens = []
    for tok in tokens:
        
        # lemmatize, normalize case, and remove leading/trailing white space
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

<h2> Machine Learning Model </h2>

In [None]:
def display_results(y_test, y_pred):
    labels = np.unique(y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)

In [None]:
def manual_main():
    # load data into model
    X, y = load_data()

    # perform train test split. Save 30 percent of data for testing.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
    
    # initialize objects we need
    vect = CountVectorizer(tokenizer=tokenize) 
    tfidf = TfidfTransformer()
    clf = RandomForestClassifier()
    
    # get word counts using count vectorizer
    X_train_word_count = vect.fit_transform(X_train)
    # calculate word frequency using tfidf and count vector results
    tfidf_train = tfidf.fit_transform(X_train_word_count)
    # fit with a random forest model
    clf.fit(tfidf_train, y_train)
    
    # Transform test data
    X_test_word_count = vect.transform(X_test)
    tfidf_test = tfidf.transform(X_test_word_count)
    #predict y value with random forest model
    y_pred = clf.predict(tfidf_test)
    
    display_results(y_test, y_pred)

In [None]:
def model_pipeline():
    # build pipeline
    pipeline = Pipeline([("features", FeatureUnion(
        [
            ("nlp_pipeline", Pipeline([
                ('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
            ])),
            ("text_len", TextLengthExtractor()),
        ])),
    ("clf", RandomForestClassifier())])

In [16]:
def main():
    X, y = load_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model = model_pipeline()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    display_results(y_test, y_pred)

In [17]:
main()

NameError: name 'TextLengthExtractor' is not defined