In [19]:
import pandas as pd
import csv
import re
from html.parser import HTMLParser
import unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# description cleaning helper functions

class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text content.

    Feed HTML in with ``feed()`` (call ``close()`` afterwards to flush any
    buffered text) and read the accumulated plain text with ``get_data()``.
    Character references such as ``&amp;`` are decoded before being collected
    because the parser runs with ``convert_charrefs=True``.
    """

    def __init__(self):
        # Go through the documented base initializer rather than setting parser
        # internals by hand, so every attribute the base parser expects exists.
        super().__init__(convert_charrefs=True)
        # Legacy attribute from older parser versions; kept so anything that
        # still reads ``.strict`` keeps working.
        self.strict = False
        # Text fragments in document order.
        self.fed = []

    def handle_data(self, d):
        """Record one run of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)

def strip_tags(html):
    """Return the text content of ``html`` with all markup removed.

    Args:
        html: String that may contain HTML tags and character references.

    Returns:
        The concatenated text found between the tags.
    """
    s = MLStripper()
    s.feed(html)
    # feed() can hold back trailing text (for example a tail containing an
    # unterminated '&', as in "AT&T") while waiting for more input; close()
    # forces that buffered text through so it is not silently lost.
    s.close()
    return s.get_data()

def remove_email(desc):
    """Return ``desc`` with every e-mail-like token removed.

    A token counts as an e-mail address when it has the shape
    ``<word chars, dots, dashes>@<word chars, dots, dashes>``. All matches are
    removed, not only the first one; text without a match is returned
    unchanged.

    Args:
        desc: Text to scrub.

    Returns:
        The text with each matched address replaced by the empty string.
    """
    return re.sub(r'[\w\.-]+@[\w\.-]+', '', desc)

def clean_description(html):
    """Turn an HTML description into a normalised, lowercase plain-text string.

    Steps: Unicode-normalise, drop e-mail addresses, turn line-break and
    paragraph tags into spaces, strip the remaining markup, fold accented
    letters to their base letters, replace every non-alphanumeric character
    with a space, lowercase, and collapse runs of whitespace.

    Args:
        html: Raw description text, possibly containing HTML.

    Returns:
        A string of lowercase ASCII letters/digits separated by single spaces.
    """
    desc = unicodedata.normalize('NFKD', html)

    # remove email
    desc = remove_email(desc)

    # replace linebreak and paragraph tags with a space so the words on either
    # side do not fuse together; covers opening, closing and self-closing
    # forms, any letter case, and tags carrying attributes
    desc = re.sub(r'<\s*/?\s*(?:br|p)\b[^>]*>', ' ', desc, flags=re.IGNORECASE)

    # parse and remove other HTML tags / symbol entities
    desc = strip_tags(desc)

    # Entity decoding above can introduce precomposed accented characters, so
    # normalise again, then drop the combining marks. Without this the
    # non-alphanumeric replacement below would split accented words in two
    # (e.g. "résumé" -> "re sume").
    desc = unicodedata.normalize('NFKD', desc)
    desc = ''.join(ch for ch in desc if not unicodedata.combining(ch))

    # replace all non-alphanumeric characters with spaces
    # (aggressive: punctuation inside tokens is lost as well)
    desc = re.sub('[^a-zA-Z0-9]', ' ', desc)

    # lowercase result string
    desc = desc.lower()

    # return cleaned string with extra spaces removed
    return ' '.join(desc.split())
    

In [3]:
# notebook display settings, applied before the cells below run

# let pandas show long text columns in full instead of truncating them
pd.set_option('display.max_colwidth', 10000)

In [4]:
# load the sample CSV into a DataFrame; keep_default_na=False stops pandas
# from converting its default NA markers (such as empty fields) to NaN
orig_data = pd.read_csv(
    'sample.csv',
    keep_default_na=False,
)

In [5]:
# run every description through clean_description and store the result back
# in the same column (slow on large inputs)
description_column = orig_data['description']
orig_data['description'] = description_column.apply(clean_description)

In [41]:
# CountVectorizer expects a list/iterable of documents, so wrap the single
# description string in a one-element list
# NOTE(review): `input` shadows the Python builtin; the name is kept because
# other cells may read it
input = [orig_data.loc[0,'description']]
vectorizer = CountVectorizer(strip_accents='unicode', stop_words='english', ngram_range=(1,1))
# fit_transform() builds the vocabulary itself, so no private validation call
# is needed afterwards
foo = vectorizer.fit_transform(input)
# gets unique words; get_feature_names() no longer exists in newer
# scikit-learn, so use get_feature_names_out() when available and convert to a
# plain list so the printed output keeps its list form
if hasattr(vectorizer, 'get_feature_names_out'):
    bar = vectorizer.get_feature_names_out().tolist()
else:
    bar = vectorizer.get_feature_names()
# dense count matrix: one row per document, one column per word
# (original note: 1 row with 305 numbers for this sample)
baz = foo.toarray()
# only one document was vectorized, so take its row of counts
baz = baz[0]
print(bar)
print(len(bar))
print(baz)
print(len(baz))

['10', '50lbs', '74011', '74012', '74063', '74104', '74105', '74107', '74110', '74112', '74114', '74115', '74127', '74129', '74133', '74135', '74136', '74137', '74145', '74146', '8801', 'abilities', 'ability', 'accommodation', 'accounts', 'achievement', 'additional', 'address', 'adhere', 'aiding', 'analytical', 'application', 'applications', 'areas', 'assigned', 'attention', 'automobile', 'background', 'based', 'belief', 'believe', 'bend', 'better', 'box', 'brand', 'businesses', 'career', 'challenges', 'check', 'cleaned', 'codes', 'coinstar', 'collects', 'com', 'communicates', 'communication', 'communications', 'commute', 'company', 'complete', 'computer', 'conflict', 'contact', 'contribute', 'cooperates', 'counts', 'create', 'creative', 'customer', 'customers', 'cutters', 'daily', 'degree', 'demonstrate', 'demonstrates', 'developing', 'diploma', 'diplomatically', 'directed', 'direction', 'disabilities', 'distributed', 'diverse', 'diversity', 'driving', 'drug', 'dynamic', 'ecoatm', 'ed