In [1]:
import pandas as pd
import csv
import re
from html.parser import HTMLParser
import unicodedata
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# descripton cleaning helper functions

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def remove_email(desc):
    match = re.search(r'[\w\.-]+@[\w\.-]+', desc)
    
    if match is not None:
        email = match.group(0)
        # print(f'EMAIL={email}')
        return desc.replace(email,'')
    else:
        # print('NO EMAIL IN DESCRIPTION')
        return desc

def clean_description(html):
    #desc = unidecode.unidecode(html)
    desc = unicodedata.normalize('NFKD', html)
    
    # remove email
    desc = remove_email(desc)
    
    # replace linebreaks and paragraph tags with space
    desc = re.sub('<\\s*br\\s*>', ' ', desc)
    desc = re.sub('<\\s*br\\s*/\\s*>', ' ', desc)
    desc = re.sub('<\\s*p\\s*>', ' ', desc)
    desc = re.sub('<\\s*p\\s*/\\s*>', ' ', desc)
    
    # parse and remove other HTML tags / symbol entities
    desc = strip_tags(desc)
    
    # replace all non-alphanumeric characters
    # may or may not bite us in the ass; see how it goes
    desc = re.sub('[^a-zA-Z0-9]', ' ', desc)
    
    # lowercase result string
    desc = desc.lower()
    
    # return cleaned string with extra spaces removed
    return ' '.join(desc.split())

In [3]:
# increases text displayed in pandas table for Jupyter Notebook
pd.options.display.max_colwidth = 10000

# read csv into pandas DataFrame (sample.csv = 100000)
df = pd.read_csv('sample.csv', keep_default_na=False)

# remove listings where description is blank (100000 -> 99680)
df = df[df['description'] != '']

# clean descriptions (takes some time) and remove blanks after cleaning (99680 -> 99641)
df['cleaned'] = df['description'].apply(clean_description)
df = df[df['cleaned'] != '']

In [4]:
# 70:30 train-test split; 99641 * 0.7 ~ 69750
df_train = df[0:69750]
df_test = df[69750:]

# training description and label
df_train_desc = df_train['cleaned']
df_train_label = df_train['onet']

# testing description and label
df_test_desc = df_test['cleaned']
df_test_label = df_test['onet']

In [5]:
# initialize count vectorizer
count_vectorizer = CountVectorizer(strip_accents='unicode', stop_words='english', ngram_range=(1,1))
df_train_cv = count_vectorizer.fit_transform(df_train_desc)

# initialize tf transformer
tf_transformer = TfidfTransformer(use_idf=False)
df_train_tf = tf_transformer.fit_transform(df_train_cv)

# initialize tf-idf transformer
tfidf_transformer = TfidfTransformer(use_idf=True)
df_train_tfidf = tfidf_transformer.fit_transform(df_train_cv)

In [6]:
# test naive bayes classification
clf = MultinomialNB().fit(df_train_tfidf, df_train_label)

In [7]:
#df_test_tf = get_test_tf_transformer(df_train_cv, df_train_tfidf, df_test['cleaned'])
df_test_cv = count_vectorizer.transform(df_test_desc)
df_test_tf = tf_transformer.fit_transform(df_test_cv)
df_test_tfidf = tfidf_transformer.fit_transform(df_test_cv)

In [8]:
predicted = clf.predict(df_test_tfidf)

In [9]:
predicted

array(['15-1031.00', '41-2031.00', '15-1031.00', ..., '11-2021.00',
       '41-2031.00', '53-3032.00'], dtype='<U10')

In [10]:
score = clf.score(df_test_tfidf, df_test_label)
score

0.33916563514101233