In [1]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import pickle

In [2]:
# Fetch the NLTK data packages used by the preprocessing cells further down.
# stop-word lists, read via stopwords.words('english')
nltk.download('stopwords')
# tokenizer models -- presumably what word_tokenize relies on; verify for the installed NLTK version
nltk.download('punkt')
# WordNet corpus, used by WordNetLemmatizer and the wordnet POS constants
nltk.download('wordnet')
# POS tagger model -- presumably the one nltk.pos_tag loads; verify for the installed NLTK version
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/conawws1/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/conawws1/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/conawws1/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/conawws1/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
# Directory holding the merged data. The trailing slash matters: the save
# cells later in the notebook append file names directly to this string.
input_dir = "/data/p_dsi/capstone_projects/shea/4_merged/"

# Load the merged frame and print its schema / memory footprint.
merged_path = f"{input_dir}merged_unstructured.pkl"
df = pd.read_pickle(merged_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13082606 entries, 0 to 13082605
Data columns (total 4 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   vin              object
 1   status_date      object
 2   seller_comments  object
 3   listed_options   object
dtypes: object(4)
memory usage: 499.1+ MB


In [6]:
# Replace every missing value with an empty string so the text steps below
# always receive something they can process.
df = df.fillna("")

In [7]:
# Collapse each row's options into one space-separated string.
# NOTE(review): assumes each non-missing value is a sequence of option strings -- confirm.
# Values that are already strings (the "" left by fillna, or the results of a
# previous run of this cell) are passed through unchanged: " ".join() on a
# string would insert a space between every character, so without this guard
# re-running the cell silently corrupts the column.
df["listed_options"] = df["listed_options"].apply(
    lambda options: options if isinstance(options, str) else " ".join(options)
)

In [10]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Map a POS tag string to the corresponding WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to the WordNet adjective,
    verb, noun and adverb constants respectively. Any other tag falls back
    to the noun constant.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN


In [11]:
# Matches any character that is neither an ASCII letter nor whitespace.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Cache of the heavyweight NLTK objects. It is filled on first use so they are
# built once per process instead of once per row.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return ``(stop_words, lemmatizer, tagger)``, building them on first use."""
    if not _PREPROCESS_RESOURCES:
        # The dict literal is fully evaluated before update() runs, so a
        # failure (e.g. missing NLTK data) never leaves a half-filled cache.
        _PREPROCESS_RESOURCES.update({
            'stop_words': frozenset(stopwords.words('english')),
            'lemmatizer': WordNetLemmatizer(),
            # In NLTK 3.x, nltk.pos_tag() constructs its default English
            # tagger on each call; holding one instance and calling .tag()
            # yields the same tags without repeating that work per row.
            'tagger': nltk.tag.PerceptronTagger(),
        })
    return (
        _PREPROCESS_RESOURCES['stop_words'],
        _PREPROCESS_RESOURCES['lemmatizer'],
        _PREPROCESS_RESOURCES['tagger'],
    )


def preprocess_text(text):
    """Normalise free text into a space-separated string of lemmas.

    Steps: lowercase, replace every non-letter character with a space,
    tokenize, drop English stop words, POS-tag the remaining tokens,
    lemmatize each token with the WordNet POS derived from its tag, and
    re-join the lemmas with single spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text. This is ``''`` when the input holds nothing but
        whitespace, non-letter characters and/or stop words.

    Raises
    ------
    AttributeError
        If ``text`` is not a string (it has no ``.lower()``).
    """
    # lowercase, then blank out special characters, digits and punctuation
    text = _NON_LETTER_RE.sub(' ', text.lower())

    # fast path: nothing left to tokenize, so skip the NLTK pipeline entirely
    if not text.strip():
        return ''

    stop_words, lemmatizer, tagger = _get_preprocess_resources()

    # tokenize and remove stopwords
    # NOTE(review): stop words are dropped *before* POS tagging, so the tagger
    # never sees them as context. The order is kept to preserve existing output.
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    if not tokens:
        return ''

    # lemmatize, using each token's POS tag to choose the WordNet POS
    tagged = tagger.tag(tokens)
    lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged]

    # rejoin string
    return ' '.join(lemmas)


## TODO: the preprocessing cells below run single-process with no progress output; add multiprocessing and progress reporting

In [None]:
# Clean every entry of the 'seller_comments' column, one value at a time.
seller_comments = df['seller_comments'].map(preprocess_text)


In [None]:
# Persist the processed seller comments as a pickle under input_dir.
filename = f"{input_dir}seller_comments_processed.pkl"
with open(filename, mode="wb") as out_file:
    pickle.dump(seller_comments, out_file)
    

In [None]:
# Clean every entry of the 'listed_options' column, one value at a time.
listed_options = df['listed_options'].map(preprocess_text)


In [None]:
# Persist the processed listed options as a pickle under input_dir.
filename = f"{input_dir}listed_options_processed.pkl"
with open(filename, mode="wb") as out_file:
    pickle.dump(listed_options, out_file)
    