## Cleaning Data

In [31]:
import pandas as pd
import numpy as np
# Data-source configuration: edit these two values to point the notebook at a
# different extract or to load a different number of rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere after DATA_PATH is changed.
DATA_PATH = r'C:\Users\Haley\OneDrive\Desktop\Purchase_Order_Data.csv'  # change file location and file name
N_ROWS = 10000  # maximum number of data rows to read from the CSV

df = pd.read_csv(DATA_PATH, nrows=N_ROWS)
# Optional: keep only the columns used by the text-cleaning steps.
#df = df[['Item Name', 'Item Description', 'Class']]

In [32]:
# Drop rows with null values in the 'Class' column (change later to
# product_code). Reassigning instead of using inplace=True keeps the cell
# safe to re-run and avoids mutating a frame an earlier cell displayed.
df = df.dropna(subset=['Class']).copy()

# Fill missing names/descriptions with '' BEFORE casting to str. Casting a
# NaN to str yields the literal text 'nan', and NaN + ' ' + text is NaN, so
# without this step rows with a missing field would end up with the text
# 'nan' in title_desc and a bogus 'nan' token after cleaning.
text_cols = ['Item Name', 'Item Description']
df[text_cols] = df[text_cols].fillna('').astype(str)

# Concatenate title and product description; strip the stray separator that
# is left over when one of the two sides is empty.
df['title_desc'] = (df['Item Name'] + ' ' + df['Item Description']).str.strip()
df.head()

Unnamed: 0,Creation Date,Purchase Date,Fiscal Year,LPA Number,Purchase Order Number,Requisition Number,Acquisition Type,Sub-Acquisition Type,Acquisition Method,Sub-Acquisition Method,...,Normalized UNSPSC,Commodity Title,Class Title,Family,Family Title,Segment,Segment Title,Location,REMOVE AMERISOURCE,title_desc
17,9/5/2012,8/31/2012,2012-2013,,4500149558,,NON-IT Goods,,Formal Competitive,,...,50405625.0,Jalapeno peppers,Peppers,50400000.0,Fresh vegetables,50000000.0,Food Beverage and Tobacco Products,"93706\n(36.675079, -119.865393)",,JALAPENO JALAPENO
18,10/18/2012,10/18/2012,2012-2013,,4500156192,,NON-IT Goods,,Informal Competitive,,...,50301541.0,Ida red apples,Apples,50300000.0,Fresh fruits,50000000.0,Food Beverage and Tobacco Products,"91360\n(34.210392, -118.874313)",,"produce red apples, banana,"
19,10/17/2012,10/17/2012,2012-2013,,4500156124,,NON-IT Goods,,Fair and Reasonable,,...,55101506.0,Magazines,Printed publications,55100000.0,Printed media,55000000.0,Published Products,"95827\n(38.563097, -121.328511)",,magazine magazine
20,10/30/2012,10/24/2012,2012-2013,,S2556056,,NON-IT Goods,,Informal Competitive,,...,10121505.0,Hay,Livestock feed,10120000.0,Animal feed,10000000.0,Live Plant and Animal Material and Accessories...,,,hay Bale's of rice straw
21,11/14/2012,11/14/2012,2012-2013,,4500159228,,NON-IT Goods,,Informal Competitive,,...,50201706.0,Coffee,Coffee and tea,50200000.0,Beverages,50000000.0,Food Beverage and Tobacco Products,"95696\n(38.43, -122.02)",,COFFEE INSTANT COFFEE AND BULK


In [33]:
# import libraries
import nltk
import re
import string as st
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer # or LancasterStemmer, RegexpStemmer, SnowballStemmer
# English stop-word list read from NLTK's stopwords corpus.
# NOTE(review): presumably requires the corpus to be installed locally
# (e.g. via nltk.download('stopwords')) - confirm on a fresh environment.
default_stopwords = stopwords.words('english') # or any other list of your choice

# create clean_text function:
def clean_text(text, stop_words=None):
    """Turn a free-text string into a list of normalised tokens.

    The pipeline is: remove punctuation, lowercase, split on whitespace,
    drop stop words, apply the Porter stemmer, then apply the WordNet
    lemmatizer to each stem.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : iterable of str, optional
        Words to discard after lowercasing. Defaults to the module-level
        ``default_stopwords`` list.

    Returns
    -------
    list of str
        Cleaned tokens in their original order. Empty or whitespace-only
        input returns an empty list.
    """
    if stop_words is None:
        stop_words = default_stopwords
    # Build the lookup once per call: a set gives constant-time membership
    # tests, instead of re-reading the stop-word corpus for every token.
    stop_set = frozenset(stop_words)

    # Delete every character listed in string.punctuation.
    without_punct = text.translate(str.maketrans('', '', st.punctuation))

    # str.split() with no argument splits on runs of whitespace and never
    # yields empty strings, so leading/trailing spaces do not create ''
    # tokens (re.split on '\s+' did).
    tokens = without_punct.lower().split()

    kept = [token for token in tokens if token not in stop_set]

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    # Stem first, then lemmatize the stem - same order as before.
    return [lemmatizer.lemmatize(stemmer.stem(token)) for token in kept]

In [34]:
# Run the cleaning function over every combined title/description string and
# store the result in a new 'clean_text' column.
df['clean_text'] = df['title_desc'].apply(clean_text)
df.head()

Unnamed: 0,Creation Date,Purchase Date,Fiscal Year,LPA Number,Purchase Order Number,Requisition Number,Acquisition Type,Sub-Acquisition Type,Acquisition Method,Sub-Acquisition Method,...,Commodity Title,Class Title,Family,Family Title,Segment,Segment Title,Location,REMOVE AMERISOURCE,title_desc,clean_text
17,9/5/2012,8/31/2012,2012-2013,,4500149558,,NON-IT Goods,,Formal Competitive,,...,Jalapeno peppers,Peppers,50400000.0,Fresh vegetables,50000000.0,Food Beverage and Tobacco Products,"93706\n(36.675079, -119.865393)",,JALAPENO JALAPENO,"[jalapeno, jalapeno]"
18,10/18/2012,10/18/2012,2012-2013,,4500156192,,NON-IT Goods,,Informal Competitive,,...,Ida red apples,Apples,50300000.0,Fresh fruits,50000000.0,Food Beverage and Tobacco Products,"91360\n(34.210392, -118.874313)",,"produce red apples, banana,","[produc, red, appl, banana]"
19,10/17/2012,10/17/2012,2012-2013,,4500156124,,NON-IT Goods,,Fair and Reasonable,,...,Magazines,Printed publications,55100000.0,Printed media,55000000.0,Published Products,"95827\n(38.563097, -121.328511)",,magazine magazine,"[magazin, magazin]"
20,10/30/2012,10/24/2012,2012-2013,,S2556056,,NON-IT Goods,,Informal Competitive,,...,Hay,Livestock feed,10120000.0,Animal feed,10000000.0,Live Plant and Animal Material and Accessories...,,,hay Bale's of rice straw,"[hay, bale, rice, straw]"
21,11/14/2012,11/14/2012,2012-2013,,4500159228,,NON-IT Goods,,Informal Competitive,,...,Coffee,Coffee and tea,50200000.0,Beverages,50000000.0,Food Beverage and Tobacco Products,"95696\n(38.43, -122.02)",,COFFEE INSTANT COFFEE AND BULK,"[coffe, instant, coffe, bulk]"


In [30]:
def return_sentences(tokens):
    """Join an iterable of string tokens into one space-separated string."""
    separator = " "
    return separator.join(tokens)

In [25]:
# Rebuild a single space-separated string from each row's token list.
df['clean_sentences'] = df['clean_text'].apply(return_sentences)
df.head()

Unnamed: 0,Item Name,Item Description,title_desc,clean_text,clean_sentences
17,JALAPENO,JALAPENO,jalapeno jalapeno,"[jalapeno, jalapeno]",jalapeno jalapeno
18,produce,"red apples, banana,","produce red apples, banana,","[produc, red, appl, banana]",produc red appl banana
19,magazine,magazine,magazine magazine,"[magazin, magazin]",magazin magazin
20,hay,Bale's of rice straw,hay bale's of rice straw,"[hay, bale, rice, straw]",hay bale rice straw
21,COFFEE,INSTANT COFFEE AND BULK,coffee instant coffee and bulk,"[coffe, instant, coffe, bulk]",coffe instant coffe bulk
