In [2]:
# Toy corpus used as a proof of concept; change to df.description for the real data.
descriptions = [
    "Here are some very simple basic sentences.",
    "They won't be very interesting, I'm afraid.",
    "The point of these examples is to _learn how basic text cleaning works_ on *very simple* data.",
]

In [None]:
# Convert the text columns to lowercase.
# NOTE(review): `df` is not created in the cells visible here; it has to be
# loaded by an earlier cell before this one can run.
for column in ('description', 'title'):
    df[column] = df[column].str.lower()

# Only the last expression of a cell is displayed automatically, so the first
# summary is printed explicitly (its result used to be silently discarded).
print(df['description'].value_counts())
df['title'].value_counts()

In [3]:
# Tokenizing text into bags of words
from nltk.tokenize import word_tokenize

# One token list per description, in the same order as `descriptions`.
tokenized_descs = list(map(word_tokenize, descriptions))
print(tokenized_descs)

[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences', '.'], ['They', 'wo', "n't", 'be', 'very', 'interesting', ',', 'I', "'m", 'afraid', '.'], ['The', 'point', 'of', 'these', 'examples', 'is', 'to', '_learn', 'how', 'basic', 'text', 'cleaning', 'works_', 'on', '*', 'very', 'simple', '*', 'data', '.']]


In [5]:
# Removing punctuation
import re
import string

# Character class matching any single character listed in string.punctuation
# (see https://docs.python.org/3/library/string.html#string.punctuation).
regex = re.compile('[%s]' % re.escape(string.punctuation))

# Strip punctuation characters out of every token, then drop the tokens that
# end up empty (i.e. tokens that consisted of punctuation only).
tokenized_descs_no_punctuation = [
    [
        stripped
        for stripped in (regex.sub(u'', token) for token in review)
        if stripped != u''
    ]
    for review in tokenized_descs
]

print(tokenized_descs_no_punctuation)

[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences'], ['They', 'wo', 'nt', 'be', 'very', 'interesting', 'I', 'm', 'afraid'], ['The', 'point', 'of', 'these', 'examples', 'is', 'to', 'learn', 'how', 'basic', 'text', 'cleaning', 'works', 'on', 'very', 'simple', 'data']]


In [10]:
# Stemming and Lemmatizing
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

porter = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet = WordNetLemmatizer()

# Normaliser applied to every token. Swap in `snowball.stem` or
# `wordnet.lemmatize` to compare the alternatives.
normalize_token = porter.stem

# NOTE(review): `tokenized_descs_no_stopwords` is not defined in the cells
# visible here; the stop-word removal cell must be run before this one.
#
# The loop variables are deliberately not named `descriptions`/`desc`: the
# previous loop rebound `descriptions`, overwriting the list of raw sentences
# defined at the top of the notebook, so re-running the tokenizing cell
# afterwards worked on the wrong data. A comprehension also keeps its loop
# variables out of the notebook namespace.
preprocessed_descs = [
    [normalize_token(token) for token in desc_tokens]
    for desc_tokens in tokenized_descs_no_stopwords
]

print(preprocessed_descs)

[['here', 'simpl', 'basic', 'sentenc'], ['they', 'wo', 'nt', 'interest', 'I', 'afraid'], ['the', 'point', 'exampl', 'learn', 'basic', 'text', 'clean', 'work', 'simpl', 'data']]


In [None]:
# Easier clean-text function
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Raw string: `\[`, `\]` and `\|` are not valid Python string escapes, so the
# non-raw literal triggers an invalid-escape warning. The pattern is unchanged.
space_replace = re.compile(r'[/(){}\[\]\|@,;]')
symbols = re.compile('[^0-9a-z #+_]')

# Kept under its own name. Assigning the set back to `stopwords` replaced the
# object that provides `.words()`, so running this cell a second time failed
# with AttributeError.
stop_words = set(stopwords.words('english'))

porter = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet = WordNetLemmatizer()


def clean_text(text):
    """Normalise one free-text value into a cleaned, space-separated string.

    Steps, in order:
      1. keep only the text content of any HTML markup,
      2. lowercase,
      3. replace every character matched by `space_replace` with a space,
      4. delete every character matched by `symbols`
         (anything other than 0-9, a-z, space, '#', '+', '_'),
      5. split on whitespace and drop words found in `stop_words`,
      6. stem each remaining word with the Porter stemmer.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example a missing value in a
        DataFrame column) yields an empty string instead of raising.

    Returns
    -------
    str
        The stemmed, stop-word-free words joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    text = BeautifulSoup(text, "lxml").text  # strip HTML markup
    text = text.lower()
    text = space_replace.sub(' ', text)
    text = symbols.sub('', text)

    # Whitespace splitting is sufficient because punctuation has already been
    # removed, and it leaves words containing '#', '+' or '_' (which `symbols`
    # keeps) in one piece.
    # Stop words are removed before stemming so that the unstemmed forms are
    # the ones compared against `stop_words`.
    kept_words = (word for word in text.split() if word not in stop_words)

    # Stem word by word: the stemmer works on single words, and `str` has no
    # `.append`, so the previous whole-string call could never succeed.
    # `snowball.stem` can be substituted here to try a different stemmer.
    return ' '.join(porter.stem(word) for word in kept_words)

In [None]:
# Run both text columns through clean_text, overwriting the raw values.
for text_column in ('description', 'title'):
    df[text_column] = df[text_column].apply(clean_text)

In [None]:
# Index the frame by 'name'.
# Guarded so the cell can be re-run: once 'name' has become the index it is no
# longer a column, and an unguarded set_index('name') raises KeyError.
if 'name' in df_description.columns:
    df_description = df_description.set_index('name')

# min_df=1 keeps every term that occurs in at least one document, which is
# what the former integer min_df=0 amounted to. Integer values below 1 are
# rejected by scikit-learn releases that validate constructor parameters.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df_description['sentence'])

# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel is the cosine similarity between documents.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# Position -> name lookup: the labels of df_description in row order.
indices = pd.Series(df_description.index)


def recommendations(name, cosine_similarities = cosine_similarities, top_n = 5):
    """Return the names of the entries most similar to `name`.

    Parameters
    ----------
    name : object
        A label present in the index of `df_description`. If the label occurs
        more than once, the first occurrence is used.
    cosine_similarities : 2-D array-like, optional
        Pairwise similarity matrix whose rows/columns follow the row order of
        `df_description`. The default is bound when this cell is executed, so
        re-run the cell after recomputing the matrix.
    top_n : int, optional
        Number of recommendations to return (default 5).

    Returns
    -------
    list
        Up to `top_n` index labels, ordered from most to least similar. The
        queried row itself is never included.

    Raises
    ------
    IndexError
        If `name` is not found in the index of `df_description`.
    """
    # Position of the first row whose label matches `name`.
    matches = indices[indices == name]
    if matches.empty:
        raise IndexError("%r was not found in the index of df_description" % (name,))
    idx = matches.index[0]

    # Similarity of the queried row to every other row, highest first.
    # The queried row is removed explicitly rather than by skipping the first
    # sorted entry: with tied scores (duplicate texts, or a row whose vector
    # is all zeros) the row is not guaranteed to sort first. A stable sort
    # keeps tied rows in their original order, making the result deterministic.
    score_series = (
        pd.Series(cosine_similarities[idx])
        .drop(idx)
        .sort_values(ascending = False, kind = 'mergesort')
    )
    top_indexes = list(score_series.iloc[:top_n].index)

    # Materialise the labels once instead of once per recommendation.
    all_names = list(df_description.index)
    return [all_names[i] for i in top_indexes]