In [11]:
#----------------------------------------------
# Load dependencies
#----------------------------------------------
import pandas as pd
import base64
import re
import numpy as np
import string
import unicodedata
import nltk
import os
import streamlit as st
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

nltk.download('stopwords')


#----------------------------------------------
# Define variables
#----------------------------------------------

# English stopwords
stopwords_en = nltk.corpus.stopwords.words('english')

# French stopwords
stopwords_fr = nltk.corpus.stopwords.words('french')

#----------------------------------------------
# Define functions
#----------------------------------------------

# Helper function
def clean_cols(cols):
    cols_clean1 = cols.str.replace(' ', '_').str.replace('.', '')
    cols_clean2 = cols_clean1.str.replace('.', '') 
    cols_clean3 = [col.lower() for col in cols_clean2]
    return cols_clean3

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\domen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Read in teams & accounts CSVs
df = pd.read_csv('C:/Users/domen/github/pokemon_classification/assets/style_names.csv')
df = df[["MASTER_STYLE", "STYLE_NAME", "UNIQUE_SKU"]]
df.head(4)

Unnamed: 0,MASTER_STYLE,STYLE_NAME,UNIQUE_SKU
0,ENERGY BRA,ENERGY BRA,2084.0
1,DEFINE JACKET,DEFINE JACKET,1469.0
2,ALIGN PANT II,ALIGN PANT II - ES EC - DNU AFTER SP20,333.0
3,FLY AWAY TAMER HEADBAND,FLY AWAY TAMER HEADBAND II,123.0


In [15]:
# Clean data

# clean column names
df.columns = clean_cols(df.columns)

# convert columns with string values to lower
df['master_style'] = df.master_style.str.lower()
df['style_name'] = df.style_name.str.lower()
df.head(4)

  cols_clean1 = cols.str.replace(' ', '_').str.replace('.', '')
  cols_clean2 = cols_clean1.str.replace('.', '')


Unnamed: 0,master_style,style_name,unique_sku
0,energy bra,energy bra,2084.0
1,define jacket,define jacket,1469.0
2,align pant ii,align pant ii - es ec - dnu after sp20,333.0
3,fly away tamer headband,fly away tamer headband ii,123.0


In [32]:
# Text analytics functions

def round1_text_clean(text):
    text = ' ' + text # added space because there was some weirdness for first word (strip later)
    text = text.lower() # convert all text to lowercase
    text = re.sub(r'(\s)@\w+', '', text) # remove whole word if starts with @
    text = re.sub(r'(\s)\w*\d\w*\w+', '', text) # remove whole word if starts with number
    text = re.sub(r'https\:\/\/t\.co\/*\w*', '', text) # remove https links
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # removes punctuation
    text = re.sub('\[.*?\]', '', text) # removes text in square brackets
    text = text.replace(" ins ", '') # replace apostrophes with empty string
    #text = re.sub('\w*\d\w*', '', text) # remove whole word if starts with number
    #text = re.sub(r'(\s)#\w+', '', text) # remove whole word if starts with #
    text = text.strip() # strip text
    return text

# Function 7b
#-------------
text_clean_round1 = lambda x: round1_text_clean(x)

# Function 8
#-------------
def text_clean_round2(text):
    """
    A simple function to clean up the data. All the words that
    are not designated as a stop word is then lemmatized after
    encoding and basic regex parsing are performed.
    """
    nltk.download('wordnet')
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english')
    text = (unicodedata.normalize('NFKD', text)
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore'))
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

# Function 9
#-------------
def text_clean_round3(text):
    #TODO: add emoticons and emojis to this!
    # Load in stopwords
    stopwords_en = nltk.corpus.stopwords.words('english')
    stopwords_fr = nltk.corpus.stopwords.words('french')
    stopwords = stopwords_en + stopwords_fr
    # Create pre-clean character count feature
    text = text.apply(lambda x: " ".join(x for x in x.split() if x not in stopwords))
    return text

# Function 10a
#-----------------
def tweets_ngrams(n, top_n, df):
    """
    Generates series of top ngrams
    n: number of words in the ngram
    top_n: number of ngrams with highest frequencies
    """
    text = df.text
    words = text_clean_round2(''.join(str(text.tolist())))
    result = (pd.Series(data = nltk.ngrams(words, n), name = 'frequency').value_counts())[:top_n]
    return result.to_frame()

# Function 10b
#-----------------
def all_ngrams(top_n, df):
    text = df.text
    words = text_clean_round2(''.join(str(text.tolist())))
    unigram = ((pd.Series(data = nltk.ngrams(words, 1), name = 'freq').value_counts())[:top_n]).to_frame()
    unigram['ngram'] = 'unigram'
    bigram = ((pd.Series(data = nltk.ngrams(words, 2), name = 'freq').value_counts())[:top_n]).to_frame()
    bigram['ngram'] = 'bigram'
    trigram = ((pd.Series(data = nltk.ngrams(words, 3), name = 'freq').value_counts())[:top_n]).to_frame()
    trigram['ngram'] = 'trigram'
    result = unigram.append([bigram, trigram])
    result['ngram_nm'] = result.index
    return result

In [38]:
# create a new column from master_style called text
df['master_style'] = df.master_style.astype(str)
df['text'] = df.master_style

# prepare text for text cleaning (remove punctuation, numbers, etc.)
df['text'] = df.text.apply(text_clean_round1)
df['text'] = text_clean_round3(df.text)

df[df.master_style.str.contains('ins')].head(50)

Unnamed: 0,master_style,style_name,unique_sku,text,clean_text
7,wunder train tight 25 ins,wunder train hr tight 25in,594.0,wunder train tight ins,wunder train tight ins
70,wunder train short 6 ins,wunder train hr short 6in,380.0,wunder train short 6 ins,wunder train short 6 ins
149,wunder train crop 21 ins,wunder train hr crop 21in,268.0,wunder train crop ins,wunder train crop ins
180,wunder train short 8 ins,wunder train hr short 8in,264.0,wunder train short 8 ins,wunder train short 8 ins
211,wunder train crop 23 ins,wunder train hr crop 23in,317.0,wunder train crop ins,wunder train crop ins
233,wunder train tight 28 ins,wunder train hr tight 28in,193.0,wunder train tight ins,wunder train tight ins
714,inspire tight ii,inspire tight ii,396.0,inspire tight ii,inspire tight ii
903,instill hr tight 25in,instill hr tight 25in,40.0,instill hr tight,instill hr tight
946,wunder under crop hr 23 inseam,wunder under crop hr 23 inseam fe fullux,69.0,wunder crop hr inseam,wunder under crop hr inseam
997,wunder train short 4 ins,wunder train hr short 4in,60.0,wunder train short 4 ins,wunder train short 4 ins


In [36]:
all_ngrams(1000, df).head(50)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\domen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,freq,ngram,ngram_nm
"(short,)",1712,unigram,"(short,)"
"(tank,)",1208,unigram,"(tank,)"
"(pant,)",1178,unigram,"(pant,)"
"(tight,)",983,unigram,"(tight,)"
"(crop,)",975,unigram,"(crop,)"
"(tech,)",972,unigram,"(tech,)"
"(jacket,)",887,unigram,"(jacket,)"
"(run,)",875,unigram,"(run,)"
"(l,)",837,unigram,"(l,)"
"(s,)",709,unigram,"(s,)"
