In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC


In [2]:
data = pd.read_csv("data.csv")
data.head()
data.fillna(data['Category '].mode)

data.shape

(759, 2)

In [3]:
X_train = data["Full SKU Name"]
y_train = data["Category "]

In [4]:
y_train.isnull().sum()

0

In [5]:
y_train.unique()

array(['Hair Care', 'Skin Care', 'Men Expert', 'Eye Makeup',
       'Face Makeup', 'Lip Makeup'], dtype=object)

In [6]:
import re
import string
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

tokenizer = RegexpTokenizer(r'\w+')
en_stopwords = set(stopwords.words('english'))

def getCleanText(text):
    text = re.sub("^\d+\s|\s\d+\s|\s\d+$","", text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = str(text).lower()
    tokens = tokenizer.tokenize(text)
    new_tokens = [token for token in tokens if token not in en_stopwords]
    clean_text = " ".join(new_tokens)
    return clean_text

In [7]:
x_clean = [getCleanText(i) for i in X_train]

VECTORIZATION

In [8]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2), 
                        stop_words='english')


In [9]:
x_vec = tfidf.fit_transform(x_clean).toarray()
x_vec.shape

(759, 413)

MODEL

In [10]:
model = LinearSVC().fit(x_vec, y_train)

TESTING

In [11]:
#First Clean the text data to be tested

test = pd.read_csv("sales_2021-08-01_2021-09-24.csv")
test.head()

Unnamed: 0,product_title,product_vendor,product_type,variant_sku,net_quantity,gross_sales,discounts,returns,net_sales,taxes,total_sales
0,L'Oreal Paris- Rouge Signature 116 Explore,L'Oreal CPD,L'Oreal Paris Makeup,6.9024E+12,11,16246.5,-8921.5,0.0,7325.0,0,7325.0
1,L'Oreal Paris- Haircare Gala - Dream Long Bund...,L'Oreal CPD,LOSC,bd8833,-1,0.0,0.0,-999.0,-999.0,0,-999.0
2,L'Oreal Paris- Les Chocolats Lipstick- 868 Cac...,L'Oreal CPD,L'Oreal Paris Makeup,3.60052E+12,1,1250.0,0.0,0.0,1250.0,0,1250.0
3,NYX Professional Makeup- Total Control Drop Fo...,L'Oreal CPD,NYX Professional Makeup,8.00897E+11,4,7286.4,-2115.72,0.0,5170.68,0,5170.68
4,NYX Professional Makeup Liquid Suede Cream Lip...,L'Oreal CPD,NYX Professional Makeup,8.00898E+11,25,17208.15,-839.0,0.0,16369.15,0,16369.15


In [12]:
test_title=test['product_title']


In [13]:
# text = ["Maybelline New York- Clearsmooth All In One Two Way Cake 02 Nude Beige - Refill"]
test_clean = [getCleanText(i) for i in test_title]
test_vec = tfidf.transform(test_clean).toarray()

In [14]:
test['pred'] = model.predict(test_vec)

In [15]:
test

Unnamed: 0,product_title,product_vendor,product_type,variant_sku,net_quantity,gross_sales,discounts,returns,net_sales,taxes,total_sales,pred
0,L'Oreal Paris- Rouge Signature 116 Explore,L'Oreal CPD,L'Oreal Paris Makeup,6.9024E+12,11,16246.50,-8921.50,0.0,7325.00,0,7325.00,Face Makeup
1,L'Oreal Paris- Haircare Gala - Dream Long Bund...,L'Oreal CPD,LOSC,bd8833,-1,0.00,0.00,-999.0,-999.00,0,-999.00,Hair Care
2,L'Oreal Paris- Les Chocolats Lipstick- 868 Cac...,L'Oreal CPD,L'Oreal Paris Makeup,3.60052E+12,1,1250.00,0.00,0.0,1250.00,0,1250.00,Lip Makeup
3,NYX Professional Makeup- Total Control Drop Fo...,L'Oreal CPD,NYX Professional Makeup,8.00897E+11,4,7286.40,-2115.72,0.0,5170.68,0,5170.68,Face Makeup
4,NYX Professional Makeup Liquid Suede Cream Lip...,L'Oreal CPD,NYX Professional Makeup,8.00898E+11,25,17208.15,-839.00,0.0,16369.15,0,16369.15,Face Makeup
...,...,...,...,...,...,...,...,...,...,...,...,...
974,Maybelline New York- Eye Studio Gel Liner 24H ...,LOreal CPD,Maybelline New York,6.9024E+12,1,1329.30,0.00,0.0,1329.30,0,1329.30,Eye Makeup
975,LOreal Paris- La Vie En Glow Highlighting Powd...,LOreal CPD,LOreal Paris Makeup,3.60052E+12,1,2279.00,-209.99,0.0,2069.01,0,2069.01,Face Makeup
976,Maybelline New York Master Sculpt 02 Medium/Dark,LOreal CPD,Maybelline New York,3.60053E+12,1,1104.35,0.00,0.0,1104.35,0,1104.35,Face Makeup
977,NYX Professional Makeup- Soft Matte Lip Cream ...,LOreal CPD,NYX Professional Makeup,8.00898E+11,1,734.00,-102.76,0.0,631.24,0,631.24,Lip Makeup
