In [None]:
# !python3 -m spacy download xx_ent_wiki_sm

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
real = pd.read_excel('data/data.xlsx')
# Меняем названия на более приемлемые:

target_shops = [
    'АО "ТОРГОВЫЙ ДОМ "ПЕРЕКРЕСТОК"', 'ООО "АГРОТОРГ"', 'ООО "ЛЕНТА"',
   'АО "ДИКСИ ЮГ"', 'ООО "АШАН"', 'АО "ТАНДЕР"', 'ООО "БИЛЛА"', 'ООО "О`КЕЙ"',
    'ООО "АГРОАСПЕКТ"', 'ООО "АТАК"'
]

shops_rename = {
    'АО "ТОРГОВЫЙ ДОМ "ПЕРЕКРЕСТОК"': 'ПЕРЕКРЕСТОК',
    'ООО "АГРОТОРГ"': 'ПЯТЕРОЧКА',
    'ООО "ЛЕНТА"': 'ЛЕНТА',
    'АО "ДИКСИ ЮГ"': 'ДИКСИ',
    'ООО "АШАН"': 'АШАН',
    'АО "ТАНДЕР"': 'МАГНИТ',
    'ООО "БИЛЛА"': 'БИЛЛА',
    'ООО "О`КЕЙ"': 'О`КЕЙ',
    'ООО "АГРОАСПЕКТ"': 'ПЯТЕРОЧКА',
    'ООО "АТАК"': 'АТАК'
}

real = real[real['shop_name'].isin(target_shops)].copy()
real['shop_name'] = real['shop_name'].apply(lambda x: shops_rename[x])
real = real.drop_duplicates(subset=['name'])
real.sample(2)

Unnamed: 0,shop_name,name,quantity,price,sum
38000,ПЯТЕРОЧКА,3679026 Пиво ЖИГУЛЕВ.св.4% ж/б 0.45л,3.0,49.99,149.97
12598,ПЕРЕКРЕСТОК,3*1535 Апельсины 1кг,0.884,69.9,61.79


In [3]:
real.head()

Unnamed: 0,shop_name,name,quantity,price,sum
19,ДИКСИ,МИНТАЙ ФИЛЕ Б/К СВ/МОР П/ПАК 8,1.0,199.9,199.9
20,ДИКСИ,БЗМЖ МОЛОКО СТРАНА ВАСИЛЬКИ У/,1.0,49.99,49.99
21,ДИКСИ,СУШКИ ТАРАЛЛИНИ С ЧЕСНОКОМ 180,1.0,29.99,29.99
22,ДИКСИ,ЯЙЦО КУРИНОЕ СТОЛОВОЕ 1КАТЕГОР,1.0,63.99,63.99
23,ДИКСИ,ПЕЧЕНЬЕ ЮБИЛЕЙНОЕ ВИТАМИНИЗИРО,2.0,29.99,59.98


In [4]:
def find_weight_or_volume(name: str)-> str:
    """Находит вес или объём товара."""

    pattern1 = r'\d+((\.|\,|x|X|х|Х)?)\d*([а-я]|[А-Я])*$' # ищет вес в конце
    pattern2 = r'\d+\w+' #  ищет вес в середине
    pattern3 = r'(\d+ кг)|(\d+ г)'  # находит `1 кг` или `1 г`
    result = re.search(pattern1, name)
    if result:
        return result.group(0)
    result = re.search(pattern2, name)
    if result:
        return result.group(0)
    result = re.search(pattern3, name)
    return result.group(0) if result else ''

In [6]:
from dicts import (
    PRODUCTS,
    BRANDS,
    SLASH_PRODUCTS,
    BRANDS_WITH_NUMBERS
)
from typing import Optional


class NormalizeNames:
    """
    Normalize the description of the product: expand abbreviations,
    delete garbage words and characters for further recognition,
    remove english worlds, etc.
    Steps:
    1. Convert to lowercase;
    2. Delete all words including numbers;
    3. Delete all service characters;
    4. Delete words consisting of 1 or 2 characters;
    5. Find English brands using the dataset `brands_en.csv`;
    6. Delete words from blacklist and words in English;
    7. Replace words using `dicts.PRODUCTS`.
    
    Parameters
    ----------
    data_to_parse : pd.Series
        Text column with a description of the products to normalize.
        
    Attributes
    ----------
    df: pd.DataFrame
        
    """
    def __init__(self, data_to_parse: pd.Series):
        columns = ['name', 'name_norm', 'product_norm', 'brand_norm']
        self.df = pd.DataFrame(data_to_parse, columns=columns)
        
        self.blacklist = pd.read_csv('data/blacklist.csv')['name'].values
        self.brands = pd.read_csv('clean_data/brands_en.csv')['brand'].values


    @staticmethod
    def _remove_numbers(name: str) -> pd.Series:
        """Remove all words in product description which contain numbers."""
        
        brand = None
        # Find brands with numbers:
        for key, value in BRANDS_WITH_NUMBERS.items():
            if key in name:
                brand = value
                name = name.replace(key, '')
                break

        name = ' '.join(re.sub(r'\w*\d\w*', '', word) for word in name.split())
        return pd.Series([name, brand])
    
    @staticmethod
    def _remove_punctuation(name: str, brand: Optional[str]) -> pd.Series:
        """Remove all service characters in product description."""

        # Find abbreviations:
        for key, value in BRANDS.items():
            if key in name:
                brand = value
                name = name.replace(key, '')
                break

        product = None
        for key, value in SLASH_PRODUCTS.items():
            if key in name:
                product = value
                name = name.replace(key, ' ')
                break
        
        # Pattern: remove `-` after the sentence and remove almost all service chars
        pattern = r'((?<=\w)-+(?!\w))|([.,+!?%:№*/\(|\)])'
        name = re.sub(pattern, ' ', name).replace('  ', ' ')
        return pd.Series([name, product, brand])
    
    
    def find_en_brands(self, name: str, brand: Optional[str]) -> pd.Series:
        """Find English brands using the dataset `brands_en.csv`."""

        if not brand:
            for brand_en in self.brands:
                if brand_en in name:
                    brand = brand_en
                    name = name.replace(brand_en, '')
                    break

        return pd.Series([name, brand])
    
    @staticmethod
    def _remove_one_and_two_chars(name: str) -> str:
        """Remove words consisting of 1 or 2 characters."""

        return ' '.join(x for x in name.split() if len(x) > 2)

    
    def _remove_words_in_blacklist(self, name: str) -> str:
        """Remove words from blacklist."""

        return ' '.join(word for word in name.split() if word not in self.blacklist)
    
    @staticmethod
    def _replace_with_product_dict(name: str) -> str:
        """Replace words using `dicts.PRODUCTS`."""

        return ' '.join(PRODUCTS.get(word, word) for word in name.split())
        
    @staticmethod
    def _remove_all_english_words(name: str, brand: Optional[str]) -> pd.Series:
        """
        Remove all English words in the product description.
        We make the assumption that these words are a brand.
        """

        eng_brands = re.findall(r'\b([a-z]+)\b', name)
        name = re.sub(r'\b([a-z]+)\b', '', name)

        if eng_brands and not brand:
            return pd.Series([name, eng_brands])
        return pd.Series([name, brand])
    
    def normalize(self) -> pd.DataFrame:
        """
        Normalize the description of the product: expand abbreviations,
        delete garbage words and characters for further recognition,
        remove english worlds, etc.
        """

        self.df['name_norm'] = self.df['name'].str.lower()
        self.df[['name_norm', 'brand_norm']] = self.df['name_norm'].apply(self._remove_numbers)
        self.df[['name_norm', 'product_norm', 'brand_norm']] = self.df.apply(
            lambda x: self._remove_punctuation(
                x['name_norm'], x['brand_norm']
            ), axis=1
        )
        self.df['name_norm'] = self.df['name_norm'].apply(self._remove_one_and_two_chars)
        self.df[['name_norm', 'brand_norm']] = self.df.apply(
            lambda x: self.find_en_brands(
                x['name_norm'], x['brand_norm']
            ), axis=1
        )
        
        self.df['name_norm'] = self.df['name_norm'].apply(self._remove_words_in_blacklist)
        self.df['name_norm'] = self.df['name_norm'].apply(self._replace_with_product_dict)
        
        self.df[['name_norm', 'brand_norm']] = self.df.apply(
            lambda x: self._remove_all_english_words(x['name_norm'], x['brand_norm']), axis=1
        )
        return self.df

## TODO:
1. Дописать доку
2. Подумать, что можно сделать с англ. брендами

In [7]:
%%time
n = NormalizeNames(real['name'])
df = n.normalize()
# Wall time: 2min 18s

CPU times: user 39.2 s, sys: 391 ms, total: 39.6 s
Wall time: 39.7 s


In [8]:
df['name_norm'] == tmp

NameError: name 'tmp' is not defined

In [192]:
tmp

Unnamed: 0,name,name_norm,product_norm,brand_norm
19,МИНТАЙ ФИЛЕ Б/К СВ/МОР П/ПАК 8,минтай филе пак,,
20,БЗМЖ МОЛОКО СТРАНА ВАСИЛЬКИ У/,молоко страна васильки,,
21,СУШКИ ТАРАЛЛИНИ С ЧЕСНОКОМ 180,сушки тараллини,,
22,ЯЙЦО КУРИНОЕ СТОЛОВОЕ 1КАТЕГОР,яйцо куриное,,
23,ПЕЧЕНЬЕ ЮБИЛЕЙНОЕ ВИТАМИНИЗИРО,печенье юбилейное витаминизиро,,
...,...,...,...,...
74012,БЗМЖ Сыр Российский 1кг,сыр российский,,
74013,Колбаса вареная Докторская Ста,колбаса ста,,
74015,БЗМЖ Сырок Б.Ю. Александров Ка,сырок,,б.ю. александров
74017,Карбонад Велком к/в в/у 450г,карбонад велком,,


In [9]:
df.brand_norm.dropna().shape # 13020 -> 13719 -> 13726 -> 13778 -> 11344 (без русских брендов) -> 11366 -> 11356

(11356,)

In [11]:
df.product_norm.dropna().shape # 350 -> 350 -> 240(возможно кетч карта)

(230,)

In [10]:
from typing import Optional, NoReturn, List, Union
from itertools import combinations
from pymystem3 import Mystem


class Finder:
    """
    Searche and recognize the name, category and brand of a product
    from its description.
    Search is carried out in the collected datasets: `brands_ru.csv`,
    `products.csv`, `all_clean.csv`.
    
    Parameters
    ----------
    data_to_parse: Union[pd.DataFrame, str],
        Data in which product information will be recognized.
        If data is a pd.DataFrane, it must contain the following columns:
        ['name_norm', 'product_norm', 'brand_norm'].
        If the data is a string, it should be normalized:
        service characters removed, lowercase letters, etc.
        
    Attributes
    ----------
    mystem : class
        A Python wrapper of the Yandex Mystem 3.1 morphological
        analyzer (http://api.yandex.ru/mystem).
        See aslo `https://github.com/nlpub/pymystem3`.
    rus_brands : np.ndarray
        List of Russian brands.
    products : pd.DataFrame
        DataFrame of product names and categories.
    df : pd.DataFrame
        The copy of `data_to_parse`.

    Examples
    --------
    >>> product = 'Майонез MR.RICCO Провансаль 67% д/п 400'
    >>> finder = Finder(product)
    >>> finder.find_all()
    
    Notes
    -----
    You may be comfortable with the following resource:
    'https://receiptnlp.tinkoff.ru/'.
    """
    
    def __init__(self, data_to_parse: Union[pd.DataFrame, str]):
        self.mystem = Mystem()
        
        # DataFrames:
        self.rus_brands = pd.read_csv('clean_data/brands_ru.csv')['brand'].values
        self.products = pd.read_csv('clean_data/products.csv')
        self.product_db = pd.read_csv('clean_data/all_clean.csv')
        
        columns = ['name_norm', 'product_norm', 'brand_norm']
        if isinstance(data_to_parse, pd.DataFrame):
            self.data = data_to_parse[columns].copy()
        else:
            self.data = pd.DataFrame([[data_to_parse, None, None]], columns=columns)
        self.data['cat_norm'] = None  # Add new column
        
    def find_brands(self, name: str, brand: Optional[str] = None) -> pd.Series:
        """
        Find Russian brands using the dataset `brands_ru.csv`.
        For more accurate recognition, a combination of words in a
        different order is used.

        Parameters
        ----------
        name : str
            Product name.
        brand : str, optional (default=None)
            Product category.

        Returns
        -------
        pd.Series
           pd.Series([name, brand])
        """

        if name and not brand:
            names = set([f'{comb[0]} {comb[1]}' for comb in combinations(name.split(), 2)] + name.split())
            for rus_brand in self.rus_brands:
                if rus_brand in names:
                    name = name.replace(rus_brand, '').replace('  ', ' ').strip()
                    return pd.Series([name, rus_brand])
        return pd.Series([name, brand])

    @staticmethod
    def __remove_duplicate_word(arr: List[str]) -> List[str]:
        """
        Remove duplicates in words when one name is a  continuation
        of another: ['вода', 'вода питьевая'] --> ['вода питьевая'].
        
        Parameters
        ----------
        arr : List[str]
            List description of products in different variants.
        
        Returns
        -------
        arr : List[str]
            List description of products without duplicates.
        """

        if max([len(x.split()) for x in arr]) > 1:
            arr = sorted(arr, key=lambda x: len(x.split()))
            one_words = []
            for product in arr.copy():
                if len(product.split()) == 1:
                    one_words.append(product)
                else:
                    for word in one_words:
                        if word in product:
                            arr.remove(word)
        return arr

    def find_product(self, name: str, product: str, category: Optional[str]=None) -> pd.Series:
        """
        Find products name using the dataset `products.csv`.
        For more accurate recognition, a combination of words in a
        different order is used.

        Parameters
        ----------
        name : str
            Product name.
        product : str
            Product description.
        category : str, optional (default=None)
            Product category.

        Returns
        -------
        pd.Series
           pd.Series([name, product, category]) 
        """

        if name and not product:
            names = pd.DataFrame(set([f'{comb[0]} {comb[1]}' for comb in combinations(name.split(), 2)] + name.split()), columns=['product'])
            merge = self.products.merge(names)
            if len(merge):
                product = ', '.join(self.__remove_duplicate_word(merge['product'].values))
                category = merge['category'].value_counts().index[0]
        return pd.Series([name, product, category])
    
    def _use_mystem(self, name: str, product: str) -> str:
        """
        Use Yandex pymystem3 library to lemmatize words in product descriptions.
        I tried to use pymorphy, but the recognition quality got worse.

        Parameters
        ----------
        name : str
            Product name.
        product : str
            Product description.

        Returns
        -------
        str
            Product description after lemmatization.

        Notes
        -----
        See also `https://github.com/nlpub/pymystem3`.
        """

        if name and not product:
            name = ''.join(self.mystem.lemmatize(name)[:-1])
        return name
    
    def find_category(self, product: str, category: str) -> pd.Series:
        """
        Find a product category using the dataset `products.csv`.

        Parameters
        ----------
        product : str
            Product description.
        category : str
            Product category.

        Returns
        -------
        pd.Series
           pd.Series([product, category]) 
        """

        if product and not category:
            tmp = self.products[self.products['product'] == product]
            if len(tmp):
                category = tmp['category'].values[0]

        return pd.Series([product, category])


    def find_product_by_brand(self, product: str, brand: str, category: str) -> pd.Series:
        """
        If we were able to recognize the product brand,
        but could not recongize the product name,
        we can assign the most common product name for this brand.

        Parameters
        ----------
        product : str
            Product description.
        brand : str
            Product brand.
        category : str
            Product category.

        Returns
        -------
        pd.Series
           pd.Series([product, brand, category]) 
        """

        if brand and not product:
            single_brand_goods = self.product_db[self.product_db['Бренд'] == brand]
            if len(single_brand_goods):
                product = single_brand_goods['Продукт'].value_counts().index[0]
                category = single_brand_goods['Категория'].value_counts().index[0]

        return pd.Series([product, brand, category])
    
    
    def __print_logs(self, message:str, verbose: bool = False) -> None:
        """
        Print the number of recognized brands,
        categories and names of goods.
        """
        
        if verbose:
            _len = len(self.data)
            print(message)
            print(
                "Recognized brands: "
                f"{len(self.data['brand_norm'].dropna())}/{_len}, "
                f"products: {len(self.data['product_norm'].dropna())}/{_len}, "
                f"categories: {len(self.data['cat_norm'].dropna())}/{_len}",
                '-'*80, sep='\n', end='\n\n'
            )
    
    def find_all(self,* , verbose: int = 0) -> None:
        """
        Start search and recognition in `data_to_parse`.

        Parameters
        ----------
        verbose: int (default=0)
            Set verbose to any positive number for verbosity.
        """

        self.__print_logs('Before:', verbose)
        
        # Find brands:
        self.data[['name_norm', 'brand_norm']] = self.data.apply(lambda x: self.find_brands(x['name_norm'], x['brand_norm']), axis=1)
        self.__print_logs('Find brands:', verbose)
        
        # Find product and category:
        self.data[['name_norm', 'product_norm', 'cat_norm']] = self.data.apply(lambda x: self.find_product(x['name_norm'], x['product_norm']), axis=1)
        self.__print_logs('Find product and category:', verbose)
        
        # Remove `-`:
        self.data['name_norm'] = self.data['name_norm'].str.replace('-', ' ')
        self.data[['name_norm', 'product_norm', 'cat_norm']] = self.data.apply(lambda x: self.find_product(x['name_norm'], x['product_norm'], x['cat_norm']), axis=1)
        self.__print_logs('Remove `-` and the second attempt to find a product:', verbose)
        
        # Use Mystem:
        self.data['name_norm'] = self.data.apply(lambda x: self._use_mystem(x['name_norm'], x['product_norm']), axis=1)
        self.data[['name_norm', 'product_norm', 'cat_norm']] = self.data.apply(lambda x: self.find_product(x['name_norm'], x['product_norm'], x['cat_norm']), axis=1)
        self.__print_logs('Use Mystem for lemmatization and the third attempt to find a product:', verbose)
        
        # Find category:
        self.data[['product_norm', 'cat_norm']] = self.data.apply(lambda x: self.find_category(x['product_norm'], x['cat_norm']), axis=1)
        self.__print_logs('Find the remaining categories:', verbose)
        
        # Find product by brand:
        self.data[['product_norm', 'brand_norm', 'cat_norm']] = self.data.apply(lambda x: self.find_product_by_brand(x['product_norm'], x['brand_norm'], x['cat_norm']), axis=1)
        self.__print_logs('Find product by brand:', verbose)
        return self.data

In [11]:
%%time
finder = Finder(tmp)
res = finder.find_all(verbose=1)
# res = pd.concat([df['name'], res], axis=1)
# Before:
# Recognized brands: 11344/32091, products: 350/32091, categories: 0/32091
# ------------------------------------------------------------

# Find brands:
# Recognized brands: 17400/32091, products: 350/32091, categories: 0/32091
# ------------------------------------------------------------

# Find product and category:
# Recognized brands: 17400/32091, products: 29457/32091, categories: 29107/32091
# ------------------------------------------------------------

# Remove `-` and the second attempt to find a product:
# Recognized brands: 17400/32091, products: 29555/32091, categories: 29205/32091
# ------------------------------------------------------------

# Use Mystem for lemmatization and the third attempt to find a product:
# Recognized brands: 17400/32091, products: 29909/32091, categories: 29559/32091
# ------------------------------------------------------------

# Find the remaining categories:
# Recognized brands: 17400/32091, products: 29909/32091, categories: 29897/32091
# ------------------------------------------------------------

# Find product by brand:
# Recognized brands: 17400/32091, products: 30493/32091, categories: 30481/32091
# ------------------------------------------------------------

# Wall time: 3min 19s

Before:
Recognized brands: 11366/32091, products: 240/32091, categories: 0/32091
--------------------------------------------------------------------------------

Find brands:
Recognized brands: 17535/32091, products: 240/32091, categories: 0/32091
--------------------------------------------------------------------------------

Find product and category:
Recognized brands: 17535/32091, products: 29448/32091, categories: 29208/32091
--------------------------------------------------------------------------------

Remove `-` and the second attempt to find a product:
Recognized brands: 17535/32091, products: 29546/32091, categories: 29306/32091
--------------------------------------------------------------------------------

Use Mystem for lemmatization and the third attempt to find a product:
Recognized brands: 17535/32091, products: 29917/32091, categories: 29677/32091
--------------------------------------------------------------------------------

Find the remaining categories:
Recog

ValueError: Lengths must match to compare

### TODO:

1. Вынести все собранные словари в отдельные классы
2. Подумать насчёт того, чтобы совместить мой список со списком брендов
3. Переводить всё в .py файлы
4. Настроить красивые логи
5. Добавить скетч карту в общие данные


In [59]:
'Волжское утро'.lower()

'волжское утро'

In [63]:
res[res['brand_norm'].apply(lambda x: bool(x))].sample(10)

Unnamed: 0,name,name_norm,product_norm,brand_norm,cat_norm
61130,Майонез MR.RICCO Провансаль 67% д/п 400,майонез провансаль,майонез,mr. ricco,"Соусы, орехи, консервы"
22759,*4007247 Сред.FINISH POWERBALL 100таб,средство,средство,finish powerball,Товары для дома и дачи
49424,СЫРКИ ТВОРОБУШКИ 21%,сырки,сырки,творобушки,"Молоко, сыр, яйца"
14933,17:3634637 БЗМЖ Сливки САРАФАНОВО 200мл,сливки,сливки,сарафаново,"Молоко, сыр, яйца"
1007,БЗМЖ МОРОЖЕН.ВОЛОГОДСКИЙ ПЛОМБ,мороженое,мороженое,вологодский пломбир,"Молоко, сыр, яйца"
742,ПУАРЕ TWO GEESE/ДВА ГУСЯ ГРУШ.,пуаре два гусь,,geese,
73625,Напиток Coca-Cola без сахара б,напиток без сахара,напиток,coca-cola,"Воды, соки, напитки"
43216,ГРИНФИЛД САМ БУКЕТ25,гринфиса,,LD,
20038,27*: 3142744 ПРОСТОКВАШИНО Сметана 15%,ино сметана,сметана,простоквашино,"Молоко, сыр, яйца"
29019,*3486073 ЛЮБЯТ.Печ.сдоб.вк.лим/мята250г,лим,печенье,любятово,"Хлеб, сладости, снеки"


In [29]:
q['Категория'].unique()

array(['Красота, гигиена, бытовая химия', 'Хлеб, сладости, снеки',
       'Молоко, сыр, яйца', 'Соусы, орехи, консервы',
       'Макароны, крупы, специи', 'Другое', 'Воды, соки, напитки',
       'Рыба, икра', 'Чай, кофе, сахар', 'Замороженные продукты',
       'Птица, мясо, деликатесы', 'Посуда', 'Овощи, фрукты, ягоды',
       'Товары для мам и детей', 'Товары для дома и дачи',
       'Подборки и готовые блюда', 'Зоотовары', 'Бытовая техника',
       'Алкоголь', 'Дача и гриль'], dtype=object)

In [27]:
q = pd.read_csv('clean_data/products.csv')

In [70]:
res['name'].str.replace('-', '-')

19       МИНТАЙ ФИЛЕ Б/К СВ/МОР П/ПАК 8
20       БЗМЖ МОЛОКО СТРАНА ВАСИЛЬКИ У/
21       СУШКИ ТАРАЛЛИНИ С ЧЕСНОКОМ 180
22       ЯЙЦО КУРИНОЕ СТОЛОВОЕ 1КАТЕГОР
23       ПЕЧЕНЬЕ ЮБИЛЕЙНОЕ ВИТАМИНИЗИРО
                      ...              
74012           БЗМЖ Сыр Российский 1кг
74013    Колбаса вареная Докторская Ста
74015    БЗМЖ Сырок Б.Ю. Александров Ка
74017      Карбонад Велком к/в в/у 450г
74018    БЗМЖ Сыр творожный Hochland сл
Name: name, Length: 32091, dtype: object

In [36]:
q[q['Продукт'] == 'сардельки']

Unnamed: 0,Продукт,Категория
291,сардельки,"Птица, мясо, деликатесы"


In [151]:
q.drop(index=461).to_csv('clean_data/products.csv', index=False)

In [30]:
q.loc[638]['Категория'] = 'Молоко, сыр, яйца'

In [32]:
q.loc[950]

Продукт              халат махровый
Категория    Товары для дома и дачи
Name: 950, dtype: object

In [115]:
q.to_csv('clean_data/products.csv', index=False)

In [284]:
'макароны' in q['Продукт'].values

True

In [60]:
l = ['сгущенка', 'открытка', 'поджарка', 'белок', 'плитка', 'окорочка', 'гвоздика', 'горошек', 'редька', 'рассол', 'пасха', 'кислота лимонная']

In [38]:
q['Категория'].unique()

array(['Красота, гигиена, бытовая химия', 'Хлеб, сладости, снеки',
       'Молоко, сыр, яйца', 'Соусы, орехи, консервы',
       'Макароны, крупы, специи', 'Другое', 'Воды, соки, напитки',
       'Рыба, икра', 'Чай, кофе, сахар', 'Замороженные продукты',
       'Птица, мясо, деликатесы', 'Посуда', 'Овощи, фрукты, ягоды',
       'Товары для мам и детей', 'Товары для дома и дачи',
       'Подборки и готовые блюда', 'Зоотовары', 'Бытовая техника',
       'Алкоголь', 'Дача и гриль'], dtype=object)

In [78]:
tmp = pd.DataFrame.from_dict(white_list, orient='index').reset_index().rename(columns={'index': 'Продукт', 0: 'Категория'})

In [125]:
l = ['яблоко', 'долма', 'разрыхлитель', 'бумага', 'кукуруза']

In [21]:
for i in df.product_norm.dropna().drop_duplicates().values:
    if i not in q['Продукт'].values:
        print(i)

тулетная бумага
шариковая ручка
пучок


In [22]:
tmp = pd.DataFrame(
    [
        ['тулетная бумага', 'Красота, гигиена, бытовая химия'],
        ['шариковая ручка', 'Другое'],
        ['пучок', 'Овощи, фрукты, ягоды'],
    ], columns=['Продукт', 'Категория']
)

In [23]:
pd.concat([q, tmp]).to_csv('clean_data/products.csv', index=False)

In [85]:
q.shape

(1265, 2)

In [88]:
q.drop_duplicates(subset='Продукт').to_csv('clean_data/products.csv', index=False)

In [279]:
from collections import Counter

C = Counter()
def lol(name):
    global C
    for i in name.split():
        C[i]+=1



In [280]:
res[res['product_norm'].apply(lambda x: not bool(x))]['name_norm'].apply(lol);