In [1]:
# Data Loading 
import pandas as pd

# Load the CSV file
df = pd.read_csv('/Users/saraskorupa/Documents/rakuten-product-classification/files/X_train_update.csv', index_col='Unnamed: 0')
# Create a raw copy before any preprocessing
df_raw = df.copy()  
# Display the first few row
display(df.head())
df.info()

Unnamed: 0,designation,description,productid,imageid
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786


<class 'pandas.core.frame.DataFrame'>
Index: 84916 entries, 0 to 84915
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   designation  84916 non-null  object
 1   description  55116 non-null  object
 2   productid    84916 non-null  int64 
 3   imageid      84916 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 3.2+ MB


In [2]:
# Data Preparation & cleaning
import re

def clean_text(text):
    text = text.lower().strip()
    text = re.sub(r'<.*?>', '', text)  
    text = re.sub(r'[^\w\s]', '', text)  
    return text

df['designation'] = df['designation'].apply(clean_text)
df['designation'] = df['designation'].apply(clean_text)

In [3]:
# Stoword removal (Multilanguage)
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english') + stopwords.words('french') + stopwords.words('german') + stopwords.words('dutch') + stopwords.words('italian') + stopwords.words('spanish') + stopwords.words('romanian') + stopwords.words('portuguese') + stopwords.words('indonesian') + stopwords.words('russian') + stopwords.words('danish') + stopwords.words('slovene'))
df['designation'] = df['designation'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words])) 

# Handling Accents & Unicode Normalization 
import unicodedata
def remove_accents(text):
    return ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')

df['designation'] = df['designation'].apply(remove_accents)

In [4]:
# Tokenization 
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt', download_dir='/Users/saraskorupa/nltk_data')
nltk.data.path.append('/Users/saraskorupa/nltk_data')  # Ensure it's added to search paths

df['designation'] = df['designation'].astype(str)  # Ensure all values are strings
df['designation'] = df['designation'].apply(word_tokenize)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/saraskorupa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/saraskorupa/nltk_data'
    - '/opt/anaconda3/nltk_data'
    - '/opt/anaconda3/share/nltk_data'
    - '/opt/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/Users/saraskorupa/nltk_data'
**********************************************************************


In [5]:
# Lemmatization 
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
df['designation'] = df['designation'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

In [6]:
# N-grams Analysis 
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english')
df['designation'] = df['designation'].apply(lambda x: ''.join(x) if isinstance(x, list) else str(x))
X_text = vectorizer.fit_transform(df['designation'])

In [7]:
# Text Vectorization 
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X_text = vectorizer.fit_transform(df['designation'])

In [8]:
# Handling missing values
# Step 3: Fill Missing Descriptions with Extracted Keywords
def fill_missing_description(row):
    if not row['description']:  # If description is missing (empty list)
        relevant_words = [word for word in row['designation'] if word in top_keywords]  # Extract relevant words
        return relevant_words if relevant_words else ["unknown"]  # Fill with keywords or "unknown"
    return row['description']
  
df['description'] = df.apply(fill_missing_description, axis=1)

df.head(20)

Unnamed: 0,designation,description,productid,imageid
0,olivia personalisiertes notizbuch 150 seiten p...,,3804725264,1263597046
1,journal arts 133 28092001 lart marche salon da...,,436067568,1008141237
2,grand stylet ergonomique bleu gamepad nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978
3,peluche donald europe disneyland 2000 marionne...,,50418756,457047496
4,guerre tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786
5,afrique contemporaine 212 hiver 2004 dossier j...,,5862738,393356830
6,christof bildungsprozessen spur,,91920807,907794536
7,conquerant sept cahier couverture polypro 240 ...,CONQUERANT CLASSIQUE Cahier 240 x 320 mm seyès...,344240059,999581347
8,puzzle scoobydoo poster 2x35 pieces,,4239126071,1325918866
9,tente pliante v3s5pro pvc blanc 3 x 4m50 longu...,Tente pliante V3S5 Pro PVC 500 gr/m² - 3 x 4m5...,3793572222,1245644185
