In [2]:
from sklearn import neighbors
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Data 
X_train = pd.read_csv('/Users/saraskorupa/Documents/Data Science:ML/MLE : DS - Bootcamp /Rakuten Project/files/X_train_update.csv', index_col='Unnamed: 0')
X_test = pd.read_csv('files/X_test_update.csv', index_col='Unnamed: 0')
y_train = pd.read_csv('files/Y_train_CVw08PX.csv', index_col='Unnamed: 0')

In [4]:
# Feature Training Data
X_train.head(10)

Unnamed: 0,designation,description,productid,imageid
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786
5,Afrique Contemporaine N° 212 Hiver 2004 - Doss...,,5862738,393356830
6,Christof E: Bildungsprozessen Auf Der Spur,,91920807,907794536
7,Conquérant Sept Cahier Couverture Polypro 240 ...,CONQUERANT CLASSIQUE Cahier 240 x 320 mm seyès...,344240059,999581347
8,Puzzle Scooby-Doo Avec Poster 2x35 Pieces,,4239126071,1325918866
9,Tente Pliante V3s5-Pro Pvc Blanc - 3 X 4m50 - ...,Tente pliante V3S5 Pro PVC 500 gr/m² - 3 x 4m5...,3793572222,1245644185


In [5]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84916 entries, 0 to 84915
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   designation  84916 non-null  object
 1   description  55116 non-null  object
 2   productid    84916 non-null  int64 
 3   imageid      84916 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 3.2+ MB


In [11]:
# Check for missing values count in each column
missing_values = X_train.isnull().sum()
print(missing_values)

X_train.head(10)

designation            0
description            0
productid              0
imageid                0
cleaned_description    0
dtype: int64


Unnamed: 0,designation,description,productid,imageid,cleaned_description
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,Description missing,3804725264,1263597046,description missing
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,Description missing,436067568,1008141237,description missing
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,pilot style touch pen de marque speedlink est ...
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,Toy Description,50418756,457047496,toy description
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,luc de ides de grandeur il veut organiser un j...
5,Afrique Contemporaine N° 212 Hiver 2004 - Doss...,Description missing,5862738,393356830,description missing
6,Christof E: Bildungsprozessen Auf Der Spur,Description missing,91920807,907794536,description missing
7,Conquérant Sept Cahier Couverture Polypro 240 ...,CONQUERANT CLASSIQUE Cahier 240 x 320 mm seyès...,344240059,999581347,conquerant classique cahier x mm seys incolore...
8,Puzzle Scooby-Doo Avec Poster 2x35 Pieces,Description missing,4239126071,1325918866,description missing
9,Tente Pliante V3s5-Pro Pvc Blanc - 3 X 4m50 - ...,Tente pliante V3S5 Pro PVC 500 gr/m² - 3 x 4m5...,3793572222,1245644185,tente pliante v pro pvc grm x mque vous soyez ...


In [6]:
# Cleaning text data - description 
import re
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import nltk

# Download NLTK resources if you don't have them installed
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize the Lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Clean function to apply all text cleaning steps
def clean_text(text):
    if pd.isna(text):  # If text is NaN, return empty string
        return ''
    
    # Step 1: Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    
    # Step 2: Remove special characters, digits, and punctuation
    text = re.sub(r'[^A-Za-z\s]', '', text)
    
    # Step 3: Convert to lowercase
    text = text.lower()
    
    # Step 4: Remove extra whitespaces
    text = ' '.join(text.split())
    
    # Step 5: Remove stopwords (optional, depends on your use case)
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    # Step 6: Lemmatization (optional, depends on your use case)
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    
    return text

# Assuming X_train is your DataFrame and 'description' is the column to clean
X_train['cleaned_description'] = X_train['description'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saraskorupa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/saraskorupa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()


In [7]:
# Apply rules based on designation to fill in descriptions
def derive_description(designation):
    if 'Peluche' in designation:
        return 'Toy Description'
    elif 'Gamepad' in designation:
        return 'Gaming accessory'
    else:
        return 'Description missing'

X_train['description'] = X_train['description'].fillna(X_train['designation'].apply(derive_description))

# Count entries with "Description missing"
description_missing_count = X_train['description'].value_counts().get('Description missing', 0)

print(f"Number of 'Description missing' entries: {description_missing_count}")



Number of 'Description missing' entries: 29560


In [12]:
# Checking Language in designation
from langdetect import detect

# Function to detect language
def detect_language(text):
    try:
        return detect(text)
    except:
        return 'unknown'  # In case of any error (e.g., empty or non-standard text)

# Apply the function to your dataframe
X_train['designation_language'] = X_train['designation'].apply(detect_language)
X_train['description_language'] = X_train['description'].apply(detect_language)

# Check the first few rows to see the results
print(X_train[['designation', 'designation_language', 'description', 'description_language']].head())

                                         designation designation_language  \
0  Olivia: Personalisiertes Notizbuch / 150 Seite...                   de   
1  Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...                   fr   
2  Grand Stylet Ergonomique Bleu Gamepad Nintendo...                   en   
3  Peluche Donald - Europe - Disneyland 2000 (Mar...                   de   
4                               La Guerre Des Tuques                   ca   

                                         description description_language  
0                                Description missing                   en  
1                                Description missing                   en  
2  PILOT STYLE Touch Pen de marque Speedlink est ...                   fr  
3                                    Toy Description                   en  
4  Luc a des id&eacute;es de grandeur. Il veut or...                   fr  


In [9]:
y_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 84916 entries, 0 to 84915
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   prdtypecode  84916 non-null  int64
dtypes: int64(1)
memory usage: 1.3 MB
