In [21]:
# Text processing designation and description 
from sklearn import neighbors
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Data 
X_train = pd.read_csv('/Users/saraskorupa/Documents/Data Science:ML/MLE : DS - Bootcamp /Rakuten Project/files/X_train_update.csv', index_col='Unnamed: 0')
X_test = pd.read_csv('files/X_test_update.csv', index_col='Unnamed: 0')
y_train = pd.read_csv('files/Y_train_CVw08PX.csv', index_col='Unnamed: 0')

# Feature Training Data
display(X_train.head(10))

X_train.info()

Unnamed: 0,designation,description,productid,imageid
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786
5,Afrique Contemporaine N° 212 Hiver 2004 - Doss...,,5862738,393356830
6,Christof E: Bildungsprozessen Auf Der Spur,,91920807,907794536
7,Conquérant Sept Cahier Couverture Polypro 240 ...,CONQUERANT CLASSIQUE Cahier 240 x 320 mm seyès...,344240059,999581347
8,Puzzle Scooby-Doo Avec Poster 2x35 Pieces,,4239126071,1325918866
9,Tente Pliante V3s5-Pro Pvc Blanc - 3 X 4m50 - ...,Tente pliante V3S5 Pro PVC 500 gr/m² - 3 x 4m5...,3793572222,1245644185


<class 'pandas.core.frame.DataFrame'>
Index: 84916 entries, 0 to 84915
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   designation  84916 non-null  object
 1   description  55116 non-null  object
 2   productid    84916 non-null  int64 
 3   imageid      84916 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 3.2+ MB


In [22]:
# Data Cleaning 
# Filling missing values in description from designation
X_train['description'] = X_train['description'].fillna(X_train['designation'])

# dropping missing values in description 
# X_train.dropna(subset=['description'])
X_train.info()

# drop duplicates
# Check DUBPLICTAES based on all columns
duplicates = X_train[X_train.duplicated()]
# View duplicates
print(duplicates)
# Now drop duplicates if necessary
X_train = X_train.drop_duplicates()



<class 'pandas.core.frame.DataFrame'>
Index: 84916 entries, 0 to 84915
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   designation  84916 non-null  object
 1   description  84916 non-null  object
 2   productid    84916 non-null  int64 
 3   imageid      84916 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 3.2+ MB
Empty DataFrame
Columns: [designation, description, productid, imageid]
Index: []


KeyboardInterrupt: 

In [14]:
# Text cleaning
import re

# removing special characters, punctuation, and digits
def clean_text(text):
    return re.sub(r'[^a-zA-Z\s]', '', text)

X_train['description'] = X_train['description'].apply(clean_text)
X_train['designation'] = X_train['designation'].apply(clean_text)

# Lowercase
X_train['description'] = X_train['description'].str.lower()
X_train['designation'] = X_train['designation'].str.lower()


In [30]:
# Removing stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

X_train['description'] = X_train['description'].apply(remove_stopwords)
X_train['designation'] = X_train['designation'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saraskorupa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
from nltk.tokenize import PunktSentenceTokenizer
tokenizer = PunktSentenceTokenizer()
tokenizer.tokenize(X_train['description'][0])

['Olivia: Personalisiertes Notizbuch / 150 Seiten / Punktraster / Ca Din A5 / Rosen-Design']

In [29]:
# Tokenization of description
import spacy

# Load the French language model
nlp_fr = spacy.load('fr_core_news_sm')

# Tokenize the first 10 rows of the description variable
X_train['description_tokenized'] = X_train['description'].head(10).apply(lambda x: [token.text for token in nlp_fr(x)])

# Display the result
print(X_train[['description', 'description_tokenized']].head(10))



                                         description  \
0  Olivia: Personalisiertes Notizbuch / 150 Seite...   
1  Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...   
2  PILOT STYLE Touch Pen de marque Speedlink est ...   
3  Peluche Donald - Europe - Disneyland 2000 (Mar...   
4  Luc des id&eacute;es de grandeur. Il veut orga...   
5  Afrique Contemporaine N° 212 Hiver 2004 - Doss...   
6         Christof E: Bildungsprozessen Auf Der Spur   
7  CONQUERANT CLASSIQUE Cahier 240 x 320 mm seyès...   
8          Puzzle Scooby-Doo Avec Poster 2x35 Pieces   
9  Tente pliante V3S5 Pro PVC 500 gr/m² - 3 x 4m5...   

                               description_tokenized  
0  [Olivia, :, Personalisiertes, Notizbuch, /, 15...  
1  [Journal, Des, Arts, (, Le, ), N, °, 133, Du, ...  
2  [PILOT, STYLE, Touch, Pen, de, marque, Speedli...  
3  [Peluche, Donald, -, Europe, -, Disneyland, 20...  
4  [Luc, des, id&eacute;es, de, grandeur, ., Il, ...  
5  [Afrique, Contemporaine, N, °, 212, Hiver, 200... 

In [33]:
# Lemmatization
import spacy

# Load the language model
nlp = spacy.load("en_core_web_sm")

# Function to lemmatize text
def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

# Apply lemmatization to the 'description' and 'designation' columns
X_train['description'] = X_train['description'].apply(lemmatize_text)
X_train['designation'] = X_train['designation'].apply(lemmatize_text)

KeyboardInterrupt: 

In [20]:
import pandas as pd

# Function to process text in batches
def process_in_batches(df, batch_size=1000):
    for start in range(0, len(df), batch_size):
        end = min(start + batch_size, len(df))
        batch = df.iloc[start:end]
        # Apply lemmatization here to the batch
        batch['designation'] = batch['designation'].apply(lemmatize_text)
        batch['description'] = batch['description'].apply(lemmatize_text)
        # You can save or process the batch results here
        yield batch

# Example usage:
for batch in process_in_batches(X_train):
    # Process each batch (e.g., saving or analyzing results)
    pass



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['designation'] = batch['designation'].apply(lemmatize_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['description'] = batch['description'].apply(lemmatize_text)


In [None]:
# language detection - language translate 
from googletrans import Translator

translator = Translator()

def translate_text(text, target_language='en'):
    try:
        translated = translator.translate(text, dest=target_language)
        return translated.text
    except Exception as e:
        return text  # Return the original text if there's an error

# Apply translation to 'description' and 'designation' columns
X_train['description'] = X_train['description'].apply(lambda x: translate_text(x, 'en') if pd.notnull(x) else x)
X_train['designation'] = X_train['designation'].apply(lambda x: translate_text(x, 'en') if pd.notnull(x) else x)
