In [35]:
import kagglehub
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from xgboost import XGBClassifier

In [36]:


print("ðŸ”µ Ã‰TAPE 1 : Chargement...")

# 1. TÃ©lÃ©chargement
path = kagglehub.dataset_download("jiashenliu/515k-hotel-reviews-data-in-europe")
csv_file_path = f"{path}/Hotel_Reviews.csv"

# 2. Lecture du CSV
df = pd.read_csv(csv_file_path)
print(f"âœ… DonnÃ©es chargÃ©es : {df.shape[0]} avis.")

ðŸ”µ Ã‰TAPE 1 : Chargement...
Using Colab cache for faster access to the '515k-hotel-reviews-data-in-europe' dataset.
âœ… DonnÃ©es chargÃ©es : 515738 avis.


In [37]:
print("\nðŸ”µ Ã‰TAPE 2 : PrÃ©paration des donnÃ©es...")

# 1. Texte complet et Cible
df['Review'] = df['Positive_Review'] + " " + df['Negative_Review']
df['is_good_hotel'] = df['Reviewer_Score'].apply(lambda x: 1 if x > 8.0 else 0)

# 2. Pays (depuis l'adresse)
df['country'] = df['Hotel_Address'].apply(lambda x: x.split()[-1])

# 3. Tags (Voyageur, Couple, Famille...)
def check_tag(tags, word):
    return 1 if word in tags else 0

tags_to_check = ['Leisure trip', 'Couple', 'Solo traveler', 'Family', 'Business trip']
for tag in tags_to_check:
    col_name = f"is_{tag.split()[0].lower()}" # ex: is_leisure
    df[col_name] = df['Tags'].apply(lambda x: check_tag(x, tag))

# 4. DurÃ©e du sÃ©jour
df['stay_duration'] = df['Tags'].str.extract(r'Stayed (\d+) night').astype(float).fillna(1)

# 5. Ã‰chantillonnage (50% pour la vitesse) & Sentiment Analysis
# On crÃ©e une copie propre pour travailler
print("   Calcul du sentiment (patience...)...")
df_subset = df.sample(frac=0.5, random_state=42).copy()
df_subset['sentiment_score'] = df_subset['Review'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)

print("âœ… DonnÃ©es prÃªtes pour le modÃ¨le Hybride !")


ðŸ”µ Ã‰TAPE 2 : PrÃ©paration des donnÃ©es...
   Calcul du sentiment (patience...)...
âœ… DonnÃ©es prÃªtes pour le modÃ¨le Hybride !


In [38]:
print("\nðŸ”µ Ã‰TAPE 3 : ModÃ¨le Hybride (Texte + Chiffres)...")

# 1. Vectorisation du TEXTE
print("   Vectorisation du texte...")
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
X_text = tfidf.fit_transform(df_subset['Review'].astype(str))

# 2. PrÃ©paration des CHIFFRES
# On gÃ¨re les pays ici pour Ãªtre sÃ»r qu'ils soient alignÃ©s
country_dummies = pd.get_dummies(df_subset['country'], prefix='Country')

numeric_features = [
    'Average_Score', 'Review_Total_Negative_Word_Counts', 'Review_Total_Positive_Word_Counts',
    'Total_Number_of_Reviews', 'is_leisure', 'is_couple', 'is_solo',
    'is_family', 'is_business', 'stay_duration', 'sentiment_score'
]

# On colle tout ensemble (Chiffres de base + Pays)
X_numeric_df = pd.concat([df_subset[numeric_features], country_dummies], axis=1)

# Conversion forcÃ©e en float pour Ã©viter l'erreur "ValueError: object"
X_numeric = X_numeric_df.apply(pd.to_numeric, errors='coerce').fillna(0).astype(float).values

# 3. FUSION (Hybride)
print(f"   Fusion : {X_text.shape} (Texte) + {X_numeric.shape} (NumÃ©rique)")
X_hybrid = hstack([X_text, X_numeric])
y_hybrid = df_subset['is_good_hotel']

# 4. EntraÃ®nement
X_train, X_test, y_train, y_test = train_test_split(X_hybrid, y_hybrid, test_size=0.2, random_state=42)

print("   Lancement de XGBoost...")
model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42
)
model.fit(X_train, y_train)

# 5. RÃ©sultat
acc = accuracy_score(y_test, model.predict(X_test))
print("\n" + "="*40)
print(f"ðŸ”¥ PRÃ‰CISION FINALE : {acc:.2%}")
print("="*40)


ðŸ”µ Ã‰TAPE 3 : ModÃ¨le Hybride (Texte + Chiffres)...
   Vectorisation du texte...
   Fusion : (257869, 1000) (Texte) + (257869, 17) (NumÃ©rique)
   Lancement de XGBoost...

ðŸ”¥ PRÃ‰CISION FINALE : 80.66%


In [40]:
import joblib
from google.colab import files

print("ðŸ’¾ Sauvegarde en cours...")

# 1. On sauvegarde le modÃ¨le XGBoost
joblib.dump(model, 'modele_hotel_xgboost.pkl')

# 2. On sauvegarde le Vectoriseur TF-IDF (Indispensable !)
joblib.dump(tfidf, 'vectorizer_tfidf.pkl')

print("âœ… Fichiers crÃ©Ã©s ! TÃ©lÃ©chargement vers ton PC...")

# 3. TÃ©lÃ©chargement automatique vers ton ordinateur
files.download('modele_hotel_xgboost.pkl')
files.download('vectorizer_tfidf.pkl')

ðŸ’¾ Sauvegarde en cours...
âœ… Fichiers crÃ©Ã©s ! TÃ©lÃ©chargement vers ton PC...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>