Load Environment Variables

In [12]:
import os
from dotenv import load_dotenv

# Load API keys from the .env file into the process environment, then read them.
load_dotenv()
OPENWEATHER_API_KEY = os.getenv("OPENWEATHER_API_KEY")
GOOGLE_MAPS_API_KEY = os.getenv("GOOGLE_MAPS_API_KEY")

# Warn (do not raise) when a key is missing: the notebook can still be run with
# use_real_apis=False, so a missing key must not abort the whole session.
# The warning names the missing key(s) so the user knows exactly what to add.
_missing_api_keys = [
    key_name
    for key_name, key_value in (
        ("OPENWEATHER_API_KEY", OPENWEATHER_API_KEY),
        ("GOOGLE_MAPS_API_KEY", GOOGLE_MAPS_API_KEY),
    )
    if not key_value
]
if _missing_api_keys:
    print(
        f"Warning: missing API key(s): {', '.join(_missing_api_keys)}. "
        "Create a .env file with OPENWEATHER_API_KEY and GOOGLE_MAPS_API_KEY."
    )

Import Libs

In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import datetime
import random
import requests
import logging

# Configure the root logger (INFO and above, timestamped lines), then grab the
# named logger that the recommender functions in this notebook write to.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger('travel_recommender')

Load and Preprocess Data

In [14]:
def load_data(file_path='holidify.csv'):
    """
    Load and preprocess the tourism dataset

    City names are stripped of surrounding whitespace *before* duplicates are
    removed, so that "Goa" and "Goa " are treated as the same destination.

    NOTE: a later cell of this notebook defines another function that is also
    named `load_data`; whichever cell ran last is the one that gets called.

    Parameters:
    -----------
    file_path : str
        Path to the CSV file containing tourism data. The file must provide
        the 'City' and 'Best Time to visit' columns.

    Returns:
    --------
    pd.DataFrame
        Preprocessed tourism dataframe with the extra columns 'Year_round'
        (0/1), 'Weather Quality' and 'Traffic Level' (both NaN placeholders).

    Raises:
    -------
    Exception
        Any error raised while reading or preprocessing is logged and re-raised.
    """
    try:
        logger.info(f"Loading data from {file_path}")
        df = pd.read_csv(file_path)

        # Clean city names first (remove extra spaces); doing this after the
        # de-duplication would let whitespace variants of one city slip through
        df['City'] = df['City'].str.strip()

        # Remove duplicates based on the cleaned City column
        initial_count = len(df)
        df = df.drop_duplicates(subset=['City'])
        logger.info(f"Removed {initial_count - len(df)} duplicate cities")

        # Fill NaN values in 'Best Time to visit'
        df['Best Time to visit'] = df['Best Time to visit'].fillna('Throughout the year')

        # Create a column indicating if a destination is good to visit year-round
        df['Year_round'] = df['Best Time to visit'].str.contains('Throughout the year').astype(int)

        # Add columns for weather and traffic (will be filled later)
        df['Weather Quality'] = np.nan
        df['Traffic Level'] = np.nan

        logger.info(f"Successfully loaded and preprocessed {len(df)} destinations")
        return df
    except Exception as e:
        logger.error(f"Error loading data: {str(e)}")
        raise

Configuration and Parameters

In [15]:
def configure_recommendation_system(current_location, travel_date, avoid_crowds=True, 
                                   weather_preference="clear", budget_level="medium", 
                                   use_real_apis=True):
    """
    Configure the recommendation system with user preferences

    Parameters:
    -----------
    current_location : str
        User's current city
    travel_date : str or datetime
        Planned travel date (YYYY-MM-DD if string)
    avoid_crowds : bool
        Whether to prioritize destinations with less traffic
    weather_preference : str
        Preferred weather condition ("clear", "moderate", "cool", "warm",
        "rainy", "snowy"). Matching ignores case and surrounding spaces;
        unknown or missing values fall back to the "moderate" score (7).
    budget_level : str
        Budget preference ("low", "medium", "high"). Unknown or missing
        values fall back to "medium" (2).
    use_real_apis : bool
        Whether to use real API calls or mock data

    Returns:
    --------
    dict
        Configuration dictionary with all parameters

    Raises:
    -------
    ValueError
        If `travel_date` is a string that is not in YYYY-MM-DD format.
    """
    # Import under a private local alias: a later notebook cell executes
    # `from datetime import datetime`, which rebinds the global name `datetime`
    # to the class and would make `datetime.datetime.strptime` fail here.
    import datetime as _dt

    # Convert string date to datetime
    if isinstance(travel_date, str):
        travel_date = _dt.datetime.strptime(travel_date, "%Y-%m-%d")

    # Map text preferences to numerical values for the model
    weather_preference_map = {
        "clear": 9,
        "moderate": 7,
        "cool": 5,
        "warm": 8,
        "rainy": 3,
        "snowy": 4
    }

    budget_level_map = {
        "low": 1,
        "medium": 2,
        "high": 3
    }

    # Normalise the lookup keys; `or ""` keeps a None preference from crashing
    weather_key = str(weather_preference or "").strip().lower()
    budget_key = str(budget_level or "").strip().lower()

    # Create configuration dictionary
    config = {
        "current_location": current_location,
        "travel_date": travel_date,
        "avoid_crowds": avoid_crowds,
        "weather_preference_text": weather_preference,
        "weather_preference_score": weather_preference_map.get(weather_key, 7),
        "budget_level_text": budget_level,
        "budget_level_score": budget_level_map.get(budget_key, 2),
        "use_real_apis": use_real_apis,
        "api_keys": {
            "weather": OPENWEATHER_API_KEY,
            "maps": GOOGLE_MAPS_API_KEY
        }
    }

    logger.info(f"Configured system for travel from {current_location} on {travel_date.strftime('%Y-%m-%d')}")
    return config

Weather Data Retrieval

In [16]:
import requests
import logging
import random

# Logger setup
logger = logging.getLogger(__name__)


def _score_weather(condition, temp):
    """Rate weather on a 1-10 scale from a text description and a temperature in °C."""
    description = condition.lower()

    # Base score from the textual condition (higher is better)
    if "clear" in description or "sunny" in description:
        quality = 9
    elif "cloud" in description:
        quality = 7
    elif "rain" in description:
        quality = 4
    elif "snow" in description:
        quality = 3
    else:
        quality = 6

    # Adjust for temperature: bonus for 20-30, penalty for extremes
    if 20 <= temp <= 30:
        quality += 1
    elif temp < 5 or temp > 40:
        quality -= 2

    # Ensure score is between 1-10
    return max(1, min(10, quality))


def get_weather_data(cities, api_key, use_real_api=True):
    """
    Get current weather data for a list of cities

    The caller-supplied `api_key` is used as-is; no key is embedded in the
    source. Cities whose lookup fails still get an entry with a neutral
    quality of 5 so downstream scoring always finds a value.

    Parameters:
    -----------
    cities : list
        List of city names
    api_key : str
        OpenWeatherMap API key
    use_real_api : bool
        Whether to use real API or dummy data for testing

    Returns:
    --------
    dict
        Dictionary with weather data for each city; every value has the keys
        "condition" (str), "temp" (number or None) and "quality" (int).
    """
    weather_data = {}

    if not use_real_api:
        # Generate mock data for testing
        logger.info("Using mock weather data")
        weather_conditions = ["clear sky", "few clouds", "scattered clouds", "moderate rain", "light rain", "sunny"]
        for city in cities:
            temp = round(random.uniform(15, 35), 2)  # Random temp between 15-35°C
            condition = random.choice(weather_conditions)
            quality = 10 if 20 <= temp <= 30 and "rain" not in condition else (7 if 15 <= temp <= 35 else 4)

            weather_data[city] = {
                "condition": condition,
                "temp": temp,
                "quality": quality
            }
        return weather_data

    if not api_key:
        logger.warning("No OpenWeatherMap API key supplied; weather requests are expected to be rejected")

    # Use real API
    logger.info(f"Fetching weather data for {len(cities)} cities")
    endpoint = "https://api.openweathermap.org/data/2.5/weather"

    for city in cities:
        try:
            # Pass the query via `params` so city names containing spaces or
            # special characters are URL-encoded; the timeout keeps one slow
            # request from hanging the whole notebook.
            response = requests.get(
                endpoint,
                params={"q": city, "appid": api_key, "units": "metric"},
                timeout=10,
            )
            data = response.json()

            if response.status_code == 200 and "main" in data and "weather" in data:
                temp = data["main"]["temp"]
                condition = data["weather"][0]["description"]

                weather_data[city] = {
                    "condition": condition,
                    "temp": temp,
                    "quality": _score_weather(condition, temp)
                }
            else:
                logger.warning(f"Failed to get weather for {city}: {data.get('message', 'Unknown error')}")
                weather_data[city] = {"condition": "unknown", "temp": None, "quality": 5}

        except Exception as e:
            # Best effort: network/parse problems must not abort the remaining cities
            logger.error(f"Error fetching weather for {city}: {str(e)}")
            weather_data[city] = {"condition": "error", "temp": None, "quality": 5}

    return weather_data


Traffic Data Retrieval

In [17]:
import requests
import logging
import random

# Logger setup
logger = logging.getLogger(__name__)

def get_traffic_data(cities, current_location, api_key, use_real_api=True):
    """
    Get traffic data for a list of cities and their popular places

    The real-API path sends one Distance Matrix request from
    `current_location` to all `cities`, using the caller-supplied `api_key`
    (no key is embedded in the source). The travel time in minutes is turned
    into a 1-10 traffic level by integer-dividing by 10 and clamping.

    Parameters:
    -----------
    cities : list
        List of city names
    current_location : str
        User's current location
    api_key : str
        API key for the Distance Matrix service
    use_real_api : bool
        Whether to use real API or dummy data for testing

    Returns:
    --------
    dict
        Dictionary with traffic data for each city; every value has the keys
        "traffic_level" (int) and "places" (dict of place name -> minutes).
        Empty when the request as a whole fails.
    """
    traffic_data = {}

    if not use_real_api:
        # Generate mock data for testing
        logger.info("Using mock traffic data")
        for city in cities:
            traffic_level = random.randint(2, 9)  # Mock overall traffic level
            places = {f"{city} Place {i}": random.randint(15, 60) for i in range(1, 6)}  # Mock travel time

            traffic_data[city] = {
                "traffic_level": traffic_level,
                "places": places
            }
        return traffic_data

    cities = list(cities)
    if not cities:
        # Nothing to look up; avoid sending a request with empty destinations
        return traffic_data

    if not api_key:
        logger.warning("No Distance Matrix API key supplied; the request is expected to be rejected")

    # Use real Distance Matrix API
    logger.info(f"Fetching traffic data for {len(cities)} cities from {current_location}")
    try:
        # Build the query through `params` so each value is URL-encoded and
        # appears exactly once; destinations are pipe-separated city names.
        response = requests.get(
            "https://api.distancematrix.ai/maps/api/distancematrix/json",
            params={
                "origins": current_location,
                "destinations": "|".join(cities),
                "key": api_key,
                "departure_time": "now",
                "traffic_model": "best_guess",
            },
            timeout=15,
        )
        data = response.json()

        if data.get("status") != "OK":
            logger.error(f"Error in API response: {data}")
            return traffic_data

        elements = data["rows"][0]["elements"]
        for position, city in enumerate(cities):
            try:
                element = elements[position]
                if element["status"] == "OK":
                    # Prefer the traffic-aware duration; fall back to the plain
                    # duration when the service omits the traffic estimate.
                    duration = element.get("duration_in_traffic") or element["duration"]
                    travel_time = duration["value"] // 60  # Convert to minutes
                    traffic_level = min(max(travel_time // 10, 1), 10)  # Normalize traffic level (1-10 scale)
                else:
                    travel_time = random.randint(20, 60)  # Fallback value
                    traffic_level = random.randint(3, 7)

                # Placeholder attraction names: every one gets the city-level travel time
                places = {f"{city} Attraction {number}": travel_time for number in range(1, 6)}

                traffic_data[city] = {
                    "traffic_level": traffic_level,
                    "places": places
                }

            except Exception as e:
                logger.error(f"Error processing traffic data for {city}: {str(e)}")
                traffic_data[city] = {"traffic_level": 5, "places": {}}

    except Exception as e:
        # Best effort: a failed request yields whatever was collected so far
        logger.error(f"Failed to fetch traffic data: {str(e)}")

    return traffic_data


Season check Functionality

In [18]:
def is_in_season(city_info, current_date):
    """
    Determine if a destination is currently in its recommended visit season

    Month names are matched as whole words, case-insensitively. Ranges such
    as "October to March", "June-September" or "mid-October to early March"
    are expanded (wrapping across the year end when needed); any other month
    mentioned on its own counts as in season too. Several ranges in one text
    are all honoured.

    Parameters:
    -----------
    city_info : dict or Series
        Information about the city including 'Best Time to visit'
    current_date : datetime
        The date to check against

    Returns:
    --------
    bool
        True if the city is in season, False otherwise (including when the
        best-time text is missing or names no month at all)
    """
    # Local import: the notebook's import cell does not bring `re` into scope
    import re

    month_names = ["January", "February", "March", "April", "May", "June", 
                   "July", "August", "September", "October", "November", "December"]

    best_time = city_info['Best Time to visit']
    # Missing values (None / NaN) carry no season information
    if not isinstance(best_time, str) or not best_time.strip():
        return False

    text = best_time.lower()

    # If it's good year-round
    if 'throughout the year' in text:
        return True

    month_index = {name.lower(): position for position, name in enumerate(month_names)}
    month_alt = "|".join(month_index)

    # "<month> to <month>" with an explicit range word or dash between them.
    # `to\b` requires the separate word "to", so the "to" inside "October"
    # can no longer be mistaken for a range separator. Up to two filler words
    # ("early", "mid-") may precede the closing month.
    range_pattern = re.compile(
        rf"\b({month_alt})\b\s*(?:to\b|through\b|till\b|until\b|[-–—])\s*"
        rf"(?:[a-z]+[\s-]+){{0,2}}?({month_alt})\b"
    )
    single_pattern = re.compile(rf"\b({month_alt})\b")

    in_season = set()
    for found in range_pattern.finditer(text):
        start_idx = month_index[found.group(1)]
        end_idx = month_index[found.group(2)]
        # Modulo arithmetic handles ranges that span across year-end
        span = (end_idx - start_idx) % 12
        in_season.update((start_idx + offset) % 12 for offset in range(span + 1))

    # Months listed individually ("October and November") are in season as well
    in_season.update(month_index[name] for name in single_pattern.findall(text))

    # Compare by month number rather than strftime('%B'), which is locale-dependent
    return (current_date.month - 1) in in_season

Content-based feature Generation

In [19]:
def generate_content_features(df):
    """
    Build a destination-to-destination similarity matrix from text fields

    The long city description and the best-time text are joined into a
    'content_features' column (written onto `df` itself), vectorised with
    TF-IDF and compared pairwise with cosine similarity.

    Parameters:
    -----------
    df : pd.DataFrame
        Dataframe containing destination information; must provide the
        'About the city (long Description)' and 'Best Time to visit' columns

    Returns:
    --------
    numpy.ndarray
        Square similarity matrix with one row/column per destination; an
        identity matrix when the vectorisation fails
    """
    logger.info("Generating content-based features")

    # Text that describes each destination: long description + season hint
    description_text = df['About the city (long Description)'].fillna('')
    season_text = df['Best Time to visit'].fillna('')
    df['content_features'] = description_text + ' ' + season_text

    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

    # Rows that are blank after trimming get a placeholder sentence
    blank_rows = df['content_features'].str.strip().str.len().eq(0)
    if blank_rows.any():
        logger.warning("Some destinations have empty descriptions")
        df.loc[blank_rows, 'content_features'] = "No description available"

    try:
        term_weights = vectorizer.fit_transform(df['content_features'])

        # Pairwise cosine similarity between all destinations
        pairwise_similarity = cosine_similarity(term_weights, term_weights)
        logger.info(f"Generated similarity matrix of shape {pairwise_similarity.shape}")
        return pairwise_similarity
    except Exception as e:
        logger.error(f"Error generating content features: {str(e)}")
        # Fallback: every destination is similar only to itself
        return np.eye(len(df))

Itinerary Generation

In [20]:
def generate_itinerary(destination, places_data, travel_date, weather_data):
    """
    Generate a travel itinerary for a destination

    Places are ordered by travel time and assigned to a part of the day:
    under 25 minutes -> Morning when the weather is outdoor-friendly
    (otherwise Afternoon), under 40 minutes -> Afternoon, anything longer ->
    Evening. The weather condition is compared case-insensitively, so both
    "clear sky" and "Clear" count as outdoor-friendly.

    Parameters:
    -----------
    destination : str
        Name of the destination
    places_data : dict
        Dictionary mapping place name -> travel time in minutes
    travel_date : datetime
        Date of travel (accepted for interface compatibility; not used)
    weather_data : dict
        Weather information for the destination; only "condition" is read.
        May be None or lack the key, which is treated as unknown weather.

    Returns:
    --------
    list
        List of dicts with the keys "Place", "Travel Time",
        "Best Time to Visit" and "Type", ordered Morning, Afternoon, Evening
    """
    logger.info(f"Generating itinerary for {destination}")

    # Weather condition affects outdoor vs indoor activities
    weather_condition = str((weather_data or {}).get("condition") or "unknown").lower()
    outdoor_friendly = any(
        keyword in weather_condition for keyword in ("clear", "sunny", "few clouds")
    )

    # Distribute places throughout the day based on traffic - less traffic first
    slots = {"Morning": [], "Afternoon": [], "Evening": []}
    ordered_places = sorted((places_data or {}).items(), key=lambda item: item[1])
    for place, travel_time in ordered_places:
        if travel_time < 25:  # Low traffic places
            slot = "Morning" if outdoor_friendly else "Afternoon"
        elif travel_time < 40:  # Medium traffic places
            slot = "Afternoon"
        else:  # High traffic places
            slot = "Evening"
        slots[slot].append((place, travel_time))

    slot_types = {
        "Morning": "outdoor" if outdoor_friendly else "indoor",
        "Afternoon": "mixed",
        "Evening": "indoor or dining",
    }

    # Build the itinerary in chronological order
    return [
        {
            "Place": place,
            "Travel Time": f"{travel_time} mins",
            "Best Time to Visit": time_of_day,
            "Type": slot_types[time_of_day],
        }
        for time_of_day in ("Morning", "Afternoon", "Evening")
        for place, travel_time in slots[time_of_day]
    ]

Destination Recommendation

In [21]:
def _merge_live_metric(frame, column, lookup, key):
    """Write lookup[city][key] into frame[column] per city and fill the gaps in place."""
    lookup = lookup or {}
    # One dictionary lookup per row instead of one full-column scan per city
    fetched = frame['City'].map(
        lambda city: lookup[city][key] if city in lookup else np.nan
    )
    if column in frame.columns:
        # Keep values already present for cities without fresh data
        fetched = fetched.fillna(frame[column])
    fetched = pd.to_numeric(fetched, errors='coerce')

    # Fill missing values with the average; with no data at all the average
    # is NaN, so fall back to the neutral midpoint of the 1-10 scale
    fallback = fetched.mean()
    if pd.isna(fallback):
        fallback = 5
    frame[column] = fetched.fillna(fallback)


def recommend_destinations(df, config, weather_data, traffic_data, similarity_matrix=None, num_recommendations=5):
    """
    Recommend destinations based on user preferences and current conditions

    Final score = 0.4 * weather score + 0.4 * traffic score
                + 0.1 * season score + 0.1 * rating score, where
    weather score = 10 - |weather quality - preferred score|,
    traffic score = 10 - traffic level (or a flat 5 when crowds do not matter),
    season score  = 2 when the travel date is in season, else 0,
    rating score  = rating scaled to 0-10 by the best rating (5 if no ratings).

    Parameters:
    -----------
    df : pd.DataFrame
        Dataframe with destination information; needs 'City' and
        'Best Time to visit'. It is copied, never modified.
    config : dict
        User preferences and configuration; the keys
        "weather_preference_score", "avoid_crowds" and "travel_date" are read
    weather_data : dict
        Weather information for each city (city -> {"quality": ...})
    traffic_data : dict
        Traffic information for each city (city -> {"traffic_level": ...})
    similarity_matrix : numpy.ndarray, optional
        Content-based similarity matrix (accepted but currently not used)
    num_recommendations : int
        Number of recommendations to return

    Returns:
    --------
    pd.DataFrame
        Dataframe with recommended destinations, best "Final Score" first
    """
    logger.info(f"Generating {num_recommendations} destination recommendations")
    # Create a copy to avoid modifying the original dataframe
    recommendations = df.copy()

    # Extract user preferences
    weather_pref = config["weather_preference_score"]
    avoid_crowds = config["avoid_crowds"]
    travel_date = config["travel_date"]

    # Update dataframe with weather and traffic data (missing -> average)
    _merge_live_metric(recommendations, 'Weather Quality', weather_data, 'quality')
    _merge_live_metric(recommendations, 'Traffic Level', traffic_data, 'traffic_level')

    # Check if destinations are in season
    recommendations['In Season'] = recommendations.apply(
        lambda row: is_in_season(row, travel_date), axis=1
    ).astype(int)

    # Calculate scores
    recommendations["Weather Score"] = 10 - abs(recommendations["Weather Quality"] - weather_pref)

    if avoid_crowds:
        # Lower traffic is better (inverse scale)
        recommendations["Traffic Score"] = 10 - recommendations["Traffic Level"]
    else:
        # Traffic doesn't matter as much
        recommendations["Traffic Score"] = 5

    # Extra points for being in season
    recommendations["Season Score"] = recommendations["In Season"] * 2

    # For destinations with ratings (only when a 'Rating' column exists)
    if 'Rating' in recommendations.columns:
        # Normalize ratings to 0-10 scale
        max_rating = recommendations['Rating'].max()
        if max_rating > 0:
            recommendations["Rating Score"] = (recommendations["Rating"] / max_rating) * 10
        else:
            recommendations["Rating Score"] = 5
    else:
        recommendations["Rating Score"] = 5

    # Calculate final score with weighted components
    # Prioritize weather and traffic according to user preferences
    recommendations["Final Score"] = (
        (0.4 * recommendations["Weather Score"]) + 
        (0.4 * recommendations["Traffic Score"]) + 
        (0.1 * recommendations["Season Score"]) + 
        (0.1 * recommendations["Rating Score"])
    )

    # Sort by final score
    recommendations = recommendations.sort_values(by="Final Score", ascending=False)

    # Return top recommendations
    return recommendations.head(num_recommendations)

Model Training

In [22]:
import pandas as pd
import pickle
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.preprocessing import StandardScaler
import random
import joblib
import logging
from datetime import datetime
import time

# Set up logging: INFO and above, written both to model_training.log and to the stream handler.
# NOTE(review): logging.basicConfig() does nothing when the root logger already
# has handlers, and an earlier cell of this notebook also calls basicConfig -
# confirm whether the file handler is actually attached in a top-to-bottom run.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler("model_training.log"), logging.StreamHandler()]
)
logger = logging.getLogger(__name__)

# API key: read from the environment only. A real key must never be embedded
# in the notebook as a fallback value. OPENWEATHER_API_KEY is accepted as well
# because that is the variable name the first cell of this notebook reads.
# The value is None when neither variable is set.
API_KEY_WEATHER = os.environ.get('WEATHER_API_KEY') or os.environ.get('OPENWEATHER_API_KEY')

# Load Dataset with better error handling
def load_data(file_path="holidify.csv"):
    """
    Load the tourism dataset and prepare it for model training

    City names are stripped of surrounding whitespace *before* duplicates are
    removed, so that "Goa" and "Goa " are treated as the same destination.

    NOTE: an earlier cell of this notebook defines another function that is
    also named `load_data`; this definition replaces it once the cell runs.

    Parameters:
    -----------
    file_path : str
        Path to the CSV file; it must contain the 'City' and
        'Best Time to visit' columns

    Returns:
    --------
    pd.DataFrame or None
        Cleaned dataframe with an added 'Year_round' (0/1) column, or None
        when the file is missing, a required column is absent, or any other
        error occurs (the reason is logged)
    """
    try:
        if not os.path.exists(file_path):
            logger.error(f"File not found: {file_path}")
            return None

        df = pd.read_csv(file_path)

        # Check if required columns exist
        required_columns = ['City', 'Best Time to visit']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            logger.error(f"Missing required columns: {missing_columns}")
            return None

        # Clean city names first; stripping after the de-duplication would
        # let whitespace variants of the same city survive as separate rows
        df['City'] = df['City'].str.strip()

        # Drop duplicates on the cleaned names
        original_count = len(df)
        df = df.drop_duplicates(subset=['City'])
        logger.info(f"Removed {original_count - len(df)} duplicate records")

        # Fill missing values in categorical columns
        missing_best_time = df['Best Time to visit'].isna().sum()
        if missing_best_time > 0:
            logger.info(f"Filling {missing_best_time} missing values in 'Best Time to visit'")
            df['Best Time to visit'] = df['Best Time to visit'].fillna('Throughout the year')

        # Add columns for feature engineering
        df['Year_round'] = df['Best Time to visit'].str.contains('Throughout the year').astype(int)

        logger.info(f"✅ Dataset Loaded with {len(df)} records.")
        return df
    except Exception as e:
        # Deliberately best effort: callers are expected to check for None
        logger.error(f"❌ Error loading data: {str(e)}")
        return None

# Cache for weather and traffic data to avoid regenerating during testing
_weather_cache = {}
_traffic_cache = {}

# Enhanced Weather Data Generation with more realistic distributions and caching
def get_enhanced_weather_data(cities, use_cache=True):
    """
    Generate reproducible synthetic weather data for a list of cities

    A private random generator seeded with 42 is used, so the output is the
    same on every call for the same `cities` list and the process-wide
    `random` state (including any seed the caller has set) is left untouched.

    Parameters:
    -----------
    cities : iterable of str
        City names to generate weather for
    use_cache : bool
        When True and every requested city is already cached, the cached
        entries are returned instead of generating new ones

    Returns:
    --------
    dict
        city -> {"quality": int, "condition": str, "temp": int}
    """
    # Return cached data if available and requested
    if use_cache and _weather_cache and set(cities).issubset(_weather_cache):
        logger.info("Using cached weather data")
        return {city: _weather_cache[city] for city in cities}

    logger.info("Generating new weather data")
    weather_data = {}
    # Each condition has its own quality range, temperature range and probability
    conditions = {
        "Clear": {"quality_range": (7, 10), "prob": 0.4, "temp_range": (18, 28)},
        "Cloudy": {"quality_range": (5, 8), "prob": 0.3, "temp_range": (15, 25)},
        "Rainy": {"quality_range": (3, 6), "prob": 0.2, "temp_range": (12, 22)},
        "Sunny": {"quality_range": (8, 10), "prob": 0.1, "temp_range": (22, 35)}
    }
    condition_names = list(conditions)
    condition_weights = [spec["prob"] for spec in conditions.values()]

    # Fixed seed for reproducibility, held in a local generator so the global
    # random state is neither seeded nor re-seeded as a side effect
    rng = random.Random(42)

    for city in cities:
        # Use weighted random choice for condition
        condition = rng.choices(condition_names, weights=condition_weights, k=1)[0]
        spec = conditions[condition]

        # Draw order (quality, then temperature) determines the generated values
        quality = rng.randint(*spec["quality_range"])
        temp = rng.randint(*spec["temp_range"])

        weather_data[city] = {
            "quality": quality,
            "condition": condition,
            "temp": temp
        }

    # Update cache
    _weather_cache.update(weather_data)

    return weather_data

# Enhanced Traffic Data with per-city-type distributions and caching
def get_enhanced_traffic_data(cities, use_cache=True):
    """
    Generate reproducible synthetic traffic data for a list of cities

    Every city is randomly assigned a city type, which decides its traffic
    level range and its number of attractions. A private random generator
    seeded with 42 is used, so the output is the same on every call for the
    same `cities` list and the process-wide `random` state (including any
    seed the caller has set) is left untouched.

    Parameters:
    -----------
    cities : iterable of str
        City names to generate traffic for
    use_cache : bool
        When True and every requested city is already cached, the cached
        entries are returned instead of generating new ones

    Returns:
    --------
    dict
        city -> {"traffic_level": int, "city_type": str,
                 "places": {place name: minutes}, "num_attractions": int}
    """
    # Return cached data if available and requested
    if use_cache and _traffic_cache and set(cities).issubset(_traffic_cache):
        logger.info("Using cached traffic data")
        return {city: _traffic_cache[city] for city in cities}

    logger.info("Generating new traffic data")
    traffic_data = {}

    # Define different city types with different traffic distributions
    city_types = {
        "metropolitan": {"traffic_range": (6, 10), "prob": 0.3, "attractions": (8, 15)},
        "urban": {"traffic_range": (4, 8), "prob": 0.4, "attractions": (6, 10)},
        "suburban": {"traffic_range": (2, 6), "prob": 0.2, "attractions": (4, 8)},
        "rural": {"traffic_range": (1, 4), "prob": 0.1, "attractions": (2, 5)}
    }
    type_names = list(city_types)
    type_weights = [spec["prob"] for spec in city_types.values()]

    # Fixed seed for reproducibility, held in a local generator so the global
    # random state is neither seeded nor re-seeded as a side effect
    rng = random.Random(42)

    for city in cities:
        # Assign city type based on probability
        city_type = rng.choices(type_names, weights=type_weights, k=1)[0]
        spec = city_types[city_type]

        base_traffic = rng.randint(*spec["traffic_range"])

        # Number of attractions based on city type
        num_attractions = rng.randint(*spec["attractions"])

        # More congested cities have longer travel times with higher variance
        min_time = 10 + (base_traffic * 2)
        max_time = 20 + (base_traffic * 5)
        places = {
            f"{city} Place {number}": rng.randint(min_time, max_time)
            for number in range(1, num_attractions + 1)
        }

        traffic_data[city] = {
            "traffic_level": base_traffic,
            "city_type": city_type,
            "places": places,
            "num_attractions": num_attractions
        }

    # Update cache
    _traffic_cache.update(traffic_data)

    return traffic_data

# Feature Engineering Function with more advanced features
def engineer_features(df, weather_data, traffic_data):
    """
    Create additional features and the training targets for the destination model

    Works on a copy of `df`. Weather and traffic values are merged in per
    city, gaps are filled with medians, the two categorical columns are
    one-hot encoded, and a weighted "Destination Score" plus the binary
    "Good Destination" target (score >= 6.5) are added.

    Parameters:
    -----------
    df : pd.DataFrame
        Destination dataframe; the 'City' and 'Year_round' columns are read
    weather_data : dict
        city -> dict with "quality" and optionally "temp" / "condition"
    traffic_data : dict
        city -> dict with "traffic_level" and optionally "city_type" /
        "num_attractions"

    Returns:
    --------
    pd.DataFrame
        The processed copy with the engineered feature and target columns

    Notes:
    ------
    'Weather Quality' and 'Traffic Level' are indexed directly when the
    interaction feature and the score are computed, so they must exist after
    the merge step (i.e. at least one city must match each lookup, or the
    columns must already be present in `df`).
    """
    start_time = time.time()
    logger.info("Starting feature engineering...")
    
    # Create copies of data to avoid modifying original
    df_processed = df.copy()
    
    # Merge Weather & Traffic Data
    # Each .loc assignment creates the column on first use; rows of cities
    # that are absent from the lookup stay NaN until the fill steps below.
    for city in df_processed["City"]:
        if city in weather_data:
            df_processed.loc[df_processed["City"] == city, "Weather Quality"] = weather_data[city]["quality"]
            df_processed.loc[df_processed["City"] == city, "Temperature"] = weather_data[city].get("temp", 25)
            df_processed.loc[df_processed["City"] == city, "Weather Condition"] = weather_data[city].get("condition", "Unknown")
        if city in traffic_data:
            df_processed.loc[df_processed["City"] == city, "Traffic Level"] = traffic_data[city]["traffic_level"]
            df_processed.loc[df_processed["City"] == city, "City Type"] = traffic_data[city].get("city_type", "urban")
            df_processed.loc[df_processed["City"] == city, "Num Attractions"] = traffic_data[city].get("num_attractions", 5)

    # Fill missing weather quality with the median quality of rows that share
    # the same weather condition.
    # NOTE(review): in the merge loop above, condition and quality are set for
    # the same rows, so this mask looks like it can only match when `df`
    # already carried a 'Weather Condition' column - confirm intent.
    if "Weather Condition" in df_processed.columns and "Weather Quality" in df_processed.columns:
        condition_medians = df_processed.groupby("Weather Condition")["Weather Quality"].median().to_dict()
        
        # For each missing value, fill with the median of that condition
        for condition, median in condition_medians.items():
            mask = (df_processed["Weather Quality"].isna()) & (df_processed["Weather Condition"] == condition)
            df_processed.loc[mask, "Weather Quality"] = median
    
    # Fill remaining missing values with the column-wide median
    for col in ["Weather Quality", "Traffic Level", "Temperature", "Num Attractions"]:
        if col in df_processed.columns:
            missing_count = df_processed[col].isna().sum()
            if missing_count > 0:
                logger.info(f"Filling {missing_count} missing values in '{col}'")
                df_processed[col] = df_processed[col].fillna(df_processed[col].median())
    
    # One-hot encode categorical features; the prefixes become
    # "Weather_Condition" and "City_Type" (spaces replaced by underscores)
    categorical_cols = ["Weather Condition", "City Type"]
    for col in categorical_cols:
        if col in df_processed.columns:
            df_processed = pd.get_dummies(df_processed, columns=[col], prefix=col.replace(" ", "_"))
    
    # Create interaction features: weather quality scaled by how free the roads are
    df_processed["Weather_Traffic_Interaction"] = df_processed["Weather Quality"] * (10 - df_processed["Traffic Level"]) / 10
    
    # Temperature comfort score: peaks at 10 for a temperature of exactly 24
    # and loses half a point per degree of deviation, clipped to 0-10
    if "Temperature" in df_processed.columns:
        df_processed["Temp_Comfort"] = 10 - abs(df_processed["Temperature"] - 24) / 2
        df_processed["Temp_Comfort"] = df_processed["Temp_Comfort"].clip(0, 10)
    
    # Attractions score: attraction count relative to the largest count, scaled to 0-10
    if "Num Attractions" in df_processed.columns:
        max_attractions = df_processed["Num Attractions"].max()
        df_processed["Attractions_Score"] = df_processed["Num Attractions"] / max_attractions * 10
    
    # Create target variable as a weighted sum (weights add up to 1.0).
    # DataFrame.get() falls back to the constant 5 when an optional score
    # column was never created above.
    df_processed["Destination Score"] = (
        (0.35 * df_processed["Weather Quality"]) + 
        (0.30 * (10 - df_processed["Traffic Level"])) + 
        (0.15 * df_processed["Year_round"]) +
        (0.10 * df_processed.get("Temp_Comfort", 5)) +
        (0.10 * df_processed.get("Attractions_Score", 5))
    )
    
    # Create classification target: 1 when the score reaches 6.5, else 0
    df_processed["Good Destination"] = (df_processed["Destination Score"] >= 6.5).astype(int)
    
    # Log class balance
    class_counts = df_processed["Good Destination"].value_counts()
    logger.info(f"Class distribution - Good: {class_counts.get(1, 0)}, Not Good: {class_counts.get(0, 0)}")
    
    logger.info(f"Feature engineering completed in {time.time() - start_time:.2f} seconds")
    
    return df_processed

# Perform feature selection to improve model performance
def select_features(df_processed, correlation_threshold=0.85):
    """Pick the model's feature columns and prune redundant ones.

    Candidates are collected in a fixed order: the three base features, then
    whichever engineered features exist in ``df_processed``, then the one-hot
    columns prefixed ``Weather_Condition_`` and ``City_Type_``.  A candidate is
    dropped when its absolute correlation with any candidate listed before it
    exceeds ``correlation_threshold``.  Base features are exempt from pruning,
    so they are always part of the result.

    Parameters
    ----------
    df_processed : pd.DataFrame
        Frame that contains at least the three base feature columns.
    correlation_threshold : float, default 0.85
        Absolute correlation above which a non-base candidate is dropped.

    Returns
    -------
    list of str
        Selected feature names, in candidate order.
    """
    # Basic features we always want to include
    base_features = ["Weather Quality", "Traffic Level", "Year_round"]

    # Additional features if they exist
    additional_features = [
        "Temperature", "Weather_Traffic_Interaction",
        "Temp_Comfort", "Attractions_Score", "Num Attractions"
    ]

    # One-hot encoded features
    weather_condition_features = [col for col in df_processed.columns if col.startswith("Weather_Condition_")]
    city_type_features = [col for col in df_processed.columns if col.startswith("City_Type_")]

    # Combine all potential features
    potential_features = (
        base_features +
        [f for f in additional_features if f in df_processed.columns] +
        weather_condition_features +
        city_type_features
    )

    # Check for high correlation between features
    if len(potential_features) > 1:
        corr_matrix = df_processed[potential_features].corr().abs()
        # Keep only the strict upper triangle so each pair is inspected once and
        # a column is only ever compared with the candidates that precede it.
        upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

        # Base features are never dropped, even when two of them are correlated.
        protected = set(base_features)
        to_drop = [
            column for column in upper_tri.columns
            if column not in protected and (upper_tri[column] > correlation_threshold).any()
        ]

        if to_drop:
            logger.info(f"Dropping highly correlated features: {to_drop}")
            potential_features = [f for f in potential_features if f not in to_drop]

    logger.info(f"Selected {len(potential_features)} features for model training")
    return potential_features

# Train Multiple Models with cross-validation and Select Best
def train_recommendation_models(df_processed, features):
    """Tune, evaluate and compare the candidate classifiers.

    A Random Forest and a Gradient Boosting classifier are each tuned with
    ``GridSearchCV`` (weighted F1) on the training part of a stratified 80/20
    split and then scored on the held-out part.  The model with the highest
    held-out weighted F1 is returned.

    Parameters
    ----------
    df_processed : pd.DataFrame
        Must contain every column named in ``features`` and the integer target
        column ``"Good Destination"``.
    features : list of str
        Names of the columns used as model inputs.

    Returns
    -------
    tuple
        ``(best_model, results, features, scaler)``.  ``results`` maps each
        successfully trained model name to its fitted estimator, metrics,
        feature importances, best parameters and classification report.
        ``scaler`` is the ``StandardScaler`` fitted on the training split.
        When no model could be trained the tuple is
        ``(None, None, features, scaler)``.
    """
    start_time = time.time()
    logger.info(f"🚀 Training models using {len(features)} features")

    # Prepare data. Casting to float keeps boolean one-hot columns from
    # turning the matrix into an object array.
    X = df_processed[features].to_numpy(dtype=float)
    y = df_processed["Good Destination"].values

    # Check for class imbalance
    class_counts = np.bincount(y)
    if len(class_counts) > 1:
        minority_share = min(class_counts) / sum(class_counts)
    else:
        minority_share = 1.0
    if minority_share < 0.2:
        logger.warning(f"Class imbalance detected: {class_counts}")

    # Split first, with stratification to handle imbalanced classes ...
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # ... then fit the scaler on the training rows only, so that no statistic
    # of the held-out rows leaks into training or hyperparameter search.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Model dictionary with optimized parameters
    models = {
        "Random Forest": RandomForestClassifier(
            random_state=42,
            n_jobs=-1,  # Use all available cores
            class_weight='balanced' if minority_share < 0.3 else None
        ),
        "Gradient Boosting": GradientBoostingClassifier(
            random_state=42,
            validation_fraction=0.1,
            n_iter_no_change=5,
            tol=1e-4
        )
    }

    # Parameter grid for GridSearchCV
    param_grids = {
        "Random Forest": {
            'n_estimators': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2],
            'max_features': ['sqrt', 'log2']
        },
        "Gradient Boosting": {
            'n_estimators': [100, 200],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 5],
            'subsample': [0.8, 1.0],
            'max_features': ['sqrt', 'log2']
        }
    }

    best_model = None
    best_model_name = None
    best_score = 0
    results = {}

    # For small datasets, adjust the param grids
    if len(X_train) < 200:
        logger.info("Small dataset detected, simplifying parameter grid")
        for model_name in param_grids:
            for param in param_grids[model_name]:
                param_grids[model_name][param] = param_grids[model_name][param][:1]  # Just use first value

    # Number of CV folds: fewer for tiny training sets, but never below 2
    # (GridSearchCV rejects fewer) and never more than the rarest class has
    # members, so every fold can contain that class.
    cv_folds = min(5, len(X_train) // 10) if len(X_train) < 50 else 5
    _, train_class_counts = np.unique(y_train, return_counts=True)
    cv_folds = max(2, min(cv_folds, int(train_class_counts.min())))

    # Train and evaluate each model
    for name, model in models.items():
        model_start_time = time.time()
        logger.info(f"Training {name}...")

        # Use GridSearchCV for hyperparameter tuning
        grid_search = GridSearchCV(
            model, param_grids[name], cv=cv_folds,
            scoring='f1_weighted', n_jobs=-1 if len(X_train) > 1000 else 1
        )

        try:
            grid_search.fit(X_train, y_train)
            best_model_params = grid_search.best_estimator_

            # Evaluate on test set
            y_pred = best_model_params.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

            # Generate detailed classification report. Explicit labels keep the
            # two target names valid even if the test split holds one class.
            report = classification_report(
                y_test, y_pred, labels=[0, 1],
                target_names=['Not Good', 'Good'], output_dict=True
            )

            # Get feature importance
            if hasattr(best_model_params, 'feature_importances_'):
                feature_importance = best_model_params.feature_importances_
            else:
                feature_importance = np.zeros(len(features))

            results[name] = {
                'model': best_model_params,
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'feature_importance': feature_importance,
                'best_params': grid_search.best_params_,
                'classification_report': report
            }

            logger.info(f"  {name} - Accuracy: {accuracy:.4f}, F1: {f1:.4f}, "
                      f"Time: {time.time() - model_start_time:.2f}s")

            # Track best model; the first success is kept even if its F1 is 0
            if best_model is None or f1 > best_score:
                best_score = f1
                best_model = best_model_params
                best_model_name = name

        except Exception as e:
            logger.error(f"Error training {name}: {str(e)}")

    # Compare with None explicitly: ensemble estimators define __len__, so
    # truth-testing them is not a reliable "was a model trained" check.
    if best_model is None:
        logger.error("Failed to train any models successfully")
        return None, None, features, scaler

    # Print feature importance for the best model
    logger.info("\n🔍 Feature Importance for Best Model:")
    importance = results[best_model_name]['feature_importance']

    feature_importance_dict = {}
    for i, feat in enumerate(features):
        logger.info(f"  {feat}: {importance[i]:.4f}")
        feature_importance_dict[feat] = importance[i]

    # Sort features by importance
    sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
    logger.info(f"Most important features: {sorted_features[:3]}")

    logger.info(f"Model training completed in {time.time() - start_time:.2f} seconds")

    return best_model, results, features, scaler

# Save Model with all necessary components
def save_enhanced_model(model, features, scaler, results=None, filename="travel_recommendation_enhanced.pkl"):
    """Persist the trained model bundle as both a pickle and a joblib file.

    The bundle is a dict with the keys ``model``, ``features``, ``scaler``,
    ``version``, ``training_date`` and ``results``.  It is written to
    ``filename`` with ``pickle`` and to a sibling file with the extension
    ``.joblib`` with ``joblib``.

    Parameters
    ----------
    model : estimator or None
        Fitted model; nothing is written when it is ``None``.
    features : list of str
        Feature names the model was trained on.
    scaler : object
        Scaler that must be applied to inputs before prediction.
    results : dict, optional
        Per-model evaluation results stored alongside the model.
    filename : str
        Path of the pickle file.

    Returns
    -------
    bool
        ``True`` when both files were written, ``False`` otherwise (the error
        is logged, not raised).
    """
    if model is None:
        logger.error("No model to save")
        return False

    try:
        model_data = {
            "model": model,
            "features": features,
            "scaler": scaler,
            "version": "2.0",
            "training_date": datetime.now(),
            "results": results
        }

        with open(filename, "wb") as file:
            pickle.dump(model_data, file)

        logger.info(f"✅ Enhanced Model Saved as {filename}")

        # Also save as joblib file for faster loading.
        # Swap only the final extension; a plain str.replace would also touch
        # ".pkl" inside directory names and would return the same path (and so
        # overwrite the pickle above) when the name has no ".pkl" at all.
        root, _ = os.path.splitext(filename)
        joblib_filename = root + ".joblib"
        if joblib_filename == filename:
            # filename itself already ends in ".joblib": pick a distinct path
            joblib_filename = filename + ".joblib"
        joblib.dump(model_data, joblib_filename)
        logger.info(f"✅ Enhanced Model also saved as {joblib_filename}")

        return True
    except Exception as e:
        logger.error(f"Error saving model: {str(e)}")
        return False

# Generate Recommendations using the trained model
def generate_smart_recommendations(df, model, weather_data, traffic_data, features, scaler, num_recommendations=5):
    """Rank cities with the trained model and return the top ones.

    Features are rebuilt with ``engineer_features``, scaled with ``scaler`` and
    scored by ``model``.  The model score is blended with weather, traffic,
    year-round availability and temperature comfort into a 0-10
    ``"Final Score"``, and the ``num_recommendations`` highest-scoring rows are
    returned.

    Parameters
    ----------
    df : pd.DataFrame
        City frame accepted by ``engineer_features``.
    model : estimator or None
        Fitted classifier; ``None`` aborts with a logged error.
    weather_data, traffic_data : dict
        Per-city data passed through to ``engineer_features``.
    features : list of str
        Feature columns the model was trained on.  Columns missing from the
        engineered frame are created and filled with 0.
    scaler : object
        Fitted scaler exposing ``transform``.
    num_recommendations : int, default 5
        Number of rows to return.

    Returns
    -------
    pd.DataFrame or None
        Top rows with numeric columns rounded to 2 decimals, or ``None`` when
        no model was given or any step failed (the error is logged).
    """
    if model is None:
        logger.error("No model available for generating recommendations")
        return None

    logger.info("\n🎯 Generating Smart Recommendations...")

    try:
        # Engineer features for all cities
        df_processed = engineer_features(df, weather_data, traffic_data)

        # Check if all required features exist
        missing_features = [f for f in features if f not in df_processed.columns]
        if missing_features:
            logger.warning(f"Missing features in processed data: {missing_features}")
            # Create missing features with default values
            for feature in missing_features:
                df_processed[feature] = 0

        # Prepare feature matrix (float cast avoids an object array when
        # one-hot columns are boolean)
        X = df_processed[features].to_numpy(dtype=float)
        X_scaled = scaler.transform(X)

        # Get model predictions
        if hasattr(model, "predict_proba"):
            # If model supports probability, use it for ranking
            proba = model.predict_proba(X_scaled)
            # Locate the column of class 1 instead of assuming it is column 1:
            # a model fitted on a single class exposes only one column.
            classes = list(getattr(model, "classes_", [0, 1]))
            if 1 in classes:
                df_processed["Recommendation Score"] = proba[:, classes.index(1)]
            else:
                # The model never saw a "good" destination
                df_processed["Recommendation Score"] = 0.0
        else:
            # Fall back to the hard 0/1 prediction
            df_processed["Recommendation Score"] = model.predict(X_scaled).astype(float)

        # Calculate a weighted final score with confidence adjustment
        df_processed["Final Score"] = (
            (0.40 * df_processed["Weather Quality"] / 10) +  # Normalize to 0-1
            (0.25 * (10 - df_processed["Traffic Level"]) / 10) +  # Inverse and normalize
            (0.20 * df_processed["Recommendation Score"]) +  # Already 0-1
            (0.10 * df_processed["Year_round"]) +  # 0 or 1
            (0.05 * df_processed.get("Temp_Comfort", 5) / 10)  # Normalize to 0-1
        ) * 10  # Scale to 0-10

        # Get top recommendations
        recommendations = df_processed.sort_values(by="Final Score", ascending=False).head(num_recommendations)

        # Create a more detailed recommendation output
        columns_to_include = ["City", "Weather Quality", "Traffic Level",
                             "Recommendation Score", "Final Score", "Year_round"]

        optional_columns = ["Temp_Comfort", "Attractions_Score", "Num Attractions", "Temperature"]
        columns_to_include += [col for col in optional_columns if col in df_processed.columns]

        # Work on an explicit copy so rounding does not write into a slice of
        # `recommendations` (which triggers SettingWithCopyWarning).
        detailed_recommendations = recommendations[columns_to_include].copy()

        # Round scores for cleaner display
        numeric_columns = detailed_recommendations.select_dtypes(include=[np.number]).columns
        detailed_recommendations[numeric_columns] = detailed_recommendations[numeric_columns].round(2)

        return detailed_recommendations

    except Exception as e:
        logger.error(f"Error generating recommendations: {str(e)}")
        return None

# Main Execution Pipeline with enhanced process and error handling
def main():
    """Run the whole pipeline: load, enrich, train, save and recommend.

    Returns
    -------
    bool
        ``True`` when recommendations were produced and logged; ``False`` when
        data loading, training or recommendation failed, or when any step
        raised (the exception is logged, not propagated).
    """
    pipeline_started = time.time()
    logger.info("Starting Travel Recommendation System")

    tourism_df = load_data()
    if tourism_df is None:
        logger.error("Failed to load data. Exiting.")
        return False

    try:
        city_names = tourism_df["City"].tolist()
        logger.info(f"Processing {len(city_names)} cities")

        # Per-city weather and traffic inputs
        weather = get_enhanced_weather_data(city_names)
        traffic = get_enhanced_traffic_data(city_names)

        # Feature construction followed by feature selection
        engineered = engineer_features(tourism_df, weather, traffic)
        chosen_features = select_features(engineered)

        # Fit the candidate models and keep the winner
        winner, per_model_results, feature_names, fitted_scaler = train_recommendation_models(
            engineered, chosen_features
        )
        if winner is None:
            logger.error("Model training failed. Exiting.")
            return False

        # Persisting is best-effort: a failed save is reported but not fatal
        if not save_enhanced_model(winner, feature_names, fitted_scaler, per_model_results):
            logger.warning("Failed to save model")

        top_picks = generate_smart_recommendations(
            tourism_df, winner, weather, traffic, feature_names, fitted_scaler
        )
        if top_picks is None:
            logger.error("Failed to generate recommendations")
            return False

        logger.info("\n🏙️ Top 5 Recommended Destinations:\n")
        logger.info(top_picks)

        # Short textual summary of the ranking
        logger.info("\n📊 Recommendation Analysis:")
        leader = top_picks.iloc[0]
        logger.info(f"  Best destination: {leader['City']} with score {leader['Final Score']:.2f}/10")

        by_weather = engineered.sort_values('Weather Quality', ascending=False)
        weather_friendly = by_weather['City'].values[0]
        logger.info(f"  Most weather-friendly: {weather_friendly}")

        by_traffic = engineered.sort_values('Traffic Level')
        least_crowded = by_traffic['City'].values[0]
        logger.info(f"  Least crowded: {least_crowded}")

        logger.info(f"Total execution time: {time.time() - pipeline_started:.2f} seconds")
        return True

    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")
        return False

# Entry point: run the pipeline when this code is executed directly.
# In a notebook kernel __name__ is "__main__", so running the cell runs main().
if __name__ == "__main__":
    main()

2025-02-27 10:10:51,388 - INFO - Starting Travel Recommendation System
2025-02-27 10:10:51,396 - INFO - Removed 13 duplicate records
2025-02-27 10:10:51,399 - INFO - ✅ Dataset Loaded with 58 records.
2025-02-27 10:10:51,400 - INFO - Processing 58 cities
2025-02-27 10:10:51,403 - INFO - Generating new weather data
2025-02-27 10:10:51,405 - INFO - Generating new traffic data
2025-02-27 10:10:51,408 - INFO - Starting feature engineering...
2025-02-27 10:10:51,757 - INFO - Class distribution - Good: 5, Not Good: 53
2025-02-27 10:10:51,758 - INFO - Feature engineering completed in 0.35 seconds
2025-02-27 10:10:51,767 - INFO - Dropping highly correlated features: ['Weather_Traffic_Interaction', 'Num Attractions']
2025-02-27 10:10:51,768 - INFO - Selected 14 features for model training
2025-02-27 10:10:51,770 - INFO - 🚀 Training models using 14 features
2025-02-27 10:10:51,779 - INFO - Small dataset detected, simplifying parameter grid
2025-02-27 10:10:51,780 - INFO - Training Random Forest..

2025-02-27 10:10:53,352 - INFO -   Random Forest - Accuracy: 1.0000, F1: 1.0000, Time: 1.57s
2025-02-27 10:10:53,352 - INFO - Training Gradient Boosting...
2025-02-27 10:10:53,641 - INFO -   Gradient Boosting - Accuracy: 0.9167, F1: 0.9286, Time: 0.29s
2025-02-27 10:10:53,641 - INFO - 
🔍 Feature Importance for Best Model:
2025-02-27 10:10:53,641 - INFO -   Weather Quality: 0.3251
2025-02-27 10:10:53,641 - INFO -   Traffic Level: 0.1658
2025-02-27 10:10:53,653 - INFO -   Year_round: 0.0161
2025-02-27 10:10:53,655 - INFO -   Temperature: 0.1038
2025-02-27 10:10:53,655 - INFO -   Temp_Comfort: 0.1059
2025-02-27 10:10:53,658 - INFO -   Attractions_Score: 0.0657
2025-02-27 10:10:53,658 - INFO -   Weather_Condition_Clear: 0.0154
2025-02-27 10:10:53,658 - INFO -   Weather_Condition_Cloudy: 0.0093
2025-02-27 10:10:53,658 - INFO -   Weather_Condition_Rainy: 0.0080
2025-02-27 10:10:53,666 - INFO -   Weather_Condition_Sunny: 0.0733
2025-02-27 10:10:53,667 - INFO -   City_Type_metropolitan: 0.0642

2025-02-27 10:32:34,181 - INFO - Starting Travel Recommendation System
2025-02-27 10:32:34,206 - INFO - Removed 13 duplicate records
2025-02-27 10:32:34,214 - INFO - ✅ Dataset Loaded with 58 records.
2025-02-27 10:32:34,216 - INFO - Processing 58 cities
2025-02-27 10:32:34,217 - INFO - Generating new weather data
2025-02-27 10:32:34,219 - INFO - Generating new traffic data
2025-02-27 10:32:34,221 - INFO - Starting feature engineering...


2025-02-27 10:32:34,414 - INFO - Class distribution - Good: 5, Not Good: 53
2025-02-27 10:32:34,414 - INFO - Feature engineering completed in 0.19 seconds
2025-02-27 10:32:34,422 - INFO - Dropping highly correlated features: ['Weather_Traffic_Interaction', 'Num Attractions']
2025-02-27 10:32:34,422 - INFO - Selected 14 features for model training
2025-02-27 10:32:34,424 - INFO - 🚀 Training models using 14 features
2025-02-27 10:32:34,437 - INFO - Small dataset detected, simplifying parameter grid
2025-02-27 10:32:34,437 - INFO - Training Random Forest...
2025-02-27 10:32:35,869 - INFO -   Random Forest - Accuracy: 1.0000, F1: 1.0000, Time: 1.43s
2025-02-27 10:32:35,870 - INFO - Training Gradient Boosting...
2025-02-27 10:32:36,352 - INFO -   Gradient Boosting - Accuracy: 0.9167, F1: 0.9286, Time: 0.48s
2025-02-27 10:32:36,353 - INFO - 
🔍 Feature Importance for Best Model:
2025-02-27 10:32:36,353 - INFO -   Weather Quality: 0.3251
2025-02-27 10:32:36,355 - INFO -   Traffic Level: 0.1658

Model 2 (User Preferences)

In [None]:
import pandas as pd
import pickle
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.preprocessing import StandardScaler
import random
import joblib
import logging
from datetime import datetime
import time

# Set up logging
# Records at INFO and above are sent to two handlers: the file
# "model_training.log" and a stream handler.
# NOTE(review): logging.basicConfig does nothing when the root logger already
# has handlers (for example if an earlier cell in the same kernel configured
# logging), in which case the file handler below is never attached — confirm.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler("model_training.log"), logging.StreamHandler()]
)
# Logger used by every function in this cell
logger = logging.getLogger(__name__)

# API key: read from the environment only, so no credential is stored in the
# notebook. API_KEY_WEATHER is None when WEATHER_API_KEY is not set.
# NOTE(review): any key that was ever committed as a literal fallback here
# should be revoked and replaced.
API_KEY_WEATHER = os.environ.get('WEATHER_API_KEY')
if not API_KEY_WEATHER:
    logging.getLogger(__name__).warning(
        "WEATHER_API_KEY environment variable is not set; API_KEY_WEATHER is None."
    )

# Load Dataset with better error handling
def load_data(file_path="holidify.csv"):
    """Load the tourism CSV and return one cleaned row per city.

    City names are stripped of surrounding whitespace *before* duplicates are
    removed, so rows that differ only by padding count as the same city.
    Missing ``'Best Time to visit'`` values become ``'Throughout the year'``,
    and a 0/1 ``'Year_round'`` column flags rows whose best time contains that
    phrase.

    Parameters
    ----------
    file_path : str
        Path to the CSV file; it must provide the columns ``'City'`` and
        ``'Best Time to visit'``.

    Returns
    -------
    pd.DataFrame or None
        The cleaned frame, or ``None`` when the file is missing, a required
        column is absent, or loading fails (the reason is logged).
    """
    try:
        if not os.path.exists(file_path):
            logger.error(f"File not found: {file_path}")
            return None

        df = pd.read_csv(file_path)

        # Check if required columns exist
        required_columns = ['City', 'Best Time to visit']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            logger.error(f"Missing required columns: {missing_columns}")
            return None

        # Normalise names first; otherwise "Goa" and " Goa " would both
        # survive de-duplication and end up as two identical cities.
        df['City'] = df['City'].str.strip()

        # Drop duplicates, keeping the first occurrence of each city
        original_count = len(df)
        df = df.drop_duplicates(subset=['City'])
        logger.info(f"Removed {original_count - len(df)} duplicate records")

        # Fill missing values in categorical columns
        missing_best_time = df['Best Time to visit'].isna().sum()
        if missing_best_time > 0:
            logger.info(f"Filling {missing_best_time} missing values in 'Best Time to visit'")
            df['Best Time to visit'] = df['Best Time to visit'].fillna('Throughout the year')

        # Add columns for feature engineering
        df['Year_round'] = df['Best Time to visit'].str.contains('Throughout the year').astype(int)

        logger.info(f"✅ Dataset Loaded with {len(df)} records.")
        return df
    except Exception as e:
        logger.error(f"❌ Error loading data: {str(e)}")
        return None

# Cache for weather and traffic data to avoid regenerating during testing.
# Both dicts are keyed by city name and live for as long as the kernel does;
# get_enhanced_weather_data / get_enhanced_traffic_data read and update them.
_weather_cache = {}
_traffic_cache = {}

# Enhanced Weather Data Generation with more realistic distributions and caching
def get_enhanced_weather_data(cities, use_cache=True):
    """Generate (or fetch from cache) synthetic weather for each city.

    Values are drawn from a private ``random.Random(42)`` generator, so the
    result depends only on the order of ``cities`` and the global ``random``
    state of the caller is left untouched.

    Parameters
    ----------
    cities : iterable of str
        City names; they are processed in the given order.
    use_cache : bool, default True
        When true and every requested city is already in ``_weather_cache``,
        the cached entries are returned without generating anything.

    Returns
    -------
    dict
        ``{city: {"quality": int, "condition": str, "temp": int}}``.
        Freshly generated entries are also stored in ``_weather_cache``.
    """
    # Return cached data if available and requested
    if use_cache and _weather_cache and set(cities).issubset(_weather_cache.keys()):
        logger.info("Using cached weather data")
        return {city: _weather_cache[city] for city in cities}

    logger.info("Generating new weather data")

    # Weather conditions: sampling weight plus inclusive quality / temperature ranges
    conditions = {
        "Clear": {"quality_range": (7, 10), "prob": 0.4, "temp_range": (18, 28)},
        "Cloudy": {"quality_range": (5, 8), "prob": 0.3, "temp_range": (15, 25)},
        "Rainy": {"quality_range": (3, 6), "prob": 0.2, "temp_range": (12, 22)},
        "Sunny": {"quality_range": (8, 10), "prob": 0.1, "temp_range": (22, 35)}
    }
    condition_names = list(conditions.keys())
    condition_weights = [cond['prob'] for cond in conditions.values()]

    # Fixed seed for reproducibility. A dedicated generator is used instead of
    # random.seed() so that a seed set elsewhere in the notebook is not
    # overwritten; the drawn sequence is the same as seeding the module with 42.
    rng = random.Random(42)

    weather_data = {}
    for city in cities:
        # Weighted choice of condition, then quality and temperature (in that
        # order) from the ranges of the chosen condition
        condition = rng.choices(condition_names, weights=condition_weights, k=1)[0]
        quality_range = conditions[condition]["quality_range"]
        temp_range = conditions[condition]["temp_range"]

        quality = rng.randint(quality_range[0], quality_range[1])
        temperature = rng.randint(temp_range[0], temp_range[1])

        weather_data[city] = {
            "quality": quality,
            "condition": condition,
            "temp": temperature
        }

    # Update cache
    _weather_cache.update(weather_data)

    return weather_data

# Enhanced Traffic Data with seasonal variation and caching
def get_enhanced_traffic_data(cities, use_cache=True):
    """Generate (or fetch from cache) synthetic traffic data for each city.

    Values are drawn from a private ``random.Random(42)`` generator, so the
    result depends only on the order of ``cities`` and the global ``random``
    state of the caller is left untouched.

    Parameters
    ----------
    cities : iterable of str
        City names; they are processed in the given order.
    use_cache : bool, default True
        When true and every requested city is already in ``_traffic_cache``,
        the cached entries are returned without generating anything.

    Returns
    -------
    dict
        ``{city: {"traffic_level": int, "city_type": str,
        "places": {name: travel_time}, "num_attractions": int}}``.
        Freshly generated entries are also stored in ``_traffic_cache``.
    """
    # Return cached data if available and requested
    if use_cache and _traffic_cache and set(cities).issubset(_traffic_cache.keys()):
        logger.info("Using cached traffic data")
        return {city: _traffic_cache[city] for city in cities}

    logger.info("Generating new traffic data")

    # City types: sampling weight plus inclusive traffic / attraction-count ranges
    city_types = {
        "metropolitan": {"traffic_range": (6, 10), "prob": 0.3, "attractions": (8, 15)},
        "urban": {"traffic_range": (4, 8), "prob": 0.4, "attractions": (6, 10)},
        "suburban": {"traffic_range": (2, 6), "prob": 0.2, "attractions": (4, 8)},
        "rural": {"traffic_range": (1, 4), "prob": 0.1, "attractions": (2, 5)}
    }
    type_names = list(city_types.keys())
    type_weights = [type_data['prob'] for type_data in city_types.values()]

    # Fixed seed for reproducibility. A dedicated generator is used instead of
    # random.seed() so that a seed set elsewhere in the notebook is not
    # overwritten; the drawn sequence is the same as seeding the module with 42.
    rng = random.Random(42)

    traffic_data = {}
    for city in cities:
        # Assign city type based on probability
        city_type = rng.choices(type_names, weights=type_weights, k=1)[0]

        traffic_range = city_types[city_type]["traffic_range"]
        base_traffic = rng.randint(traffic_range[0], traffic_range[1])

        # Number of attractions based on city type
        num_attractions = rng.randint(*city_types[city_type]["attractions"])

        # Travel-time bounds grow with the traffic level; they are the same
        # for every place of a city, so compute them once.
        min_time = 10 + (base_traffic * 2)
        max_time = 20 + (base_traffic * 5)
        places = {}
        for i in range(1, num_attractions + 1):
            places[f"{city} Place {i}"] = rng.randint(min_time, max_time)

        traffic_data[city] = {
            "traffic_level": base_traffic,
            "city_type": city_type,
            "places": places,
            "num_attractions": num_attractions
        }

    # Update cache
    _traffic_cache.update(traffic_data)

    return traffic_data

# Feature Engineering Function with more advanced features
def engineer_features(df, weather_data, traffic_data):
    """Attach weather/traffic data to each city and derive model features.

    Steps, in order: merge the per-city weather and traffic values, fill
    missing numeric values with medians, one-hot encode ``"Weather Condition"``
    and ``"City Type"``, add the interaction / comfort / attraction scores, and
    finally compute ``"Destination Score"`` and the binary target
    ``"Good Destination"`` (score >= 6.5).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``"City"`` and ``"Year_round"``.  It is not
        modified; a copy is processed.
    weather_data : dict
        ``{city: {"quality": ..., "temp": ..., "condition": ...}}``; ``temp``
        defaults to 25 and ``condition`` to ``"Unknown"`` when absent.
    traffic_data : dict
        ``{city: {"traffic_level": ..., "city_type": ..., "num_attractions": ...}}``;
        ``city_type`` defaults to ``"urban"`` and ``num_attractions`` to 5.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` extended with the merged, encoded and derived columns.

    Raises
    ------
    KeyError
        If no city of ``df`` is present in ``weather_data`` or in
        ``traffic_data``, because the columns required for the scores are then
        never created.
    """
    start_time = time.time()
    logger.info("Starting feature engineering...")

    # Create copies of data to avoid modifying original
    df_processed = df.copy()
    cities = df_processed["City"]

    def _attach(frame, column, source, extract, numeric):
        """Add `column` to `frame` by looking each city up in `source`."""
        # Build city -> value once and broadcast it with a single map call,
        # rather than re-scanning the whole frame for every city.
        lookup = {city: extract(source[city]) for city in cities.unique() if city in source}
        if not lookup:
            # No city matched: leave the column absent so the column-existence
            # checks further down skip it.
            return
        values = cities.map(lookup)
        frame[column] = values.astype(float) if numeric else values

    # Merge Weather & Traffic Data
    _attach(df_processed, "Weather Quality", weather_data, lambda w: w["quality"], True)
    _attach(df_processed, "Temperature", weather_data, lambda w: w.get("temp", 25), True)
    _attach(df_processed, "Weather Condition", weather_data, lambda w: w.get("condition", "Unknown"), False)
    _attach(df_processed, "Traffic Level", traffic_data, lambda t: t["traffic_level"], True)
    _attach(df_processed, "City Type", traffic_data, lambda t: t.get("city_type", "urban"), False)
    _attach(df_processed, "Num Attractions", traffic_data, lambda t: t.get("num_attractions", 5), True)

    # Fill Missing Values with more intelligent defaults based on similar cities
    # For weather quality, group by weather condition
    if "Weather Condition" in df_processed.columns and "Weather Quality" in df_processed.columns:
        condition_medians = df_processed.groupby("Weather Condition")["Weather Quality"].median().to_dict()

        # For each missing value, fill with the median of that condition
        for condition, median in condition_medians.items():
            mask = (df_processed["Weather Quality"].isna()) & (df_processed["Weather Condition"] == condition)
            df_processed.loc[mask, "Weather Quality"] = median

    # Fill remaining missing values with medians
    for col in ["Weather Quality", "Traffic Level", "Temperature", "Num Attractions"]:
        if col in df_processed.columns:
            missing_count = df_processed[col].isna().sum()
            if missing_count > 0:
                logger.info(f"Filling {missing_count} missing values in '{col}'")
                df_processed[col] = df_processed[col].fillna(df_processed[col].median())

    # One-hot encode categorical features
    categorical_cols = ["Weather Condition", "City Type"]
    for col in categorical_cols:
        if col in df_processed.columns:
            df_processed = pd.get_dummies(df_processed, columns=[col], prefix=col.replace(" ", "_"))

    # Create interaction features
    df_processed["Weather_Traffic_Interaction"] = df_processed["Weather Quality"] * (10 - df_processed["Traffic Level"]) / 10

    # Temperature comfort score: 10 at 24°C, minus half a point per degree
    # away from it, clipped to the range 0-10
    if "Temperature" in df_processed.columns:
        df_processed["Temp_Comfort"] = 10 - abs(df_processed["Temperature"] - 24) / 2
        df_processed["Temp_Comfort"] = df_processed["Temp_Comfort"].clip(0, 10)

    # Attractions density score, relative to the city with the most attractions
    if "Num Attractions" in df_processed.columns:
        max_attractions = df_processed["Num Attractions"].max()
        if pd.notna(max_attractions) and max_attractions > 0:
            df_processed["Attractions_Score"] = df_processed["Num Attractions"] / max_attractions * 10
        else:
            # No attractions anywhere: avoid 0/0 and score every city 0
            df_processed["Attractions_Score"] = 0.0

    # Create target variable with more nuanced definition
    df_processed["Destination Score"] = (
        (0.35 * df_processed["Weather Quality"]) +
        (0.30 * (10 - df_processed["Traffic Level"])) +
        (0.15 * df_processed["Year_round"]) +
        (0.10 * df_processed.get("Temp_Comfort", 5)) +
        (0.10 * df_processed.get("Attractions_Score", 5))
    )

    # Create classification target
    df_processed["Good Destination"] = (df_processed["Destination Score"] >= 6.5).astype(int)

    # Log class balance
    class_counts = df_processed["Good Destination"].value_counts()
    logger.info(f"Class distribution - Good: {class_counts.get(1, 0)}, Not Good: {class_counts.get(0, 0)}")

    logger.info(f"Feature engineering completed in {time.time() - start_time:.2f} seconds")

    return df_processed

# Perform feature selection to improve model performance
def select_features(df_processed, correlation_threshold=0.85):
    """Pick the model's feature columns and prune redundant ones.

    Candidates are collected in a fixed order: the three base features, then
    whichever engineered features exist in ``df_processed``, then the one-hot
    columns prefixed ``Weather_Condition_`` and ``City_Type_``.  A candidate is
    dropped when its absolute correlation with any candidate listed before it
    exceeds ``correlation_threshold``.  Base features are exempt from pruning,
    so they are always part of the result.

    Parameters
    ----------
    df_processed : pd.DataFrame
        Frame that contains at least the three base feature columns.
    correlation_threshold : float, default 0.85
        Absolute correlation above which a non-base candidate is dropped.

    Returns
    -------
    list of str
        Selected feature names, in candidate order.
    """
    # Basic features we always want to include
    base_features = ["Weather Quality", "Traffic Level", "Year_round"]

    # Additional features if they exist
    additional_features = [
        "Temperature", "Weather_Traffic_Interaction",
        "Temp_Comfort", "Attractions_Score", "Num Attractions"
    ]

    # One-hot encoded features
    weather_condition_features = [col for col in df_processed.columns if col.startswith("Weather_Condition_")]
    city_type_features = [col for col in df_processed.columns if col.startswith("City_Type_")]

    # Combine all potential features
    potential_features = (
        base_features +
        [f for f in additional_features if f in df_processed.columns] +
        weather_condition_features +
        city_type_features
    )

    # Check for high correlation between features
    if len(potential_features) > 1:
        corr_matrix = df_processed[potential_features].corr().abs()
        # Keep only the strict upper triangle so each pair is inspected once and
        # a column is only ever compared with the candidates that precede it.
        upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

        # Base features are never dropped, even when two of them are correlated.
        protected = set(base_features)
        to_drop = [
            column for column in upper_tri.columns
            if column not in protected and (upper_tri[column] > correlation_threshold).any()
        ]

        if to_drop:
            logger.info(f"Dropping highly correlated features: {to_drop}")
            potential_features = [f for f in potential_features if f not in to_drop]

    logger.info(f"Selected {len(potential_features)} features for model training")
    return potential_features

# Train Multiple Models with cross-validation and Select Best
def train_recommendation_models(df_processed, features):
    """
    Tune and compare the candidate classifiers and return the best one.

    The data is split 80/20 first, and the scaler is fitted on the training
    portion only so that test-set statistics cannot leak into training. Each
    candidate (Random Forest, Gradient Boosting) is tuned with GridSearchCV
    using weighted F1, then scored on the held-out test split. The candidate
    with the highest test F1 wins.

    Parameters:
    -----------
    df_processed : pd.DataFrame
        Feature-engineered data containing every column in ``features`` plus
        the binary target column "Good Destination".
    features : list of str
        Names of the columns used as model inputs.

    Returns:
    --------
    tuple
        ``(best_model, results, features, scaler)``. ``results`` maps each
        successfully trained model name to its metrics, feature importances,
        best parameters and classification report. When no model could be
        trained the tuple is ``(None, None, features, scaler)``.
    """
    start_time = time.time()
    logger.info(f"🚀 Training models using {len(features)} features")

    # Prepare data
    X = df_processed[features].values
    y = df_processed["Good Destination"].values

    # Check for class imbalance
    class_counts = np.bincount(y)
    minority_ratio = class_counts.min() / class_counts.sum() if len(class_counts) > 1 else None
    if minority_ratio is not None and minority_ratio < 0.2:
        logger.warning(f"Class imbalance detected: {class_counts}")

    # A stratified split needs at least two rows of every class that occurs;
    # fall back to a plain split rather than crashing on a singleton class.
    observed_counts = class_counts[class_counts > 0]
    can_stratify = observed_counts.size > 0 and observed_counts.min() >= 2
    if not can_stratify:
        logger.warning("A class has fewer than 2 samples; splitting without stratification")

    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y if can_stratify else None
    )

    # Normalize features: fit on the training rows only, then apply the same
    # transformation to the test rows (avoids train/test leakage).
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Model dictionary with optimized parameters
    models = {
        "Random Forest": RandomForestClassifier(
            random_state=42,
            n_jobs=-1,  # Use all available cores
            class_weight='balanced' if minority_ratio is not None and minority_ratio < 0.3 else None
        ),
        "Gradient Boosting": GradientBoostingClassifier(
            random_state=42,
            validation_fraction=0.1,
            n_iter_no_change=5,
            tol=1e-4
        )
    }

    # Parameter grid for GridSearchCV
    param_grids = {
        "Random Forest": {
            'n_estimators': [100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5],
            'min_samples_leaf': [1, 2],
            'max_features': ['sqrt', 'log2']
        },
        "Gradient Boosting": {
            'n_estimators': [100, 200],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 5],
            'subsample': [0.8, 1.0],
            'max_features': ['sqrt', 'log2']
        }
    }

    n_train = len(X_train)

    # For small datasets, adjust the param grids
    if n_train < 200:
        logger.info("Small dataset detected, simplifying parameter grid")
        for model_name in param_grids:
            for param in param_grids[model_name]:
                param_grids[model_name][param] = param_grids[model_name][param][:1]  # Just use first value

    # Cross-validation needs at least 2 folds; the old formula produced 0 or 1
    # for fewer than 20 training rows, which made every grid search fail.
    cv_folds = 5 if n_train >= 50 else max(2, min(5, n_train // 10))

    best_model = None
    best_model_name = None
    best_score = float("-inf")  # so a model with F1 == 0 can still be selected
    results = {}

    # Train and evaluate each model
    for name, model in models.items():
        model_start_time = time.time()
        logger.info(f"Training {name}...")

        # Use GridSearchCV for hyperparameter tuning
        grid_search = GridSearchCV(
            model, param_grids[name], cv=cv_folds,
            scoring='f1_weighted', n_jobs=-1 if n_train > 1000 else 1
        )

        try:
            grid_search.fit(X_train, y_train)
            tuned_model = grid_search.best_estimator_

            # Evaluate on test set
            y_pred = tuned_model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

            # Pin the label set so the two target names always line up, even
            # when the test split happens to contain a single class.
            report = classification_report(
                y_test, y_pred, labels=[0, 1],
                target_names=['Not Good', 'Good'], output_dict=True
            )

            # Get feature importance
            if hasattr(tuned_model, 'feature_importances_'):
                feature_importance = tuned_model.feature_importances_
            else:
                feature_importance = np.zeros(len(features))

            results[name] = {
                'model': tuned_model,
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'feature_importance': feature_importance,
                'best_params': grid_search.best_params_,
                'classification_report': report
            }

            logger.info(f"  {name} - Accuracy: {accuracy:.4f}, F1: {f1:.4f}, "
                      f"Time: {time.time() - model_start_time:.2f}s")

            # Track best model (strict '>' keeps the first model on ties)
            if f1 > best_score:
                best_score = f1
                best_model = tuned_model
                best_model_name = name

        except Exception as e:
            # Best effort: one failing candidate must not prevent the others
            # from being trained.
            logger.error(f"Error training {name}: {str(e)}")

    # Compare against None explicitly: truthiness of an ensemble estimator
    # goes through __len__ and says nothing about whether training succeeded.
    if best_model is None:
        logger.error("Failed to train any models successfully")
        return None, None, features, scaler

    # Print feature importance for the best model
    logger.info("\n🔍 Feature Importance for Best Model:")
    importance = results[best_model_name]['feature_importance']

    feature_importance_dict = {}
    for i, feat in enumerate(features):
        logger.info(f"  {feat}: {importance[i]:.4f}")
        feature_importance_dict[feat] = importance[i]

    # Sort features by importance
    sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
    logger.info(f"Most important features: {sorted_features[:3]}")

    logger.info(f"Model training completed in {time.time() - start_time:.2f} seconds")

    return best_model, results, features, scaler

# Save Model with all necessary components
def save_enhanced_model(model, features, scaler, results=None, filename="travel_recommendation_enhanced.pkl"):
    """
    Persist the trained model and everything needed to reuse it.

    The bundle (model, feature list, scaler, version, training timestamp and
    evaluation results) is written with pickle to ``filename``. A second copy
    is written with joblib next to it, using the same base name and a
    ``.joblib`` extension. The joblib copy is optional: if it cannot be
    written, a warning is logged and the save still counts as successful.

    Parameters:
    -----------
    model : object or None
        Trained estimator. ``None`` aborts the save.
    features : list of str
        Feature names the model was trained on.
    scaler : object
        Fitted scaler used to transform the features.
    results : dict, optional
        Evaluation results to store alongside the model.
    filename : str
        Destination path of the pickle file.

    Returns:
    --------
    bool
        True if the pickle file was written, False otherwise.
    """
    # Imported locally so the timestamp works regardless of whether the
    # notebook bound the name `datetime` to the module or to the class.
    from datetime import datetime

    if model is None:
        logger.error("No model to save")
        return False

    try:
        model_data = {
            "model": model,
            "features": features,
            "scaler": scaler,
            "version": "2.0",
            "training_date": datetime.now(),
            "results": results
        }

        with open(filename, "wb") as file:
            pickle.dump(model_data, file)

        logger.info(f"✅ Enhanced Model Saved as {filename}")
    except Exception as e:
        logger.error(f"Error saving model: {str(e)}")
        return False

    # Swap only the final extension. str.replace('.pkl', ...) would also
    # rewrite directory names containing '.pkl' and would leave any other
    # extension untouched, making the joblib dump overwrite the pickle.
    base_name, _extension = os.path.splitext(filename)
    joblib_filename = base_name + ".joblib"

    try:
        # Also save as joblib file for faster loading
        joblib.dump(model_data, joblib_filename)
        logger.info(f"✅ Enhanced Model also saved as {joblib_filename}")
    except Exception as e:
        logger.warning(f"Could not write joblib copy {joblib_filename}: {str(e)}")

    return True

# Load a saved model
def load_model(filename="travel_recommendation_enhanced.joblib"):
    """
    Read a saved model bundle from disk.

    The joblib file at ``filename`` is preferred. If it does not exist, the
    sibling pickle file (same path with '.joblib' replaced by '.pkl') is
    tried. The loaded bundle must contain the keys "model", "features" and
    "scaler".

    Note: both joblib and pickle can execute arbitrary code while loading,
    so only open files that come from a trusted source.

    Parameters:
    -----------
    filename : str
        Path of the joblib file to load.

    Returns:
    --------
    dict or None
        The loaded bundle, or None when no file is found, a required key is
        missing, or loading raises.
    """
    try:
        if os.path.exists(filename):
            # Preferred format (faster loading)
            bundle = joblib.load(filename)
            logger.info(f"✅ Model loaded successfully from {filename}")
        else:
            # Fall back to the pickle written next to the joblib file
            pickle_path = filename.replace('.joblib', '.pkl')
            if not os.path.exists(pickle_path):
                logger.error(f"Model file not found: {filename}")
                return None
            with open(pickle_path, "rb") as handle:
                bundle = pickle.load(handle)
            logger.info(f"✅ Model loaded successfully from {pickle_path}")

        # Reject bundles that lack any component needed for prediction
        absent = [name for name in ("model", "features", "scaler") if name not in bundle]
        if absent:
            logger.error(f"Model data missing required keys: {absent}")
            return None

        return bundle
    except Exception as e:
        logger.error(f"Error loading model: {str(e)}")
        return None

# Generate Recommendations using the trained model
def generate_smart_recommendations(df, model, weather_data, traffic_data, features, scaler, num_recommendations=5):
    """
    Rank cities by blending the model's prediction with live conditions.

    Features are engineered for every city, scaled with ``scaler`` and scored
    by ``model``. The final 0-10 score is a weighted blend: 40% weather
    quality, 25% inverse traffic level, 20% model score, 10% year-round
    suitability and 5% temperature comfort.

    Parameters:
    -----------
    df : pd.DataFrame
        Original city data.
    model : object or None
        Trained classifier. ``None`` aborts with an error log.
    weather_data, traffic_data : object
        Passed through unchanged to ``engineer_features``.
    features : list of str
        Feature columns the model expects. Columns missing after feature
        engineering are created and filled with 0.
    scaler : object
        Fitted scaler matching ``features``.
    num_recommendations : int, default 5
        Number of top cities to return.

    Returns:
    --------
    pd.DataFrame or None
        Top cities with their scores rounded to 2 decimals, or None when no
        model is supplied or any step raises.
    """
    if model is None:
        logger.error("No model available for generating recommendations")
        return None

    logger.info("\n🎯 Generating Smart Recommendations...")

    try:
        # Engineer features for all cities
        df_processed = engineer_features(df, weather_data, traffic_data)

        # Check if all required features exist
        missing_features = [f for f in features if f not in df_processed.columns]
        if missing_features:
            logger.warning(f"Missing features in processed data: {missing_features}")
            # Create missing features with default values
            for feature in missing_features:
                df_processed[feature] = 0

        # Prepare feature matrix
        X_scaled = scaler.transform(df_processed[features].values)

        # Get model predictions
        if hasattr(model, "predict_proba"):
            proba = model.predict_proba(X_scaled)
            # Look up the column of class 1 instead of assuming it is column 1:
            # a model fitted on a single class has only one probability column.
            class_labels = list(getattr(model, "classes_", range(proba.shape[1])))
            if 1 in class_labels:
                df_processed["Recommendation Score"] = proba[:, class_labels.index(1)]
            else:
                logger.warning("Model was not trained on any good destination; using a score of 0")
                df_processed["Recommendation Score"] = 0.0
        else:
            # No probabilities available: use the hard 0/1 prediction
            df_processed["Recommendation Score"] = model.predict(X_scaled).astype(float)

        # Calculate a weighted final score with confidence adjustment
        df_processed["Final Score"] = (
            (0.40 * df_processed["Weather Quality"] / 10) +  # Normalize to 0-1
            (0.25 * (10 - df_processed["Traffic Level"]) / 10) +  # Inverse and normalize
            (0.20 * df_processed["Recommendation Score"]) +  # Already 0-1
            (0.10 * df_processed["Year_round"]) +  # 0 or 1
            (0.05 * df_processed.get("Temp_Comfort", 5) / 10)  # Normalize to 0-1
        ) * 10  # Scale to 0-10

        # Get top recommendations
        recommendations = df_processed.sort_values(by="Final Score", ascending=False).head(num_recommendations)

        # Create a more detailed recommendation output
        columns_to_include = ["City", "Weather Quality", "Traffic Level",
                             "Recommendation Score", "Final Score", "Year_round"]
        optional_columns = ["Temp_Comfort", "Attractions_Score", "Num Attractions", "Temperature"]
        columns_to_include += [col for col in optional_columns if col in df_processed.columns]

        # Work on an explicit copy so the rounding below writes to an
        # independent frame (no SettingWithCopyWarning, no silent no-op).
        detailed_recommendations = recommendations[columns_to_include].copy()

        # Round scores for cleaner display
        numeric_columns = detailed_recommendations.select_dtypes(include=[np.number]).columns
        for col in numeric_columns:
            detailed_recommendations[col] = detailed_recommendations[col].round(2)

        return detailed_recommendations

    except Exception as e:
        logger.error(f"Error generating recommendations: {str(e)}")
        return None

# NEW: Function to get similar cities based on a reference city
def get_similar_cities(reference_city, df, weather_data, traffic_data, features, scaler, top_n=5):
    """
    Find the cities whose scaled feature vectors are closest to a reference city.

    Closeness is the Euclidean distance between scaled feature vectors. The
    reported "Similarity Score" is ``10 - distance`` rounded to 2 decimals;
    it is not clamped, so very dissimilar cities can score below zero.

    Parameters:
    -----------
    reference_city : str
        Name of the city to compare against; must appear in ``df["City"]``.
    df : pd.DataFrame
        Original city data.
    weather_data, traffic_data : object
        Passed through unchanged to ``engineer_features``.
    features : list of str
        Feature columns to compare. Columns missing after feature
        engineering are created and filled with 0.
    scaler : object
        Fitted scaler matching ``features``.
    top_n : int, default 5
        Number of similar cities to return.

    Returns:
    --------
    pd.DataFrame or None
        Columns "City" and "Similarity Score", nearest city first. None when
        the reference city is unknown or any step raises.
    """
    try:
        if reference_city not in df["City"].values:
            logger.error(f"Reference city '{reference_city}' not found in dataset")
            return None

        # Engineer features
        df_processed = engineer_features(df, weather_data, traffic_data)

        # Fill missing features
        for feature in features:
            if feature not in df_processed.columns:
                df_processed[feature] = 0

        is_reference = (df_processed["City"] == reference_city).to_numpy()
        if not is_reference.any():
            # Feature engineering removed the city; report it by name instead
            # of failing with an index error further down.
            logger.error(f"Reference city '{reference_city}' not found in dataset")
            return None

        # Scale the whole matrix once instead of once per row
        scaled = scaler.transform(df_processed[features].values)
        reference_scaled = scaled[is_reference][0]  # first match if the name repeats
        candidate_scaled = scaled[~is_reference]
        candidate_cities = df_processed.loc[~is_reference, "City"].tolist()

        # Euclidean distance from every other city to the reference city
        distances = np.sqrt(np.sum((candidate_scaled - reference_scaled) ** 2, axis=1))

        # A stable sort keeps the original row order among equally distant cities
        nearest = np.argsort(distances, kind="stable")[:top_n]

        # Explicit columns so an empty result still has the expected schema
        similar_df = pd.DataFrame(
            [
                {"City": candidate_cities[i], "Similarity Score": round(10 - distances[i], 2)}
                for i in nearest
            ],
            columns=["City", "Similarity Score"],
        )

        return similar_df

    except Exception as e:
        logger.error(f"Error finding similar cities: {str(e)}")
        return None

# NEW: User preferences based recommendation
def recommend_based_on_preferences(df, model_data, user_preferences, num_recommendations=5):
    """
    Generate recommendations based on user preferences

    The five importance weights are always rescaled to sum to 1, so the
    final score stays on a 0-10 scale whatever magnitudes the caller passes.
    Rescaling does not change the ranking. When a season is given, the
    season match takes ``season_importance`` of the score and the other
    weights share the remainder.

    Parameters:
    df (pandas.DataFrame): Original city data
    model_data (dict): Loaded model data containing model, features, and scaler
    user_preferences (dict): Dictionary with user preferences like:
        - weather_importance (float): importance of good weather (default 0.3)
        - crowd_importance (float): importance of less crowds (default 0.3)
        - attractions_importance (float): importance of attractions (default 0.2)
        - year_round_importance (float): importance of year-round suitability (default 0.1)
        - temperature_importance (float): importance of comfortable temperature (default 0.1)
        - season (str): Preferred season ('Summer', 'Winter', 'Spring', 'Fall'),
          matched case-insensitively against "Best Time to visit"
        - season_importance (float): 0-1 share of the score given to the
          season match when a season is set (default 0.2)
        ``None`` is treated as an empty dictionary.
    num_recommendations (int): Number of top cities to return

    Returns:
    pandas.DataFrame: Top recommendations based on user preferences, or None
    if any step raises
    """
    try:
        user_preferences = user_preferences or {}

        model = model_data["model"]
        features = model_data["features"]
        scaler = model_data["scaler"]

        # Get base data
        cities = df["City"].tolist()
        weather_data = get_enhanced_weather_data(cities)
        traffic_data = get_enhanced_traffic_data(cities)

        # Process data
        df_processed = engineer_features(df, weather_data, traffic_data)

        # Check if all required features exist
        for feature in features:
            if feature not in df_processed.columns:
                df_processed[feature] = 0

        # Prepare feature matrix for model prediction
        X_scaled = scaler.transform(df_processed[features].values)

        # Get model predictions
        if hasattr(model, "predict_proba"):
            proba = model.predict_proba(X_scaled)
            # Look up the column of class 1 instead of assuming it is column 1:
            # a model fitted on a single class has only one probability column.
            class_labels = list(getattr(model, "classes_", range(proba.shape[1])))
            if 1 in class_labels:
                df_processed["Recommendation Score"] = proba[:, class_labels.index(1)]
            else:
                logger.warning("Model was not trained on any good destination; using a score of 0")
                df_processed["Recommendation Score"] = 0.0
        else:
            df_processed["Recommendation Score"] = model.predict(X_scaled).astype(float)

        # Collect the importance weights, falling back to the defaults
        default_weights = {
            "weather_importance": 0.3,
            "crowd_importance": 0.3,
            "attractions_importance": 0.2,
            "year_round_importance": 0.1,
            "temperature_importance": 0.1,
        }
        weights = {key: user_preferences.get(key, default) for key, default in default_weights.items()}
        total = sum(weights.values())
        if total <= 0:
            # Cannot normalise an all-zero (or negative) weighting
            logger.warning("Preference weights sum to zero or less; using default weights")
            weights = dict(default_weights)
            total = sum(weights.values())

        # Adjust for season preference if specified
        season = user_preferences.get("season", None)
        season_importance = 0.0
        season_component = 0.0
        if season:
            season_importance = min(max(user_preferences.get("season_importance", 0.2), 0.0), 1.0)
            # na=False: a city without a recorded best time simply does not
            # match, instead of turning its whole score into NaN.
            df_processed["Season Match"] = (
                df["Best Time to visit"].str.contains(season, case=False, na=False).astype(float)
            )
            # Rows of df_processed that have no counterpart in df count as no match
            df_processed["Season Match"] = df_processed["Season Match"].fillna(0.0)
            season_component = season_importance * df_processed["Season Match"]

        # Normalise so that the five weights plus the season weight sum to 1
        scale = (1.0 - season_importance) / total
        weather_importance = weights["weather_importance"] * scale
        crowd_importance = weights["crowd_importance"] * scale
        attractions_importance = weights["attractions_importance"] * scale
        year_round_importance = weights["year_round_importance"] * scale
        temp_importance = weights["temperature_importance"] * scale

        df_processed["Final Score"] = (
            (weather_importance * df_processed["Weather Quality"] / 10) +
            (crowd_importance * (10 - df_processed["Traffic Level"]) / 10) +
            (attractions_importance * df_processed.get("Attractions_Score", 5) / 10) +
            (year_round_importance * df_processed["Year_round"]) +
            (temp_importance * df_processed.get("Temp_Comfort", 5) / 10) +
            season_component
        ) * 10

        # Get top recommendations
        recommendations = df_processed.sort_values(by="Final Score", ascending=False).head(num_recommendations)

        # Create a more detailed recommendation output
        columns_to_include = ["City", "Weather Quality", "Traffic Level",
                             "Recommendation Score", "Final Score", "Year_round"]
        optional_columns = ["Temp_Comfort", "Attractions_Score", "Num Attractions", "Temperature"]
        columns_to_include += [col for col in optional_columns if col in df_processed.columns]

        # Work on an explicit copy so the rounding below writes to an
        # independent frame (no SettingWithCopyWarning, no silent no-op).
        detailed_recommendations = recommendations[columns_to_include].copy()

        # Round scores for cleaner display
        numeric_columns = detailed_recommendations.select_dtypes(include=[np.number]).columns
        for col in numeric_columns:
            detailed_recommendations[col] = detailed_recommendations[col].round(2)

        return detailed_recommendations

    except Exception as e:
        logger.error(f"Error generating recommendations: {str(e)}")
        return None

# Main Execution Pipeline with enhanced process and error handling
def main():
    """
    Run the whole pipeline: load data, build features, train, save, recommend.

    Returns:
    --------
    bool
        True when recommendations were produced; False when data loading,
        model training or recommendation generation failed, or when any step
        inside the pipeline raised. A failed model save is only logged as a
        warning.
    """
    pipeline_started = time.time()
    logger.info("Starting Travel Recommendation System")

    df = load_data()
    if df is None:
        logger.error("Failed to load data. Exiting.")
        return False

    try:
        city_names = df["City"].tolist()
        logger.info(f"Processing {len(city_names)} cities")

        # Live context for every city, then the engineered feature table
        weather_data = get_enhanced_weather_data(city_names)
        traffic_data = get_enhanced_traffic_data(city_names)
        df_processed = engineer_features(df, weather_data, traffic_data)

        # Pick the input columns and train/compare the candidate models
        best_model, model_results, features, scaler = train_recommendation_models(
            df_processed, select_features(df_processed)
        )
        if best_model is None:
            logger.error("Model training failed. Exiting.")
            return False

        # Persist the model bundle; a failed save does not stop the run
        if not save_enhanced_model(best_model, features, scaler, model_results):
            logger.warning("Failed to save model")

        recommendations = generate_smart_recommendations(
            df, best_model, weather_data, traffic_data, features, scaler
        )
        if recommendations is None:
            logger.error("Failed to generate recommendations")
            return False

        logger.info("\n🏙️ Top 5 Recommended Destinations:\n")
        logger.info(recommendations)

        # Short textual summary of the ranking
        logger.info("\n📊 Recommendation Analysis:")
        top_pick = recommendations.iloc[0]
        logger.info(f"  Best destination: {top_pick['City']} with score {top_pick['Final Score']:.2f}/10")

        ranked_by_weather = df_processed.sort_values('Weather Quality', ascending=False)
        logger.info(f"  Most weather-friendly: {ranked_by_weather['City'].values[0]}")

        ranked_by_traffic = df_processed.sort_values('Traffic Level')
        logger.info(f"  Least crowded: {ranked_by_traffic['City'].values[0]}")

        logger.info(f"Total execution time: {time.time() - pipeline_started:.2f} seconds")
        return True

    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")
        return False

# Entry point: run the full pipeline when this code is executed directly
# (as a script, or as a notebook cell, where __name__ is "__main__").
# The boolean returned by main() is discarded here.
if __name__ == "__main__":
    main()