In [22]:
import pandas as pd
import numpy as np

In [23]:
flights = pd.read_csv('/Users/zayd/Downloads/F20EC CW2/AirFares_from_Dubai 3.csv')

# Data Preprocessing

In [24]:
flights.head(1)

Unnamed: 0,name,continent,country,destinationName,flightNumber,airlineCode_iata,destination_iata,airlineName,aircraftTerminal,Duration,Cost,Climate,Date,Time,Average_Cost
0,Houari Boumediene Airport,Africa,Algeria,Algiers,AH 4063,AH,ALG,Air Algerie,1,7h 35m,AED 2055,Hot,6/10/2024,3:59,low


In [25]:
flights.rename(columns={'name': 'Airport', 'continent':'Continent', 'country': 'Country', 'destinationName':'Destination', 'airlineName': 'Airline', 'flightNumber':'Flight Number', 'aircraftTerminal':'Airport Terminal', 'Duration':'Flight Duration', 'Cost':'Flight Cost', 'Average_Cost':'Destination Expenditure'}, inplace=True)

In [26]:
flights['Climate'].unique()

array(['Hot', 'Temperate cool', 'Warm Mildly humid', 'Tropical cool',
       'Tropical Temperate'], dtype=object)

In [27]:
flights.columns

Index(['Airport', 'Continent', 'Country', 'Destination', 'Flight Number',
       'airlineCode_iata', 'destination_iata', 'Airline', 'Airport Terminal',
       'Flight Duration', 'Flight Cost', 'Climate', 'Date', 'Time',
       'Destination Expenditure'],
      dtype='object')

In [28]:
flights['Destination Expenditure'].unique()

array(['low', 'Medium', 'medium', 'high', 'High'], dtype=object)

In [29]:
flights['Destination Expenditure'] = flights['Destination Expenditure'].replace('low', 'Low')
flights['Destination Expenditure'] = flights['Destination Expenditure'].replace('medium', 'Medium')
flights['Destination Expenditure'] = flights['Destination Expenditure'].replace('high', 'High')

In [30]:
flights['Destination Expenditure'].unique()

array(['Low', 'Medium', 'High'], dtype=object)

In [31]:
flights.isna().sum()

Airport                    0
Continent                  0
Country                    0
Destination                0
Flight Number              0
airlineCode_iata           0
destination_iata           0
Airline                    0
Airport Terminal           0
Flight Duration            0
Flight Cost                0
Climate                    0
Date                       0
Time                       0
Destination Expenditure    0
dtype: int64

In [32]:
flights.dtypes

Airport                    object
Continent                  object
Country                    object
Destination                object
Flight Number              object
airlineCode_iata           object
destination_iata           object
Airline                    object
Airport Terminal            int64
Flight Duration            object
Flight Cost                object
Climate                    object
Date                       object
Time                       object
Destination Expenditure    object
dtype: object

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
def flight_recommendation():

    # Retrieving input for user preferences
    user_climate_preference = int(input("Select your preferred climate:\n (1) Hot \n (2) Warm Mildly Humid \n (3) Tropical Temperate \n (4) Tropical Cool \n (5) Temperate Cool"))
    if user_climate_preference == 1:
        climate = "Hot"
    elif user_climate_preference == 2:
        climate = "Warm Mildly humid"
    elif user_climate_preference == 3:
        climate = "Tropical Temperate"
    elif user_climate_preference == 4:
        climate = "Tropical cool"
    elif user_climate_preference == 5:
        climate = "Temperate cool"

    user_expenditure_preference = int(input("Select your preferred destination expenditure range in USD:\n (1) 0-1000 USD \n (2) 1000-2000 USD \n (3) 2000+ USD"))
    if user_expenditure_preference == 1:
        expenditure = "Low"
    elif user_expenditure_preference == 2:
        expenditure = "Medium"
    elif user_expenditure_preference == 3:
        expenditure = "High"
    
    num_flights = int(input("Enter number of flight you need (Min:1, Max:5): "))

    # Filtering flights based on user preferences
    filtered_flights = flights[(flights['Climate'] == climate) & (flights['Destination Expenditure'] == expenditure)]

    if filtered_flights.empty: 
        print("Sorry, no flights were found according to your preferences.")
    
    else:

        # Combining user preferences into a single string for vectorization
        filtered_flights['combined_features'] = filtered_flights[['Climate', 'Destination Expenditure']].apply(lambda x: ' '.join(x.astype(str)), axis=1)

        # Initializing the TF-IDF Vectorizer
        tfidf_vectorizer = TfidfVectorizer()

        # Fiting and transforming the combined user preferences on the filtered flights
        tfidf_matrix_filtered = tfidf_vectorizer.fit_transform(filtered_flights['combined_features'])
        user_query = f"{climate} {expenditure}"
        user_pref_vector = tfidf_vectorizer.transform([user_query])

        # Calculating cosine similarity
        cosine_sim = cosine_similarity(user_pref_vector, tfidf_matrix_filtered)
    
        # Retrieving cosine similarity scores for all destinations and sorting them
        sim_scores = list(enumerate(cosine_sim[0]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
        # Retrieving indices of unique recommendations
        recommended_indexes = []
        seen = set()
        for i, _ in sim_scores:
            destination = filtered_flights.iloc[i]['Destination']
            flight_number = filtered_flights.iloc[i]['Flight Number']
        
            if (destination, flight_number) not in seen:
                recommended_indexes.append(i)
                seen.add((destination, flight_number))
        
            if len(recommended_indexes) >= num_flights:
                break
    
        # Returning top N unique recommendations (based on num_flights provided by user)
        recommendations = filtered_flights.iloc[recommended_indexes]
        final_recommendations = recommendations[['Airline', 'Flight Number', 'Airport', 'Airport Terminal', 'Destination', 'Country', 'Date', 'Flight Duration', 'Flight Cost', 'Destination Expenditure']]
    
        if final_recommendations.shape[0]==0:
            print("Sorry, no flights were found based on your preferences.")

        elif final_recommendations.shape[0] < num_flights:
            if final_recommendations.shape[0] == 1:
                print("Only the following 1 flight was found based on your preferences :-")
                return final_recommendations
            else:
                print("Only the following", final_recommendations.shape[0], "flights were found based on your preferences :-")
                return final_recommendations
        else:
            print("The following", num_flights, "flights were found based on your preferences :-")
            return final_recommendations

In [35]:
flight_recommendation()

The following 2 flights were found based on your preferences :-


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_flights['combined_features'] = filtered_flights[['Climate', 'Destination Expenditure']].apply(lambda x: ' '.join(x.astype(str)), axis=1)


Unnamed: 0,Airline,Flight Number,Airport,Airport Terminal,Destination,Country,Date,Flight Duration,Flight Cost,Destination Expenditure
0,Air Algerie,AH 4063,Houari Boumediene Airport,1,Algiers,Algeria,6/10/2024,7h 35m,AED 2055,Low
1,Emirates,EK 757,Houari Boumediene Airport,3,Algiers,Algeria,8/12/2024,7h 35m,AED 1886,Low
