In [None]:
!pip install pandas networkx python-louvain matplotlib plotly nbformat

## Section 1. Comminity Detection

#### Louvain Community Detection Algorithm on Expedia Dataset

**Introduction**

Community detection is a crucial task in graph analysis that helps uncover the underlying structure of graph networks by identifying groups or communities of nodes that are more densely connected internally than with the rest of the network. 
One of the most popular algorithms for community detection is the Louvain algorithm due to its efficiency and effectiveness.

In this notebook, we will explore the Louvain algorithm for community detection and apply it to the Expedia dataset 
to uncover patterns and insights that can help understand user behavior, marketing strategies, and more.

**1. Understanding the Louvain Algorithm**

The Louvain algorithm is a hierarchical community detection method that aims to optimize the modularity of a network. 
Modularity is a measure of the strength of division of a network into communities. 

The algorithm follows these steps:

1. Initialization: Each node in the network starts as its own community.
2. Local Modularity Optimization: Nodes are moved between communities to increase modularity. 
   This step continues iteratively until no further improvement can be made.
3. Community Aggregation: Once the best modularity is achieved, communities are treated as nodes themselves, 
   and steps 1 and 2 are repeated, creating a hierarchy of communities.
4. Final Community Structure: The process stops when no further modularity gain is possible, 
   resulting in the final community structure.

In [1]:
## 2. Preparing the Expedia Dataset

"""
We will begin by loading and preparing the Expedia dataset. This involves loading the dataset into a DataFrame, 
cleaning the data, and constructing a graph that represents user-hotel interactions.
"""

import pandas as pd
import networkx as nx

# Load the dataset
expedia_data = pd.read_csv('data.csv')  # Replace with actual dataset path

# Display the first few rows of the dataset
expedia_data.head()

"""
Now, we will clean the data by handling missing values, removing duplicates, 
and selecting relevant features for our graph construction.
"""

# Example of data cleaning (adjust based on actual dataset)
expedia_data = expedia_data.dropna()  # Remove missing values
expedia_data = expedia_data.drop_duplicates()  # Remove duplicates

# Display the cleaned data
expedia_data.head()


Unnamed: 0.1,Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
4,40586,2014-08-28 10:14:49,2,3,66,348,25443,9.3326,152970,0,...,0,1,12126,1,0,1,2,50,609,18
5,116747,2014-08-23 23:43:03,2,3,66,356,22202,536.0621,437458,1,...,0,1,8279,1,0,2,2,50,1230,19
6,432062,2014-08-12 14:16:57,2,3,66,220,19416,4870.7227,321691,0,...,0,1,53606,6,0,1,6,204,1449,30
7,149561,2014-08-22 08:13:06,2,3,66,363,12346,84.2274,573429,0,...,0,1,8267,1,0,1,2,50,675,70
8,262322,2014-08-27 18:19:47,2,3,66,348,53377,1561.3494,942802,0,...,0,1,8864,1,0,4,4,47,1508,52


In [2]:
"""
Next, we'll construct a graph where each node represents a user or hotel, and an edge represents a booking or search.
"""

# Constructing a user-hotel interaction graph
G = nx.Graph()

for index, row in expedia_data.iterrows():
    user_id = row['user_id']
    hotel_cluster = row['hotel_cluster']
    G.add_edge(user_id, hotel_cluster)

# Display basic information about the graph
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")
# print(f"Is the graph connected? {'Yes' if nx.is_connected(G) else 'No'}")

Number of nodes: 1675
Number of edges: 1627


In [3]:
## 3. Applying the Louvain Algorithm

"""
With the graph prepared, we can now apply the Louvain algorithm to detect communities.
"""

import community as community_louvain
import networkx as nx
import plotly.graph_objects as go

# Apply the Louvain algorithm to detect communities
partition = community_louvain.best_partition(G)

"""
Let's visualize the detected communities to understand the structure of the network.
"""

# Get positions for each node
pos = nx.spring_layout(G)

# Create an edge trace
edge_x = []
edge_y = []
for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.append(x0)
    edge_x.append(x1)
    edge_x.append(None)
    edge_y.append(y0)
    edge_y.append(y1)
    edge_y.append(None)

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines')

# Create node traces for each community
node_traces = []
for community_number in set(partition.values()):
    node_x = []
    node_y = []
    for node in partition.keys():
        if partition[node] == community_number:
            x, y = pos[node]
            node_x.append(x)
            node_y.append(y)
    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers',
        hoverinfo='text',
        marker=dict(
            size=10,
            line_width=2),
        name=f'Community {community_number}')
    node_traces.append(node_trace)

# Create the plot
fig = go.Figure(data=[edge_trace] + node_traces,
                layout=go.Layout(
                    title='<br>Louvain Community Detection Visualization',
                    titlefont_size=16,
                    showlegend=True,
                    hovermode='closest',
                    margin=dict(b=0, l=0, r=0, t=40),
                    annotations=[dict(
                        text="Interactive visualization of communities",
                        showarrow=False,
                        xref="paper", yref="paper",
                        x=0.005, y=-0.002)],
                    xaxis=dict(showgrid=False, zeroline=False),
                    yaxis=dict(showgrid=False, zeroline=False))
               )

# Show the figure
fig.show()



**4. Analyzing the Results**

After applying the Louvain algorithm, we can analyze the resulting communities to gain insights.

**Community Profiling**

We can start by understanding the characteristics of each community. For instance, are there communities 
that primarily book luxury hotels versus budget hotels?

**User Segmentation**

Segment users based on their community membership. This can help in targeting marketing efforts or customizing user experiences.

**Market Insights**

Explore how different hotel chains or locations cluster together within communities. This could reveal competition or collaboration opportunities.

In [19]:
# Community Profiling
# Objective: Understand the characteristics of each community, 
# such as their booking preferences (luxury vs. budget hotels).

import pandas as pd

# Assuming 'cnt' is a column that indicates the Numer of similar events in the context of the same user session
# Adding cluster labels to the dataset
expedia_data['cluster'] = expedia_data['user_id'].map(partition)

# Community Profiling: Average hotel star rating per cluster
community_profile = expedia_data.groupby('cluster')['cnt'].mean().reset_index()
community_profile.columns = ['Cluster', 'Average Hotel Star Rating']

print("Community Profiling: Average Hotel Star Rating per Cluster")
(community_profile)


Community Profiling: Average Hotel Star Rating per Cluster


Unnamed: 0,Cluster,Average Hotel Star Rating
0,0,1.301887
1,1,1.000000
2,2,1.583333
3,3,1.344828
4,4,1.870370
...,...,...
61,61,2.000000
62,62,1.166667
63,63,1.235294
64,64,2.000000


In [20]:
# Optional: Analyzing booking behavior (e.g., total bookings per cluster)
booking_behavior = expedia_data.groupby('cluster')['is_booking'].sum().reset_index()
booking_behavior.columns = ['Cluster', 'Total Bookings']

print("\nBooking Behavior per Cluster")
(booking_behavior)


Booking Behavior per Cluster


Unnamed: 0,Cluster,Total Bookings
0,0,9
1,1,2
2,2,0
3,3,2
4,4,4
...,...,...
61,61,0
62,62,1
63,63,4
64,64,0


In [23]:
# User Segmentation
# Objective: Segment users based on their community membership 
# to target marketing efforts or customize user experiences.

# User Segmentation: Count of users per cluster
user_segmentation = expedia_data.groupby('cluster')['user_id'].nunique().reset_index()
user_segmentation.columns = ['Cluster', 'Number of Users']

print("User Segmentation: Number of Users per Cluster")
(user_segmentation)

# Optional: Analyze demographic information if available (e.g., age, gender)
# Assuming 'age' and 'gender' are available in the dataset


User Segmentation: Number of Users per Cluster


Unnamed: 0,Cluster,Number of Users
0,0,52
1,1,10
2,2,12
3,3,27
4,4,49
...,...,...
61,61,4
62,62,6
63,63,34
64,64,1


In [16]:
import plotly.graph_objects as go
import pandas as pd

# Add cluster labels to the dataset
expedia_data['cluster'] = expedia_data['user_id'].map(partition)

# Convert 'date_time' to datetime if it's not already
expedia_data['date'] = pd.to_datetime(expedia_data['date_time'])

# Perform trend analysis on 'is_booking' over time for each cluster
trend_analysis = expedia_data.groupby(['cluster', pd.Grouper(key='date', freq='M')])['is_booking'].sum().reset_index()

# Plotting the trend for each cluster using Plotly
clusters = trend_analysis['cluster'].unique()
fig = go.Figure()

for cluster in clusters:
    cluster_data = trend_analysis[trend_analysis['cluster'] == cluster]
    fig.add_trace(go.Scatter(
        x=cluster_data['date'],
        y=cluster_data['is_booking'],
        mode='lines',
        name=f'Cluster {cluster}'
    ))

# Updating the layout of the plot
fig.update_layout(
    title='Trend Analysis of Bookings by Cluster',
    xaxis_title='Date',
    yaxis_title='Total Bookings',
    hovermode='x unified',
    legend_title='Clusters',
    template='plotly'
)

# Show the plot
fig.show()



'M' is deprecated and will be removed in a future version, please use 'ME' instead.



**5. Practical Considerations**

**Scalability**

The Louvain algorithm scales well with large datasets, but it’s essential to ensure your graph structure is optimized for performance.

**Modularity Limitation**

While modularity is a popular measure, it has limitations, such as resolution limits where small communities might be merged into larger ones.

**Interpretability**

Understanding the meaning of detected communities is crucial. Supplement the algorithm with domain knowledge to interpret results effectively.

**Conclusion**

The Louvain community detection algorithm provides a powerful tool for uncovering hidden structures within networks. 
By applying it to the Expedia dataset, you can gain valuable insights into user behavior, hotel preferences, 
and potential market strategies. This notebook has provided a solid foundation to start exploring community detection in your own projects.

In [26]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# Load the dataset
expedia_data = pd.read_csv('data.csv')  # Uncomment and specify the dataset path

# Create a graph
G = nx.Graph()

# Add nodes for users, hotels, and destinations
G.add_nodes_from(expedia_data['user_id'], label='user')
G.add_nodes_from(expedia_data['hotel_cluster'], label='hotel_cluster')
G.add_nodes_from(expedia_data['srch_destination_id'], label='destination')

# Add edges based on interactions
for _, row in expedia_data.iterrows():
    G.add_edge(row['user_id'], row['hotel_cluster'], weight=row['is_booking'])
    G.add_edge(row['user_id'], row['srch_destination_id'], weight=row['is_booking'])

# Choose a specific node to visualize (e.g., user_id = 1)
node_to_visualize = 1

# Get the ego graph (neighbors) for the selected node
ego_graph = nx.ego_graph(G, node_to_visualize)

# Visualize the node using Plotly for an interactive graph
pos = nx.spring_layout(ego_graph)
edge_x = []
edge_y = []
for edge in ego_graph.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.append(x0)
    edge_x.append(x1)
    edge_x.append(None)
    edge_y.append(y0)
    edge_y.append(y1)
    edge_y.append(None)

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=1, color='#888'),
    hoverinfo='none',
    mode='lines')

node_x = []
node_y = []
node_text = []
for node in ego_graph.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_text.append(f"Node ID: {node}, Type: {G.nodes[node]['label']}")

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    text=node_text,
    hoverinfo='text',
    marker=dict(
        size=10,
        color='blue',
        line_width=2))

fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title=f'Ego Network for Node {node_to_visualize}',
                    titlefont_size=16,
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=0, l=0, r=0, t=40),
                    xaxis=dict(showgrid=False, zeroline=False),
                    yaxis=dict(showgrid=False, zeroline=False))
               )

fig.show()
