# Food Delivery Data Integration and Behavioral Analysis
**Objective:** Integrate multiple data formats (CSV, JSON, SQL) into a single analytical dataset and uncover key business insights regarding revenue, user behavior, and restaurant performance.

## 1. Environment Setup and Data Acquisition

In [None]:
import pandas as pd
import sqlite3
import json

# Load the order and user data directly from their flat files.
orders = pd.read_csv('orders.csv')
users = pd.read_json('users.json')

# The restaurant data is read from a SQL script: the script is replayed into
# a throwaway in-memory SQLite database and the `restaurants` table is then
# pulled back out as a dataframe.
with open('restaurants.sql', 'r') as f:
    sql_query = f.read()

conn = sqlite3.connect(':memory:')
try:
    conn.executescript(sql_query)
    restaurants = pd.read_sql_query('SELECT * FROM restaurants', conn)
finally:
    # Close the connection even if the script or the query raises, so a
    # failed run of this cell does not leave an open connection behind.
    conn.close()

print(f"Data Loaded Successfully:\n - Orders: {orders.shape[0]} rows\n - Users: {users.shape[0]} rows\n - Restaurants: {restaurants.shape[0]} rows")

## 2. Data Merging and Consolidation
Orders are enriched with user and restaurant profiles via left joins, so every order record is retained even when it has no matching user or restaurant.

In [None]:
# Consolidate everything into one master dataframe. `orders` is always the
# left-hand table of a left join, so an order is kept even when no matching
# user or restaurant row exists (the joined columns are then NaN).

# Step 1: attach user attributes to each order via `user_id`.
master_df = orders.merge(users, on='user_id', how='left')

# Step 2: attach restaurant details (cuisine, rating, etc.) via
# `restaurant_id`. When a column name exists on both sides, the left-hand
# column keeps its name and the restaurant-side one gets the '_master' suffix.
master_df = master_df.merge(restaurants, on='restaurant_id', how='left', suffixes=('', '_master'))

# Quick validation of the merged dataset.
print("Master Dataset Summary:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line.
master_df.info()
master_df.to_csv('final_food_delivery_dataset.csv', index=False)
master_df.head()

## 3. Targeted Business Insights

### Revenue Leadership (Gold Members)
Which city contributes the most to revenue from our Gold membership base?

In [None]:
# Keep only the orders placed by Gold members, then total the order value
# per city.
gold_mask = master_df['membership'].eq('Gold')
gold_orders = master_df.loc[gold_mask]
gold_revenue_by_city = gold_orders.groupby('city')['total_amount'].sum()

# Report the city with the largest Gold total and that total.
top_gold_city = gold_revenue_by_city.idxmax()
top_gold_total = gold_revenue_by_city.max()
print(f"City with highest Gold revenue: {top_gold_city} (INR {top_gold_total:,.2f})")

### Cuisine Performance (AOV)
Identifying which cuisine type yields the highest Average Order Value (AOV).

In [None]:
# Average Order Value per cuisine: the mean of `total_amount` over all
# orders that share a cuisine.
order_values = master_df['total_amount']
cuisine_aov = order_values.groupby(master_df['cuisine']).mean()

# Report the cuisine with the highest mean order value.
best_cuisine = cuisine_aov.idxmax()
best_aov = cuisine_aov.max()
print(f"Cuisine with highest AOV: {best_cuisine} (INR {best_aov:.2f})")

### User Retention and High-Value Customers
How many distinct users have spent more than INR 1,000 in total across all their orders?

In [None]:
# Total spend per user across all of their orders. Despite its name, this
# series holds every user's total; the 1000 INR filter is applied below.
high_value_users = master_df['total_amount'].groupby(master_df['user_id']).sum()

# Count the users whose total is strictly greater than 1000.
exceeds_threshold = high_value_users.gt(1000)
user_count = exceeds_threshold.sum()
print(f"Number of high-value users (>1000 INR): {user_count}")

### Impact of Restaurant Ratings
Which restaurant rating range accounts for the highest total revenue?

In [None]:
# Bucket edges sit halfway between adjacent one-decimal ratings, so with
# pd.cut's default right-closed intervals 3.5 lands in the first bucket and
# 3.6 in the second (assumes ratings carry one decimal place; confirm
# against the data). include_lowest=True makes a rating of exactly 3.0 fall
# inside the first bucket. Ratings outside 3.0-5.05 become NaN and are
# therefore left out of the grouped totals below.
bins = [3.0, 3.55, 4.05, 4.55, 5.05]
labels = ['3.0-3.5', '3.6-4.0', '4.1-4.5', '4.6-5.0']
master_df['rating_range'] = pd.cut(
    master_df['rating'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# Total revenue per rating bucket; observed=True skips buckets with no rows.
revenue_by_rating = master_df['total_amount'].groupby(
    master_df['rating_range'], observed=True
).sum()

top_rating_range = revenue_by_rating.idxmax()
top_rating_revenue = revenue_by_rating.max()
print(f"Top revenue generating rating range: {top_rating_range} (INR {top_rating_revenue:,.2f})")

### Membership vs Cuisine Trends
Analyzing revenue for specific membership and cuisine combinations.

In [None]:
# Total revenue for every (membership, cuisine) combination; the result is a
# Series indexed by a two-level (membership, cuisine) MultiIndex.
membership_cuisine_revenue = master_df.groupby(['membership', 'cuisine'])['total_amount'].sum()

# Print the totals for a hand-picked set of combinations. A combination
# with no orders is absent from the index and raises KeyError.
pairs = [
    ('Gold', 'Indian'),
    ('Gold', 'Italian'),
    ('Regular', 'Indian'),
    ('Regular', 'Chinese'),
]
for membership, cuisine in pairs:
    pair_revenue = membership_cuisine_revenue.loc[(membership, cuisine)]
    print(f"{membership} + {cuisine}: INR {pair_revenue:,.2f}")

### Order and Revenue Metrics

In [None]:
# Share of orders placed by Gold members: the mean of a boolean mask is the
# fraction of True rows. Rows with a missing membership compare as False
# and so count as non-Gold.
gold_order_mask = master_df['membership'].eq('Gold')
gold_order_pct = gold_order_mask.mean() * 100
print(f"Percentage of Gold member orders: {gold_order_pct:.0f}%")

# Total revenue from orders whose city is Hyderabad, rounded to a whole
# number for display.
hyderabad_mask = master_df['city'].eq('Hyderabad')
hyd_revenue = master_df.loc[hyderabad_mask, 'total_amount'].sum()
print(f"Total Revenue in Hyderabad: INR {round(hyd_revenue):,d}")

### Temporal Trends (Seasonality)
Identifying which quarter of the year saw the peak in total revenue.

In [None]:
# Parse the order dates in place, reading ambiguous dates as day-first
# (e.g. 03/04 is 3 April).
master_df['order_date'] = pd.to_datetime(master_df['order_date'], dayfirst=True)

# Map each order to its calendar quarter and total the revenue per quarter.
order_quarter = master_df['order_date'].dt.to_period('Q')
quarterly_revenue = master_df['total_amount'].groupby(order_quarter).sum()

peak_quarter = quarterly_revenue.idxmax()
peak_quarter_revenue = quarterly_revenue.max()
print(f"Peak Quarter: {peak_quarter} (INR {peak_quarter_revenue:,.2f})")