In [2]:
import pandas as pd
import numpy as np
import json

In [9]:
# Load the data
file_path = "tourism_dataset.csv"
df = pd.read_csv(file_path)
df

Unnamed: 0,Location,Country,Category,Visitors,Rating,Revenue,Accommodation_Available
0,kuBZRkVsAR,India,Nature,948853,1.32,84388.38,Yes
1,aHKUXhjzTo,USA,Historical,813627,2.01,802625.60,No
2,dlrdYtJFTA,Brazil,Nature,508673,1.42,338777.11,Yes
3,DxmlzdGkHK,Brazil,Historical,623329,1.09,295183.60,Yes
4,WJCCQlepnz,France,Cultural,124867,1.43,547893.24,No
...,...,...,...,...,...,...,...
5984,xAzwnVKAqz,USA,Urban,828137,1.97,132848.78,No
5985,IfKotyaJFC,France,Nature,276317,3.53,325183.96,Yes
5986,bPyubCWGgA,Egypt,Beach,809198,3.37,927336.50,No
5987,kkWIucpBnu,Egypt,Cultural,808303,2.52,115791.43,Yes


#### Even though the countries are same, the location ID shows that they are different locations in the country

In [10]:
# Basic summary statistics
summary_stats = {
    "total_locations": len(df),
    "total_visitors": int(df["Visitors"].sum()),
    "total_revenue": float(df["Revenue"].sum()),
    "average_rating": float(df["Rating"].mean())
}

print("Summary Statistics:")
print(f"Total Locations: {summary_stats['total_locations']}")
print(f"Total Visitors: {summary_stats['total_visitors']:,}")
print(f"Total Revenue: ${summary_stats['total_revenue']:,.2f}")
print(f"Average Rating: {summary_stats['average_rating']:.2f}")


Summary Statistics:
Total Locations: 5989
Total Visitors: 3,000,585,360
Total Revenue: $2,991,381,930.48
Average Rating: 3.01


In [11]:
# Analysis by Country
country_data = df.groupby("Country").agg({
    "Visitors": "sum",
    "Revenue": "sum",
    "Rating": "mean"
}).reset_index()

print("\nCountry Analysis:")
print(country_data)


Country Analysis:
     Country   Visitors       Revenue    Rating
0  Australia  416038005  4.126633e+08  3.019602
1     Brazil  414293518  4.267832e+08  3.074167
2      China  404448372  3.984324e+08  2.958648
3      Egypt  458573652  4.605948e+08  3.024298
4     France  424944621  4.105266e+08  3.030268
5      India  451083005  4.546763e+08  2.973158
6        USA  431204187  4.277053e+08  2.984304


In [12]:
# Analysis by Category
category_data = df.groupby("Category").agg({
    "Visitors": "sum",
    "Revenue": "sum",
    "Rating": "mean"
}).reset_index()

print("\nCategory Analysis:")
print(category_data)


Category Analysis:
     Category   Visitors       Revenue    Rating
0   Adventure  528962493  5.021662e+08  3.008804
1       Beach  495111800  4.972478e+08  3.072285
2    Cultural  495834336  5.181320e+08  2.997074
3  Historical  495958186  4.846126e+08  3.003139
4      Nature  469346177  4.772601e+08  2.978117
5       Urban  515372368  5.119633e+08  2.995457


In [13]:
# Accommodation Analysis
accommodation_data = df.groupby("Accommodation_Available").agg({
    "Visitors": "sum",
    "Revenue": "sum"
}).reset_index()

print("\nAccommodation Analysis:")
print(accommodation_data)


Accommodation Analysis:
  Accommodation_Available    Visitors       Revenue
0                      No  1514900791  1.490472e+09
1                     Yes  1485684569  1.500910e+09


In [14]:
# Revenue per Visitor by Country
country_data["Revenue_per_Visitor"] = country_data["Revenue"] / country_data["Visitors"]
print("\nRevenue per Visitor by Country:")
print(country_data[["Country", "Revenue_per_Visitor"]].sort_values("Revenue_per_Visitor", ascending=False))


Revenue per Visitor by Country:
     Country  Revenue_per_Visitor
1     Brazil             1.030147
5      India             1.007966
3      Egypt             1.004407
0  Australia             0.991889
6        USA             0.991886
2      China             0.985125
4     France             0.966071


In [15]:
# High rated vs Low rated locations comparison
df["Rating_Category"] = pd.cut(df["Rating"], bins=[0, 2, 3, 5], labels=["Low", "Medium", "High"])
rating_analysis = df.groupby("Rating_Category").agg({
    "Visitors": "sum",
    "Revenue": "sum",
    "Location": "count"
}).reset_index()

print("\nRating Category Analysis:")
print(rating_analysis)


Rating Category Analysis:
  Rating_Category    Visitors       Revenue  Location
0             Low   735251709  7.375906e+08      1485
1          Medium   781765746  7.599337e+08      1513
2            High  1483567905  1.493858e+09      2991


  rating_analysis = df.groupby("Rating_Category").agg({


In [18]:
# Country and Category combinations
country_category = df.groupby(["Country", "Category"]).agg({
    "Visitors": "sum",
    "Revenue": "sum"
}).reset_index()

print("\nTop 5 Country-Category Combinations by Visitors:")
print(country_category.sort_values("Visitors", ascending=False), country_category.sort_values("Visitors", ascending=False).shape)


Top 5 Country-Category Combinations by Visitors:
      Country    Category  Visitors      Revenue
6      Brazil   Adventure  83200861  77333822.40
18      Egypt   Adventure  82651445  82950318.40
30      India   Adventure  82298383  78370335.87
19      Egypt       Beach  81114198  77755196.13
21      Egypt  Historical  80783975  78510790.60
41        USA       Urban  80276628  81931474.27
28     France      Nature  79251754  75674952.86
23      Egypt       Urban  78968173  79616298.02
35      India       Urban  77068876  79879774.13
33      India  Historical  76491148  65788358.97
36        USA   Adventure  76417600  69406465.01
26     France    Cultural  75794317  74008400.22
0   Australia   Adventure  75244920  73143074.22
20      Egypt    Cultural  74325882  77438684.31
31      India       Beach  74275757  83256415.38
1   Australia       Beach  74188817  71734255.42
11     Brazil       Urban  72856618  74506386.34
29     France       Urban  72726465  69566833.74
40        USA      

In [19]:
# Correlation analysis
correlation = df[["Visitors", "Rating", "Revenue"]].corr()
print("\nCorrelation Analysis:")
print(correlation)


Correlation Analysis:
          Visitors    Rating   Revenue
Visitors  1.000000 -0.010337  0.008358
Rating   -0.010337  1.000000  0.000574
Revenue   0.008358  0.000574  1.000000


In [None]:
# Export data for visualization
export_data = {
    "summary": summary_stats,
    "country_data": country_data.to_dict(orient="records"),
    "category_data": category_data.to_dict(orient="records"),
    "accommodation_data": accommodation_data.to_dict(orient="records"),
    "rating_analysis": rating_analysis.to_dict(orient="records"),
    "top_country_category": country_category.sort_values("Visitors", ascending=False).head(5).to_dict(orient="records"),
    "correlation": correlation.to_dict()
}

# Save to JSON file
with open("tourism_analysis.json", "w") as f:
    json.dump(export_data, f, indent=2)

print("\nData analysis complete. Results saved to tourism_analysis.json")