In [1]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
# Define the folder path containing the CSV files
folder_path = 'datasets'

# List all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Load all CSV files into a list of DataFrames
df_list = []
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    df_list.append(df)

# Merge all DataFrames into one
merged_df = pd.concat(df_list, ignore_index=True)

# Save the merged DataFrame to a new CSV file
merged_df.to_csv('merged_output.csv', index=False)

In [6]:
merged_df.head()

Unnamed: 0,Entry Point,Month,Value,Year,Category,Region,Conservation Area,Visitors,Metric,English_Year,...,Details,Third Country,Indian,% Change,Gender,Age_Group,Total,Nationality,Percent,Year;Month;Value
0,Mohana (Kailali),Jan,0,,,,,,,,...,,,,,,,,,,
1,Mohana (Kailali),Feb,0,,,,,,,,...,,,,,,,,,,
2,Mohana (Kailali),Mar,0,,,,,,,,...,,,,,,,,,,
3,Mohana (Kailali),Apr,0,,,,,,,,...,,,,,,,,,,
4,Mohana (Kailali),May,0,,,,,,,,...,,,,,,,,,,


In [8]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5404 entries, 0 to 5403
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Entry Point        96 non-null     object 
 1   Month              2970 non-null   object 
 2   Value              4047 non-null   object 
 3   Year               4917 non-null   object 
 4   Category           361 non-null    object 
 5   Region             216 non-null    object 
 6   Conservation Area  270 non-null    object 
 7   Visitors           269 non-null    object 
 8   Metric             405 non-null    object 
 9   English_Year       70 non-null     object 
 10  Group              624 non-null    object 
 11  Count              622 non-null    object 
 12  Type               236 non-null    object 
 13  Hotel_Type         42 non-null     object 
 14  Hotel_Count        42 non-null     float64
 15  Bed_Count          42 non-null     float64
 16  Types of Course    153 n

In [11]:
# Define month mapping
month_map = {
    'Jan': 'January', 'January': 'January',
    'Feb': 'February', 'February': 'February',
    'Mar': 'March', 'March': 'March',
    'Apr': 'April', 'April': 'April',
    'May': 'May',
    'Jun': 'June', 'June': 'June',
    'Jul': 'July', 'July': 'July',
    'Aug': 'August', 'August': 'August',
    'Sep': 'September', 'September': 'September',
    'Oct': 'October', 'October': 'October',
    'Nov': 'November', 'November': 'November',
    'Dec': 'December', 'December': 'December',
    'Total': 'Total'  # Keep Total if needed
}

# Apply mapping to your dataframe
merged_df['Month'] = merged_df['Month'].map(month_map)

# Now run your seasonality calculation
seasonality = (
    merged_df
    .groupby('Month')['Value']
    .mean()
    .reset_index()
    .sort_values('Month')
)

print("Seasonality (Avg Monthly Arrivals):\n", seasonality)

Seasonality (Avg Monthly Arrivals):
         Month         Value
0       April  11634.632124
1      August   9126.658031
2    December  11315.233161
3    February  11026.367876
4     January   8697.259067
5        July   6710.751295
6        June   7084.031088
7       March  13479.466321
8         May    8288.11399
9    November  14097.590674
10    October  15138.134715
11  September   9919.979275
12      Total      456868.9


In [12]:
entry_point_analysis = (
    merged_df
    .groupby('Entry Point')['Value']
    .sum()
    .reset_index()
    .sort_values('Value', ascending=False)
)

print("Tourist Arrivals by Entry Point:\n", entry_point_analysis)


Tourist Arrivals by Entry Point:
                 Entry Point  Value
0     Belahiya (Bhairahawa)  92365
7       Rasuwagadi (Rasuwa)  14632
5    Kodari (Sindupalchowk)   7184
4       Kakarbhitta (Mechi)   6101
2           Birgunj (Parsa)   1887
3  Gaddachauki (Kanchanpur)   1127
1       Biratnagar (Morang)    528
6          Mohana (Kailali)     29


In [14]:
gender_distribution = (
    merged_df
    .groupby('Gender')['Value']
    .sum()
    .reset_index()
)

print("Gender-wise Distribution:\n", gender_distribution)


Gender-wise Distribution:
    Gender       Value
0  Female  7592604.02
1    Male  9790683.94


In [20]:
indian_vs_others = (
    merged_df
    .groupby(['Indian', 'Third Country'])['Value']
    .sum()
    .reset_index()
)

print("Indian vs Third Country Tourists:\n", indian_vs_others)


Indian vs Third Country Tourists:
       Indian Third Country Value
0    40336.0        189749     0
1    64320.0        296917     0
2    64672.0         86290     0
3    66777.0        208691     0
4    74814.0        464156     0
5    86363.0        251769     0
6    90326.0        294971     0
7    91177.0        409100     0
8    93722.0        290204     0
9    93884.0        416072     0
10   95915.0        367731     0
11   96010.0        430695     0
12   96434.0        278964     0
13  118249.0       634,753     0
14  120898.0        481969     0
15  122512.0        271101     0
16  133438.0        288419     0
17  135343.0        654775     0
18  140661.0        350843     0
19  143229.0        320455     0
20  149504.0        586711     0
21  160832.0        779386     0
22  165815.0        637277     0
23  180974.0        616642     0
24  194323.0        978749     0
25  209334.0        405535     0
26  254150.0        943041     0
27  317781.0        829767     0
28  3199

In [21]:
nationality_share = (
    merged_df
    .groupby('Nationality')['Value']
    .sum()
    .reset_index()
)

nationality_share['Percent'] = (
    nationality_share['Value'] / nationality_share['Value'].sum() * 100
)

nationality_share = nationality_share.sort_values('Percent', ascending=False)

print("Nationality Share (%):\n", nationality_share)


Nationality Share (%):
     Nationality      Value    Percent
17        Total    8550917  44.434438
12       Others  2043702.0  10.620001
6         India    1890080   9.821712
3         China  1115937.0   5.798914
19       U.S.A.     750930   3.902172
18         U.K.     636357   3.306799
15    Sri Lanka   450451.0   2.340747
0     Australia   413492.0   2.148692
16     Thailand   401151.0   2.084562
1    Bangladesh   346056.0   1.798264
21          USA   329276.0   1.711067
5       Germany   324186.0   1.684617
4        France   289617.0   1.504981
13  South Korea   262314.0   1.363102
8         Japan   253778.0   1.318745
10      Myanmar   233366.0   1.212675
20           UK   173166.0   0.899849
2        Canada   172236.0   0.895016
9      Malaysia   168706.0   0.876673
14        Spain   160194.0   0.832441
7         Italy   139128.0   0.722972
11  Netherlands   138856.0   0.721559


In [22]:
conservation_area_visitors = (
    merged_df
    .groupby('Conservation Area')['Value']
    .sum()
    .reset_index()
    .sort_values('Value', ascending=False)
)

print("Visitors by Conservation Area:\n", conservation_area_visitors)


Visitors by Conservation Area:
                   Conservation Area Value
0       Annapurna Conservation Area     0
11           Langtang National Park     0
19    Shukla Phata Wildlife Reserve     0
18  ShivapuriNagarjun National Park     0
17      SheyPhoksundo National Park     0
16         Sagarmatha National Park     0
15               Rara National Park     0
14           Parsa Wildlife Reserve     0
13        Manaslu Conservation Area     0
12       Makalu Barun National Park     0
10     Krishnasar Conservation Area     0
1        Apinappa Conservation Area     0
9       KoshiTappu Wildlife Reserve     0
8             Khaptad National Park     0
7    Kanchanjunga Conservation Area     0
6    Gaurishankar Conservation Area     0
5         Dhorpatan Hunting Reserve     0
4             Chitwan National Park     0
3             Bardiya National Park     0
2               Banke National Park     0
20     ShuklaPhata Wildlife Reserve     0
