In [1]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os


In [2]:
# Define the folder path containing the CSV files
folder_path = 'datasets'

# List all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Load all CSV files into a list of DataFrames
df_list = []
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    df_list.append(df)

# Merge all DataFrames into one
merged_df = pd.concat(df_list, ignore_index=True)

# Save the merged DataFrame to a new CSV file
merged_df.to_csv('merged_output.csv', index=False)


In [3]:
# Calculate total tourist arrivals by Year and Month
monthly_trend = merged_df.groupby(['Year', 'Month'])['Value'].sum().reset_index()

# Display the monthly trend
print("Monthly Trend:\n", monthly_trend)


Monthly Trend:
      Year      Month     Value
0    1995        Apr     33994
1    1995        Aug     27686
2    1995        Dec     26380
3    1995        Feb     28240
4    1995        Jan     22207
..    ...        ...       ...
530  2024        Oct  230149.0
531  2024    October     34872
532  2024        Sep  163338.0
533  2024  September     20540
534  2024      Total    829767

[535 rows x 3 columns]


In [4]:
# Calculate monthly average arrivals
monthly_avg = merged_df.groupby('Month')['Value'].mean().reset_index()

# Merge monthly averages back into the main DataFrame
merged_df = pd.merge(merged_df, monthly_avg, on='Month', suffixes=('', '_avg'))

# Calculate the Seasonality Index
merged_df['Seasonality_Index'] = merged_df['Value'] / merged_df['Value_avg']

# Display the Seasonality Index
print("Seasonality Index:\n", merged_df[['Year', 'Month', 'Seasonality_Index']])


Seasonality Index:
       Year  Month Seasonality_Index
0      NaN    Jan               0.0
1      NaN    Jan          0.003368
2      NaN    Jan          0.004715
3      NaN    Jan          0.025258
4      NaN    Jan          0.112146
...    ...    ...               ...
2965  2020  Total          0.415325
2966  2021  Total          0.188873
2967  2022  Total           0.88764
2968  2023  Total          1.521106
2969  2024  Total          1.816204

[2970 rows x 3 columns]


In [5]:
# Calculate arrivals by top nationalities
top_nationalities_except_india = merged_df.groupby('Nationality')['Value'].sum().reset_index()

# Sort by total arrivals
top_nationalities_sorted = top_nationalities_except_india.sort_values(by='Value', ascending=False)

# Display the top nationalities
print("Top Nationalities:\n", top_nationalities_sorted)


Top Nationalities:
     Nationality     Value
11       Others  364429.0
17          USA  329276.0
3         China  197806.0
16           UK  173166.0
1    Bangladesh  120676.0
0     Australia  118941.0
15     Thailand   87713.0
5       Germany   83346.0
4        France   72588.0
14    Sri Lanka   70475.0
12  South Korea   67258.0
8      Malaysia   53120.0
7         Japan   51414.0
9       Myanmar   49521.0
2        Canada   46058.0
13        Spain   37486.0
6         Italy   36020.0
10  Netherlands   33490.0
