In [None]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import sys

# Location of the source CSV. A raw string keeps the Windows backslashes
# literal: "\D", "\e" and "\T" are not valid escape sequences in an ordinary
# string literal and make recent Python versions emit a warning.
csv_file = r"D:\Data_Analytics\Data_source\extracted_data\Travel\Top_Indian_Places_tovisit.csv"

# Load the CSV into `data`. When the file is missing, report it and re-raise
# so execution stops here with the real cause, instead of failing later with
# a NameError on `data` (or silently reusing a `data` left over in the kernel).
try:
    data = pd.read_csv(csv_file)
    print("CSV file loaded successfully!")
except FileNotFoundError:
    print(f"Error: File '{csv_file}' not found.")
    raise

In [None]:
# Preview the first five rows of the loaded dataframe
data.head(n=5)

In [None]:
# Drop the 'Unnamed: 0' column; the result is bound to a new frame, `data` is left as loaded
data_cleaned = data.drop('Unnamed: 0', axis=1)

# Show the dtype of each remaining column
data_cleaned.dtypes


In [None]:
# Count the missing entries in every column
data_cleaned.isna().sum()

In [None]:
# Coerce 'Establishment Year' to a numeric dtype; entries that cannot be parsed become NaN
data_cleaned['Establishment Year'] = pd.to_numeric(
    data_cleaned['Establishment Year'],
    errors='coerce',
)

# Gather the distinct values of each categorical column so inconsistent labels can be spotted
(
    unique_zones,
    unique_types,
    unique_weekly_off,
    unique_significance,
    unique_dslr_allowed,
    unique_best_time_to_visit,
) = (
    data_cleaned[column].unique()
    for column in (
        'Zone',
        'Type',
        'Weekly Off',
        'Significance',
        'DSLR Allowed',
        'Best Time to visit',
    )
)

(unique_zones, unique_types, unique_weekly_off, unique_significance, unique_dslr_allowed, unique_best_time_to_visit)

In [None]:
# Cleanup Steps
# 1. Type Consistency: fold the plural labels into their singular counterparts
data_cleaned['Type'] = data_cleaned['Type'].replace({'Temples': 'Temple', 'Tombs': 'Tomb'})

# 2. Weekly Off Correction: 'Yes' is assumed to be a data-entry error with no
#    recoverable meaning, so it is marked as missing (NaN).
#    The dict form with an explicit NaN is used instead of replace('Yes', None):
#    older pandas releases ignored an explicit None value and pad-filled the
#    match from the previous row, and newer ones store None rather than NaN.
data_cleaned['Weekly Off'] = data_cleaned['Weekly Off'].replace({'Yes': float('nan')})

# 3. Best Time to Visit Consistency: trim surrounding whitespace, then map 'Anytime' onto 'All'
data_cleaned['Best Time to visit'] = data_cleaned['Best Time to visit'].str.strip()
data_cleaned['Best Time to visit'] = data_cleaned['Best Time to visit'].replace({'Anytime': 'All'})

# Re-check the corrected columns
unique_types_corrected = data_cleaned['Type'].unique()
unique_weekly_off_corrected = data_cleaned['Weekly Off'].unique()
unique_best_time_to_visit_corrected = data_cleaned['Best Time to visit'].unique()
(unique_types_corrected, unique_weekly_off_corrected, unique_best_time_to_visit_corrected)

In [None]:
# Apply seaborn's "whitegrid" style to the figures drawn from here on
sns.set_style("whitegrid")

# Landmarks per zone, bars ordered by how often each zone occurs
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=data_cleaned, y='Zone', order=data_cleaned['Zone'].value_counts().index)
ax.set(title='Distribution of Landmarks by Zone', xlabel='Number of Landmarks', ylabel='Zone')
plt.show()

# The ten states with the most landmarks
state_counts = data_cleaned['State'].value_counts().head(10)
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=state_counts, y=state_counts.index)
ax.set(title='Top 10 States by Number of Landmarks', xlabel='Number of Landmarks', ylabel='State')
plt.show()

# The twenty most common landmark types
type_counts = data_cleaned['Type'].value_counts().head(20)
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=type_counts, y=type_counts.index)
ax.set(title='Top 20 Types of Landmarks', xlabel='Number of Landmarks', ylabel='Type')
plt.show()

In [None]:
# Establishment year analysis
# Keep only positive establishment years; NaN fails the comparison, so missing years drop out too
establishment_years_filtered = data_cleaned.loc[
    data_cleaned['Establishment Year'] > 0,
    'Establishment Year',
]

plt.figure(figsize=(12, 8))
ax = sns.histplot(establishment_years_filtered, bins=30, kde=False)
ax.set(
    title='Distribution of Establishment Years',
    xlabel='Establishment Year',
    ylabel='Number of Landmarks',
)
plt.show()

# Review rating plotted against entrance fee, one point per row
plt.figure(figsize=(12, 8))
ax = sns.scatterplot(x='Google review rating', y='Entrance Fee in INR', data=data_cleaned)
ax.set(
    title='Review Ratings vs. Entrance Fees',
    xlabel='Google Review Rating',
    ylabel='Entrance Fee in INR',
)
plt.show()



In [None]:
# Histogram (with KDE overlay) of the hours needed to visit a landmark
plt.figure(figsize=(10, 6))
ax = sns.histplot(data_cleaned['time needed to visit in hrs'], bins=20, kde=True)
ax.set(
    title='Distribution of Time Needed to Visit',
    xlabel='Time Needed to Visit (hrs)',
    ylabel='Number of Landmarks',
)
plt.show()

# How many landmarks fall under each 'Best Time to visit' value
best_time_counts = data_cleaned['Best Time to visit'].value_counts()
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=best_time_counts, y=best_time_counts.index)
ax.set(
    title='Best Time to Visit Landmarks',
    xlabel='Number of Landmarks',
    ylabel='Best Time to Visit',
)
plt.show()


In [None]:
# Categorize landmarks into historical periods based on their establishment year.
# Every era includes its upper boundary year:
#   Ancient India        year <= 500
#   Medieval India       500  < year <= 1500
#   Early Modern India   1500 < year <= 1857
#   Colonial India       1857 < year <= 1947
#   Post-Independence    year > 1947
era_bins = [-float("inf"), 500, 1500, 1857, 1947, float("inf")]
choices = ['Ancient India', 'Medieval India', 'Early Modern India', 'Colonial India', 'Post-Independence']

# right=True closes each interval on its upper edge, matching the boundaries
# listed above (with right=False the years 500, 1500, 1857 and 1947 would be
# pushed into the following era). Rows whose year is NaN get no era.
data_cleaned['Historical Era'] = pd.cut(
    data_cleaned['Establishment Year'],
    bins=era_bins,
    labels=choices,
    right=True,
)

# Overview of landmarks by historical era
era_distribution = data_cleaned['Historical Era'].value_counts()

# Visualize the distribution of landmarks by historical era
plt.figure(figsize=(10, 6))
sns.barplot(x=era_distribution, y=era_distribution.index, palette='viridis')
plt.title('Distribution of Landmarks by Historical Era')
plt.xlabel('Number of Landmarks')
plt.ylabel('Historical Era')
plt.show()

In [None]:
# Count of non-null 'Name' entries per state, largest first
landmarks_by_state = (
    data_cleaned
    .groupby('State')['Name']
    .count()
    .sort_values(ascending=False)
)

# One bar chart per leading state showing its most frequent landmark types
top_5_states = landmarks_by_state.index[:5]
for state in top_5_states:
    state_data = data_cleaned.loc[data_cleaned['State'] == state]
    state_types_count = state_data['Type'].value_counts().head(10)

    plt.figure(figsize=(10, 6))
    ax = sns.barplot(x=state_types_count, y=state_types_count.index)
    ax.set(
        title=f'Top 10 Landmark Types in {state}',
        xlabel='Number of Landmarks',
        ylabel='Type',
    )
    plt.show()

In [None]:
# Number of landmarks recorded under each 'Significance' value
significance_counts = data_cleaned['Significance'].value_counts()

plt.figure(figsize=(12, 8))
ax = sns.barplot(x=significance_counts, y=significance_counts.index)
ax.set(
    title='Distribution of Landmarks by Cultural Significance',
    xlabel='Number of Landmarks',
    ylabel='Cultural Significance',
)
plt.show()



In [None]:
# Restrict the data to rows whose 'Significance' is one of the five most frequent values
top_5_names = data_cleaned['Significance'].value_counts().index[:5]
top_5_rating_place = data_cleaned.loc[data_cleaned['Significance'].isin(top_5_names)]

# Number of rows at each review-rating value, split by significance
plt.figure(figsize=(12, 8))
ax = sns.countplot(data=top_5_rating_place, x='Google review rating', hue='Significance')
ax.set_title('Top 5 Rated Place Types')
plt.show()

In [None]:
# Google review rating per state; a wide figure leaves room for one bar per state
plt.figure(figsize=(25, 6))
ax = sns.barplot(data=data_cleaned, x='State', y='Google review rating', palette='viridis')
ax.set(
    title='Average Google Review Rating by State',
    xlabel='State',
    ylabel='Average Google Review Rating',
)
# Tilt the state names so neighbouring labels do not overlap
plt.xticks(rotation=45)
plt.show()


In [None]:
# Row counts per 'Significance' value, split by the 'DSLR Allowed' flag
plt.figure(figsize=(12, 8))
sns.countplot(data=data_cleaned, y='Significance', hue='DSLR Allowed');

In [None]:
# Columns taking part in the correlation analysis
numerical_data = data_cleaned.loc[:, [
    'Establishment Year',
    'time needed to visit in hrs',
    'Google review rating',
    'Entrance Fee in INR',
    'Number of google review in lakhs',
]]

# Pairwise correlation between the selected columns
correlation_matrix = numerical_data.corr()

# Annotated heatmap of the correlation matrix, coefficients shown to two decimals
plt.figure(figsize=(10, 8))
ax = sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, fmt=".2f")
ax.set_title('Correlation Matrix of Numerical Variables')
plt.show()