## **TOURMAP: Population & Geography Analysis**


In [12]:
# This notebook runs in Google Colab, so the dataset is read from Google Drive.
# Mount the Drive at /content/drive so files under it can be opened by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
# Location of the world-cities CSV on the mounted Drive
DATA_PATH = "/content/drive/MyDrive/TOURMAP/worldcities.csv"

# Load the dataset once; every analysis below reads this notebook-level frame.
# The trailing bare expression renders a preview of the frame as the cell output.
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6870,139.7495,Japan,JP,JPN,Tōkyō,primary,37785000,1392685764
1,Jakarta,Jakarta,-6.1750,106.8275,Indonesia,ID,IDN,Jakarta,primary,33756000,1360771077
2,Delhi,Delhi,28.6100,77.2300,India,IN,IND,Delhi,admin,32226000,1356872604
3,Guangzhou,Guangzhou,23.1300,113.2600,China,CN,CHN,Guangdong,admin,26940000,1156237133
4,Mumbai,Mumbai,19.0761,72.8775,India,IN,IND,Mahārāshtra,admin,24973000,1356226629
...,...,...,...,...,...,...,...,...,...,...,...
47799,Bol’sheretsk,Bol'sheretsk,52.4390,156.3594,Russia,RU,RUS,Kamchatskiy Kray,,10,1643981807
47800,Utkholok,Utkholok,57.5504,157.2333,Russia,RU,RUS,Kamchatskiy Kray,,10,1643251905
47801,Yessey,Yessey,68.4652,102.1887,Russia,RU,RUS,Krasnoyarskiy Kray,,10,1643816547
47802,Karamken,Karamken,60.2004,151.1666,Russia,RU,RUS,Magadanskaya Oblast’,,10,1643511192


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np
from scipy.stats import linregress,ttest_ind
from scipy import stats

**1. Do capital cities have significantly higher populations than non-capital cities? (Hypothesis test: two-sample Welch's t-test)**

In [None]:
def capitals_vs_noncapitals():
  """Compare the populations of capital and non-capital cities.

  Reads the notebook-level ``df`` (columns ``capital`` and ``population``),
  prints the statistic and p-value of a two-sample t-test that does not
  assume equal variances (``equal_var=False``), then shows a log-scale
  boxplot of both groups and a bar chart of their mean populations.

  Grouping:
    * capitals     -- rows whose ``capital`` value is present and not "admin"
    * non-capitals -- rows whose ``capital`` value is missing

  NOTE(review): rows tagged "admin" end up in neither group. Confirm this
  exclusion is intended before interpreting the test result.
  """
  group_names = ['Capitals', 'Non-Capitals']
  capital_status = df['capital']
  population = df['population']

  # Split into capital vs non-capital, dropping rows with no population
  capitals = population[capital_status.notna() & (capital_status != "admin")].dropna()
  non_capitals = population[capital_status.isna()].dropna()

  # Two-sample t-test without the equal-variance assumption
  t_stat, p_val = ttest_ind(capitals, non_capitals, equal_var=False)
  print(f"t-statistic: {t_stat:.2f}, p-value: {p_val:.2e}")

  # Boxplot. The tick labels are set separately because the ``labels``
  # keyword of boxplot is deprecated in recent Matplotlib releases; boxes are
  # drawn at positions 1..N by default.
  plt.boxplot([capitals, non_capitals])
  plt.xticks([1, 2], group_names)
  plt.ylabel("Population (log scale)")
  plt.yscale("log")
  plt.title("Capital vs Non-Capital Populations")
  plt.show()

  # Bar chart of means
  plt.bar(group_names, [capitals.mean(), non_capitals.mean()], color=['blue', 'orange'])
  plt.ylabel("Average Population")
  plt.title("Mean Population Comparison")
  plt.show()

**2. Can we statistically model the relationship between city rank and population size using regression and goodness-of-fit tests?**

In [None]:
def rank_vs_population():
    """Model city population as a function of population rank.

    Reads the notebook-level ``df`` and, in order:
      1. ranks cities by population (rank 1 = largest) unless a ``rank``
         column already exists;
      2. shows rank-vs-population scatter plots on linear and log10 axes;
      3. fits and prints OLS summaries for ``population ~ rank`` and
         ``log10(population) ~ log10(rank)``;
      4. plots the fitted log-log line plus a histogram and QQ plot of its
         residuals;
      5. runs a Shapiro-Wilk normality test on a sample of those residuals.

    Raises:
        ValueError: if ``df`` has no ``population`` column, or if no row has
            both a positive population and a positive rank (so nothing can be
            log-transformed).
    """
    if 'population' not in df.columns:
        raise ValueError("No 'population' column found.")

    df_local = df.dropna(subset=['population']).copy()

    # 1. Create rank variable if not present
    if 'rank' in df_local.columns:
        df_local['rank'] = df_local['rank'].astype(int)
    else:
        df_local = df_local.sort_values(by='population', ascending=False).reset_index(drop=True)
        df_local['rank'] = df_local.index + 1  # rank 1 is highest population

    # log10 is undefined for zero/negative values (it yields -inf/NaN, which
    # would poison the regression), so the log-log analysis only uses rows
    # where both quantities are strictly positive.
    df_log = df_local[(df_local['population'] > 0) & (df_local['rank'] > 0)].copy()
    if df_log.empty:
        raise ValueError("No rows with positive population and rank for the log-log model.")
    df_log['log_rank'] = np.log10(df_log['rank'])
    df_log['log_pop'] = np.log10(df_log['population'])

    # 2. Explore with scatter plots
    plt.figure(figsize=(10, 6))
    plt.scatter(df_local['rank'], df_local['population'], alpha=0.5)
    plt.xlabel("City Rank")
    plt.ylabel("Population")
    plt.title("Population vs. City Rank (Linear scale)")
    plt.show()

    plt.figure(figsize=(10, 6))
    plt.scatter(df_log['log_rank'], df_log['log_pop'], alpha=0.5, color='orange')
    plt.xlabel("log10(City Rank)")
    plt.ylabel("log10(Population)")
    plt.title("Population vs. City Rank (Log-Log)")
    plt.show()

    # 3a. Linear regression (population ~ rank)
    X = sm.add_constant(df_local['rank'])
    model_lin = sm.OLS(df_local['population'], X).fit()
    print("Linear Regression Results:")
    print(model_lin.summary())

    # 3b. Log-log regression (log-population ~ log-rank)
    Xlog = sm.add_constant(df_log['log_rank'])
    model_log = sm.OLS(df_log['log_pop'], Xlog).fit()
    print("\nLog‑Log Regression Results:")
    print(model_log.summary())

    # 4. Visualize fitted log-log model
    plt.figure(figsize=(10, 6))
    plt.scatter(df_log['log_rank'], df_log['log_pop'], alpha=0.5, label="Data Points")
    plt.plot(df_log['log_rank'], model_log.predict(Xlog), color='red', label="Fitted Line")
    plt.xlabel("log10(City Rank)")
    plt.ylabel("log10(Population)")
    plt.legend()
    plt.title("Log‑Log Regression Fit")
    plt.show()

    # 5. Residual diagnostics for the log-log model
    resid = model_log.resid

    plt.figure(figsize=(10, 4))
    plt.subplot(1, 2, 1)
    plt.hist(resid, bins=30, edgecolor='k')
    plt.title("Histogram of Residuals")
    plt.xlabel("Residual")
    plt.ylabel("Frequency")

    plt.subplot(1, 2, 2)
    sm.qqplot(resid, line='s', ax=plt.gca())
    plt.title("QQ Plot of Residuals")
    plt.tight_layout()
    plt.show()

    # Shapiro-Wilk test for normality of residuals. At most 5000 residuals are
    # tested, drawn with a fixed seed so the reported result is reproducible.
    resid_sample = resid.sample(n=min(5000, len(resid)), random_state=1)
    shapiro_stat, shapiro_p = stats.shapiro(resid_sample)
    print(f"\nShapiro-Wilk Test: W = {shapiro_stat:.4f}, p-value = {shapiro_p:.4f}")
    if shapiro_p > 0.05:
        print("Residuals appear normally distributed (fail to reject H0).")
    else:
        print("Residuals are likely not normally distributed (reject H0).")

**3. Does distance from the equator influence city size? (Simple regression).**

In [None]:
def distance_from_equator(min_population=1000):
  """Regress city population on distance from the equator.

  Reads ``lat`` and ``population`` from the notebook-level ``df``, keeps rows
  whose population exceeds ``min_population``, uses absolute latitude as the
  distance from the equator, prints the OLS summary of
  ``population ~ distance_equator`` and shows a scatter plot with the fitted
  regression line.

  Args:
    min_population: cities with a population at or below this value are
      excluded before fitting. Defaults to 1000.
  """
  # Build the modelling frame in one chain so no column is ever assigned onto
  # a filtered slice of another frame.
  df_clean = (
      df[['lat', 'population']]
      .dropna()
      .loc[lambda frame: frame['population'] > min_population]
      .assign(distance_equator=lambda frame: frame['lat'].abs())
  )

  # Independent variable (with intercept term) and dependent variable
  X = sm.add_constant(df_clean['distance_equator'])
  y = df_clean['population']

  # Run regression and show results
  model = sm.OLS(y, X).fit()
  print(model.summary())

  # Scatter plot
  plt.figure(figsize=(10, 6))
  plt.scatter(df_clean['distance_equator'], y,
              alpha=0.3, s=10, label="Cities")

  # Regression line: reuse the fitted OLS values rather than fitting the same
  # straight line a second time with a different routine.
  plt.plot(df_clean['distance_equator'], model.fittedvalues,
           color='red', label="Regression Line")

  plt.xlabel("Distance from Equator")
  plt.ylabel("Population")
  plt.title("Relationship between Distance from Equator and City Population")
  plt.legend()
  plt.show()

**4. Is there any correlation between latitude (as a rough proxy for climate zone) and population?**

In [None]:
def latitude_vs_population():
  """Show how city population relates to signed latitude.

  Reads ``lat`` and ``population`` from the notebook-level ``df``, drops rows
  missing either value, prints their correlation coefficient (pandas'
  default method), then shows a plain scatter plot followed by a seaborn
  regression plot with a red trend line.
  """
  lat_pop = df[["lat", "population"]].dropna()

  correlation = lat_pop["lat"].corr(lat_pop["population"])
  print("Correlation between Latitude and Population:",correlation)

  # Raw scatter
  plt.figure(figsize=(8, 6))
  plt.scatter(lat_pop["lat"], lat_pop["population"], alpha=0.5)
  plt.xlabel("Latitude")
  plt.ylabel("Population")
  plt.title("Latitude vs City Population")
  plt.show()

  # Scatter with fitted trend line
  plt.figure(figsize=(8, 6))
  sns.regplot(x="lat", y="population", data=lat_pop,
              scatter_kws={"alpha": 0.4}, line_kws={"color": "red"})
  plt.xlabel("Latitude")
  plt.ylabel("Population")
  plt.title("Latitude vs Population with Trend Line")
  plt.show()

**5. Do cities in the Northern Hemisphere have larger populations than those in the Southern Hemisphere?**


In [None]:
def hemisphere_comparison():
  """Compare city populations between the Northern and Southern Hemispheres.

  Reads the notebook-level ``df``, drops rows with no population, labels each
  remaining city 'North' (``lat >= 0``) or 'South' (everything else, which
  includes rows with a missing latitude), prints per-hemisphere descriptive
  statistics, and shows a bar chart of mean population plus a log-scale
  histogram of the two population distributions.
  """
  # Build the labelled frame with ``assign`` so the new column is created on
  # a fresh frame rather than written onto the result of ``dropna`` (which
  # can trigger pandas' SettingWithCopyWarning).
  df_h = (
      df.dropna(subset=['population'])
        .assign(hemisphere=lambda frame: np.where(frame['lat'] >= 0, 'North', 'South'))
  )

  by_hemisphere = df_h.groupby('hemisphere')['population']

  # Mean population per hemisphere, as a two-column frame for plotting
  mean_group = by_hemisphere.mean().reset_index()

  # Detailed descriptive statistics (count, mean, std, min, quartiles, max)
  print(by_hemisphere.describe())

  # --- Visualization 1: bar plot of average population per hemisphere ---
  plt.figure(figsize=(6, 5))
  sns.barplot(x='hemisphere', y='population', hue='hemisphere',
              data=mean_group, palette='Set2', legend=False)
  plt.title("Average City Population by Hemisphere")
  plt.ylabel("Average Population")
  plt.show()

  # --- Visualization 2: population histogram on a log scale, North vs South ---
  plt.figure(figsize=(8, 6))
  sns.histplot(data=df_h, x='population', hue='hemisphere', bins=50, log_scale=True, element="step")
  plt.title("Population Distribution: North vs South Hemisphere")
  plt.xlabel("Population (log scale for clarity)")
  plt.show()

### **Dashboard**

In [None]:
from ipywidgets import Tab, Output, VBox, HTML
import matplotlib.pyplot as plt

# Registry of dashboard tabs: tab title -> analysis function to run.
# Insertion order of this dict is the left-to-right order of the tabs.
options = {
    "Capital vs Non-Capital Populations": capitals_vs_noncapitals,
    "City Rank vs Population": rank_vs_population,
    "Distance from Equator vs Population": distance_from_equator,
    "Latitude vs Population": latitude_vs_population,
    "North vs South Hemisphere": hemisphere_comparison
}

# One Output widget per analysis; each captures that analysis's prints and plots
tab_outputs = [Output() for _ in options]
tabs = Tab(children=tab_outputs)

# Label every tab with its analysis name
for position, label in enumerate(options):
    tabs.set_title(position, label)

# Styled title banner shown above the tabs
title = HTML(
    value=f"""
    <h1 style='text-align:center; color:white; background-color:#2E86C1; padding:12px; border-radius:8px;'>
        TOURMAP: Population & Geography Analysis Dashboard
    </h1>
    <h3 style='text-align:center; color:#2C3E50;'>
        Interactive Statistical Analysis & Visualizations
    </h3>
    """,
)

# Function to render analysis when a tab is selected
def on_tab_change(change):
    """Run the analysis that belongs to the newly selected tab.

    ``change`` is the change dictionary delivered to the observer; only
    changes whose ``"name"`` is ``"selected_index"`` are handled, and
    ``change["new"]`` is used as the index of the tab to render. The tab's
    Output widget is cleared, open Matplotlib figures are closed, a header
    line is printed and the matching function from ``options`` is executed
    inside that Output widget.
    """
    if change["name"] != "selected_index":
        return

    idx = change["new"]
    analysis_name, analysis_func = list(options.items())[idx]
    panel = tab_outputs[idx]

    with panel:
        panel.clear_output(wait=True)  # Drop whatever the tab showed before
        plt.close("all")  # Discard figures left over from earlier runs
        print(f"Running Analysis: {analysis_name}")
        analysis_func()

# Re-run a tab's analysis whenever the user switches to it
tabs.observe(on_tab_change, names="selected_index")

# Display dashboard with title banner above the tabs
dashboard = VBox([title, tabs])
display(dashboard)

# Render the first tab immediately. This goes through the same handler used
# for tab switches, so the initial view gets the same "Running Analysis"
# header and output clearing as every later render.
on_tab_change({"name": "selected_index", "new": 0})


VBox(children=(HTML(value="\n    <h1 style='text-align:center; color:white; background-color:#2E86C1; padding:…