# Scrape data

## 1. Happiness Score by State 2024

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
response = requests.get("https://worldpopulationreview.com/state-rankings/happiest-states")
soup = BeautifulSoup(response.text, "html.parser")
soup.title

<title>Happiest States 2025</title>

In [3]:
# find the table
table = soup.find("table")

# Fetch table head
headers = [th.text.strip() for th in table.find("thead").find_all("th") if th.text.strip()]

# Extract table data
data = []
rows = table.find("tbody").find_all("tr")
for row in rows:
    cols = row.find_all("td")
    if cols:
        values = [col.text.strip() for col in cols[1:]]
        data.append(values)

# Create DataFrame
happiness = pd.DataFrame(data, columns=headers)

In [4]:
happiness

Unnamed: 0,State,Total Happiness Scoreâ,Emotional & Physical Well-Being Rank,Community & Environment Rank,Work Environment Rank
0,Hawaii,66.31,1,7,17
1,Maryland,62.6,5,4,24
2,Minnesota,62.43,10,9,2
3,Utah,62.41,29,1,1
4,New Jersey,61.71,2,19,28
5,Idaho,61.6,25,2,3
6,California,59.97,12,5,14
7,Illinois,58.59,3,34,33
8,Nebraska,58.19,13,22,7
9,Connecticut,58.15,7,28,21


In [5]:
# Rename columns
happiness.columns = ["state", "total happiness score", "emotional & physical well-being rank",
                     "community & environment rank", "work environment rank"]
happiness

Unnamed: 0,state,total happiness score,emotional & physical well-being rank,community & environment rank,work environment rank
0,Hawaii,66.31,1,7,17
1,Maryland,62.6,5,4,24
2,Minnesota,62.43,10,9,2
3,Utah,62.41,29,1,1
4,New Jersey,61.71,2,19,28
5,Idaho,61.6,25,2,3
6,California,59.97,12,5,14
7,Illinois,58.59,3,34,33
8,Nebraska,58.19,13,22,7
9,Connecticut,58.15,7,28,21


Happiness scores were determined by evaluating 30 indicators across three main categories: 1) Emotional & Physical Well-Being, 2) Work Environment, and 3) Community & Environment.

## 2. Mental Health Statistics by State 2024

In [6]:
response2 = requests.get("https://worldpopulationreview.com/state-rankings/mental-health-statistics-by-state")
soup2 = BeautifulSoup(response2.text, "html.parser")
soup2.title

<title>Mental Health Statistics by State 2025</title>

In [7]:
# find the table
table2 = soup2.find("table")

# Fetch table head
headers2 = [th.text.strip() for th in table2.find("thead").find_all("th") if th.text.strip()]

# Extract table data
data2 = []
rows2 = table2.find("tbody").find_all("tr")
for row in rows2:
    cols = row.find_all("td")
    if cols:
        values = [col.text.strip() for col in cols[1:]]
        data2.append(values)

# Create DataFrame
mental_illness = pd.DataFrame(data2, columns=headers2)

In [8]:
mental_illness

Unnamed: 0,State,Rates Of Mental Illnessâ,Adults With Anxiety Or Depression,Adults With Severe Mental Illness,Overall Mental Health Standing (Youth & Adults)
0,Utah,29.68%,32.1%,6.3%,42.0
1,Oregon,27.33%,32.6%,5.7%,45.0
2,West Virginia,26.05%,37.9%,5.8%,37.0
3,Kansas,26.02%,32%,5.7%,40.0
4,Oklahoma,25.59%,33.9%,5.4%,32.0
5,Washington,25.51%,31.3%,5.5%,31.0
6,Idaho,24.92%,35.9%,5.3%,49.0
7,Ohio,24.32%,35.2%,6.3%,24.0
8,Rhode Island,24.12%,25.1%,5.1%,11.0
9,Arizona,23.89%,33.2%,5.6%,48.0


In [9]:
# Rename columns
mental_illness.columns = ["state", "rates of mental illness", "adults with anxiety or depression",
                          "adults with severe mental illness", "overll mental health standing (youth & adults)"]
mental_illness

Unnamed: 0,state,rates of mental illness,adults with anxiety or depression,adults with severe mental illness,overll mental health standing (youth & adults)
0,Utah,29.68%,32.1%,6.3%,42.0
1,Oregon,27.33%,32.6%,5.7%,45.0
2,West Virginia,26.05%,37.9%,5.8%,37.0
3,Kansas,26.02%,32%,5.7%,40.0
4,Oklahoma,25.59%,33.9%,5.4%,32.0
5,Washington,25.51%,31.3%,5.5%,31.0
6,Idaho,24.92%,35.9%,5.3%,49.0
7,Ohio,24.32%,35.2%,6.3%,24.0
8,Rhode Island,24.12%,25.1%,5.1%,11.0
9,Arizona,23.89%,33.2%,5.6%,48.0


Overall Mental Health Standing is each state's combined score in 14 categories (7 adult and 7 youth), such as "Adults with Any Mental Illness" and "Youth With Severe Major Depressive Episodes (MDE)".

Higher standings (01-10) are preferable, and indicate less mental illness and greater access to care. Lower standings (38-50) indicate higher prevalence of mental illness and decreased access to care.

## 3. Suicide Rates by State 2024

In [10]:
response3 = requests.get("https://worldpopulationreview.com/state-rankings/suicide-rates-by-state")
soup3 = BeautifulSoup(response3.text, "html.parser")
soup3.title

<title>Suicide Rates by State 2025</title>

In [11]:
# find the table
table3 = soup3.find("table")

# Fetch table head
headers3 = [th.text.strip() for th in table3.find("thead").find_all("th") if th.text.strip()]

# Extract table data
data3 = []
rows3 = table3.find("tbody").find_all("tr")
for row in rows3:
    cols = row.find_all("td")
    if cols:
        values = [col.text.strip() for col in cols[1:]]
        data3.append(values)

# Create DataFrame
suicide = pd.DataFrame(data3, columns=headers3)

In [12]:
suicide

Unnamed: 0,State,Suicide Rateâ,Suicides
0,Wyoming,32.3,190
1,Montana,32.0,350
2,Alaska,30.8,220
3,New Mexico,25.0,533
4,South Dakota,23.2,203
5,Colorado,22.8,1384
6,Oklahoma,22.1,877
7,Nevada,21.5,691
8,North Dakota,20.8,156
9,Arkansas,20.6,618


In [13]:
# Rename columns
suicide.columns = ["state", "suicide rate (per 100k)", "sucides"]
suicide

Unnamed: 0,state,suicide rate (per 100k),sucides
0,Wyoming,32.3,190
1,Montana,32.0,350
2,Alaska,30.8,220
3,New Mexico,25.0,533
4,South Dakota,23.2,203
5,Colorado,22.8,1384
6,Oklahoma,22.1,877
7,Nevada,21.5,691
8,North Dakota,20.8,156
9,Arkansas,20.6,618


Suicide Rate is the number of completed suicide attempts per 100,000 residents.

In [14]:
suicide["suicide rate (per 100k)"] = suicide["suicide rate (per 100k)"].astype(float)
suicide["suicide rate (%)"] = (suicide["suicide rate (per 100k)"] / 1000).round(4)
suicide

Unnamed: 0,state,suicide rate (per 100k),sucides,suicide rate (%)
0,Wyoming,32.3,190,0.0323
1,Montana,32.0,350,0.032
2,Alaska,30.8,220,0.0308
3,New Mexico,25.0,533,0.025
4,South Dakota,23.2,203,0.0232
5,Colorado,22.8,1384,0.0228
6,Oklahoma,22.1,877,0.0221
7,Nevada,21.5,691,0.0215
8,North Dakota,20.8,156,0.0208
9,Arkansas,20.6,618,0.0206


# Merge dataframes

In [15]:
df = happiness.merge(mental_illness, on="state")
df.head()

Unnamed: 0,state,total happiness score,emotional & physical well-being rank,community & environment rank,work environment rank,rates of mental illness,adults with anxiety or depression,adults with severe mental illness,overll mental health standing (youth & adults)
0,Hawaii,66.31,1,7,17,17.86%,23.9%,4.2%,14
1,Maryland,62.6,5,4,24,17.8%,29.1%,4.5%,9
2,Minnesota,62.43,10,9,2,23.23%,26.8%,4.9%,10
3,Utah,62.41,29,1,1,29.68%,32.1%,6.3%,42
4,New Jersey,61.71,2,19,28,18.27%,24.4%,4.1%,2


In [16]:
df2 = df.merge(suicide, on="state")
df2.head()

Unnamed: 0,state,total happiness score,emotional & physical well-being rank,community & environment rank,work environment rank,rates of mental illness,adults with anxiety or depression,adults with severe mental illness,overll mental health standing (youth & adults),suicide rate (per 100k),sucides,suicide rate (%)
0,Hawaii,66.31,1,7,17,17.86%,23.9%,4.2%,14,13.7,202,0.0137
1,Maryland,62.6,5,4,24,17.8%,29.1%,4.5%,9,9.7,620,0.0097
2,Minnesota,62.43,10,9,2,23.23%,26.8%,4.9%,10,13.9,808,0.0139
3,Utah,62.41,29,1,1,29.68%,32.1%,6.3%,42,20.1,643,0.0201
4,New Jersey,61.71,2,19,28,18.27%,24.4%,4.1%,2,7.1,688,0.0071


In [17]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 12 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   state                                           50 non-null     object 
 1   total happiness score                           50 non-null     object 
 2   emotional & physical well-being rank            50 non-null     object 
 3   community & environment rank                    50 non-null     object 
 4   work environment rank                           50 non-null     object 
 5   rates of mental illness                         50 non-null     object 
 6   adults with anxiety or depression               50 non-null     object 
 7   adults with severe mental illness               50 non-null     object 
 8   overll mental health standing (youth & adults)  50 non-null     object 
 9   suicide rate (per 100k)                      

In [18]:
psychological_condition = df2[["state", "total happiness score","rates of mental illness",
                              "suicide rate (%)"]]

psychological_condition = psychological_condition.rename(columns={
    "total happiness score":"happiness score",
    "rates of mental illness":"mental illness rate (%)"})

psychological_condition["happiness score"] = psychological_condition["happiness score"].astype(float)
psychological_condition["mental illness rate (%)"] = psychological_condition["mental illness rate (%)"].str.rstrip("%").astype(float)

psychological_condition

Unnamed: 0,state,happiness score,mental illness rate (%),suicide rate (%)
0,Hawaii,66.31,17.86,0.0137
1,Maryland,62.6,17.8,0.0097
2,Minnesota,62.43,23.23,0.0139
3,Utah,62.41,29.68,0.0201
4,New Jersey,61.71,18.27,0.0071
5,Idaho,61.6,24.92,0.0205
6,California,59.97,20.49,0.0101
7,Illinois,58.59,20.72,0.0111
8,Nebraska,58.19,23.41,0.015
9,Connecticut,58.15,18.77,0.01


Happiness scores were determined by evaluating 30 indicators across three main categories: 1) Emotional & Physical Well-Being, 2) Work Environment, and 3) Community & Environment.

In [19]:
psychological_condition.to_csv("../Data/psychological_condition.csv", index=False, encoding="utf-8-sig")

# Merge haunted_joined.csv

In [20]:
# Read haunted_joined.csv
haunted_joined = pd.read_csv("../Data/haunted_joined.csv")

In [21]:
# Check the number of unique state
haunted_joined["state"].nunique()

51

In [22]:
haunted_joined["state"].unique()

array(['Michigan', 'Pennsylvania', 'California', 'Massachusetts',
       'Arkansas', 'Oregon', 'Arizona', 'Maryland', 'Oklahoma', 'Maine',
       'Alaska', 'Louisiana', 'Alabama', 'Ohio', 'New Jersey',
       'Washington DC', 'North Dakota', 'Wyoming', 'North Carolina',
       'Kentucky', 'Kansas', 'New York', 'Wisconsin', 'Iowa',
       'West Virginia', 'New Mexico', 'Washington', 'Illinois', 'Indiana',
       'Virginia', 'Idaho', 'New Hampshire', 'Vermont', 'Nevada',
       'Hawaii', 'Utah', 'Nebraska', 'Georgia', 'Texas', 'Montana',
       'Tennessee', 'Florida', 'Missouri', 'Delaware', 'South Dakota',
       'Mississippi', 'Connecticut', 'Minnesota', 'South Carolina',
       'Rhode Island', 'Colorado'], dtype=object)

In [23]:
# Check the number of unique state
psychological_condition.nunique()

state                      50
happiness score            50
mental illness rate (%)    49
suicide rate (%)           43
dtype: int64

In [24]:
psychological_condition["state"].unique()

array(['Hawaii', 'Maryland', 'Minnesota', 'Utah', 'New Jersey', 'Idaho',
       'California', 'Illinois', 'Nebraska', 'Connecticut', 'Virginia',
       'South Dakota', 'North Dakota', 'Massachusetts', 'New Hampshire',
       'Iowa', 'Delaware', 'Florida', 'Georgia', 'North Carolina',
       'Wisconsin', 'Washington', 'New York', 'Maine', 'Wyoming',
       'Oregon', 'Pennsylvania', 'Rhode Island', 'Montana', 'Colorado',
       'Arizona', 'Kansas', 'South Carolina', 'Vermont', 'Nevada',
       'Texas', 'Indiana', 'Ohio', 'Michigan', 'Alaska', 'Missouri',
       'New Mexico', 'Tennessee', 'Oklahoma', 'Mississippi', 'Alabama',
       'Kentucky', 'Arkansas', 'Louisiana', 'West Virginia'], dtype=object)

In [25]:
# Add state "Washington DC"
washington_row = psychological_condition[psychological_condition["state"] == "Washington"].copy()

washington_row["state"] = "Washington DC"

psychological_condition = pd.concat([psychological_condition, washington_row], ignore_index=True)

psychological_condition["state"].nunique()

51

In [26]:
psychological_condition

Unnamed: 0,state,happiness score,mental illness rate (%),suicide rate (%)
0,Hawaii,66.31,17.86,0.0137
1,Maryland,62.6,17.8,0.0097
2,Minnesota,62.43,23.23,0.0139
3,Utah,62.41,29.68,0.0201
4,New Jersey,61.71,18.27,0.0071
5,Idaho,61.6,24.92,0.0205
6,California,59.97,20.49,0.0101
7,Illinois,58.59,20.72,0.0111
8,Nebraska,58.19,23.41,0.015
9,Connecticut,58.15,18.77,0.01


In [27]:
# merge dataframes
haunted_joined2 = haunted_joined.merge(psychological_condition, on="state")
haunted_joined2.head()

Unnamed: 0,city,country,description,location,state,state_abbrev,longitude,latitude,city_longitude,city_latitude,...,2025 Annual Average Daylight Duration (Capital hrs),MonthlyMeanTemperature,TemperatureLevel,MonthlyTotalLiquidPrecipitation,RainLevel,MonthlyAverageWindSpeed,WindLevel,happiness score,mental illness rate (%),suicide rate (%)
0,Ada,United States,Ada witch - Sometimes you can see a misty blue...,Ada Cemetery,Michigan,MI,-85.504893,42.962106,-85.49548,42.960727,...,12.2,26.2,Cold,1.8,Dry,9.4,Strong,46.51,22.33,0.0143
1,Addison,United States,A little girl was killed suddenly while waitin...,North Adams Rd.,Michigan,MI,-84.381843,41.971425,-84.347168,41.986434,...,12.2,26.2,Cold,1.8,Dry,9.4,Strong,46.51,22.33,0.0143
2,Adrian,United States,If you take Gorman Rd. west towards Sand Creek...,Ghost Trestle,Michigan,MI,-84.035656,41.904538,-84.037166,41.897547,...,12.2,26.2,Cold,1.8,Dry,9.4,Strong,46.51,22.33,0.0143
3,Adrian,United States,"In the 1970's, one room, room 211, in the old ...",Siena Heights University,Michigan,MI,-84.017565,41.905712,-84.037166,41.897547,...,12.2,26.2,Cold,1.8,Dry,9.4,Strong,46.51,22.33,0.0143
4,Albion,United States,Kappa Delta Sorority - The Kappa Delta Sororit...,Albion College,Michigan,MI,-84.745177,42.244006,-84.75303,42.243097,...,12.2,26.2,Cold,1.8,Dry,9.4,Strong,46.51,22.33,0.0143


In [28]:
haunted_joined2.to_csv("../Data/haunted_joined2.csv", index=False, encoding="utf-8-sig")