# Scrape data

## 1. Happiness Score by State 2024

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Download the "Happiest States" ranking page and parse its HTML.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() aborts on an HTTP error instead of parsing an error page.
response = requests.get(
    "https://worldpopulationreview.com/state-rankings/happiest-states",
    timeout=30,
)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
soup.title  # shown as a quick check that the expected page was fetched

<title>Happiest States 2024</title>

In [3]:
# Collect every <table> element; the last one on the page is the one used
# as the per-state table, and its <tr> elements become the rows to parse.
tables = soup.find_all(name="table")
state_table = tables[-1]
rows = state_table.find_all(name="tr")

In [4]:
# Build one record per table row: the <th> text (state name) followed by
# the stripped text of every <td> in that row.
data = []
for tr in rows[1:]:  # the first row is skipped as the header
    header_cell = tr.find("th")
    value_cells = tr.find_all("td")
    if not (header_cell and value_cells):
        # Rows lacking a <th> or having no <td> cells are ignored.
        continue
    data.append([header_cell.text.strip(), *(cell.text.strip() for cell in value_cells)])

In [5]:
# Column labels in the order each record was assembled: state first,
# then the remaining cells of the row.
happiness_columns = [
    "state",
    "total happiness score",
    "emotional & physical well-being rank",
    "community & environment rank",
    "work environment rank",
]
happiness = pd.DataFrame(data, columns=happiness_columns)
happiness

Unnamed: 0,state,total happiness score,emotional & physical well-being rank,community & environment rank,work environment rank
0,Hawaii,66.31,1,7,17
1,Maryland,62.6,5,4,24
2,Minnesota,62.43,10,9,2
3,Utah,62.41,29,1,1
4,New Jersey,61.71,2,19,28
5,Idaho,61.6,25,2,3
6,California,59.97,12,5,14
7,Illinois,58.59,3,34,33
8,Nebraska,58.19,13,22,7
9,Connecticut,58.15,7,28,21


Happiness scores were determined by evaluating 30 indicators across three main categories: 1) Emotional & Physical Well-Being, 2) Work Environment, and 3) Community & Environment.

## 2. Mental Health Statistics by State 2024

In [6]:
# Download the "Mental Health Statistics by State" page and parse its HTML.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() aborts on an HTTP error instead of parsing an error page.
response2 = requests.get(
    "https://worldpopulationreview.com/state-rankings/mental-health-statistics-by-state",
    timeout=30,
)
response2.raise_for_status()
soup2 = BeautifulSoup(response2.text, "html.parser")
soup2.title  # shown as a quick check that the expected page was fetched

<title>Mental Health Statistics by State 2024</title>

In [7]:
# Collect every <table> element; the last one on the page is the one used
# as the per-state table, and its <tr> elements become the rows to parse.
tables2 = soup2.find_all(name="table")
state_table2 = tables2[-1]
rows2 = state_table2.find_all(name="tr")

In [8]:
# Build one record per table row: the <th> text (state name) followed by
# the stripped text of every <td> in that row.
data2 = []
for tr in rows2[1:]:  # the first row is skipped as the header
    header_cell = tr.find("th")
    value_cells = tr.find_all("td")
    if not (header_cell and value_cells):
        # Rows lacking a <th> or having no <td> cells are ignored.
        continue
    data2.append([header_cell.text.strip(), *(cell.text.strip() for cell in value_cells)])

In [9]:
# Column labels in the order each record was assembled: state first,
# then the remaining cells of the row.
# The last label was previously misspelled ("overll") and missing its closing
# parenthesis; no later cell selects this column by name.
mental_illness_columns = [
    "state",
    "rates of mental illness",
    "adults with anxiety or depression",
    "adults with severe mental illness",
    "overall mental health standing (youth & adults)",
]
mental_illness = pd.DataFrame(data2, columns=mental_illness_columns)
mental_illness

Unnamed: 0,state,rates of mental illness,adults with anxiety or depression,adults with severe mental illness,overll mental health standing (youth & adults
0,Utah,29.68%,32.1%,6.3%,42.0
1,Oregon,27.33%,32.6%,5.7%,45.0
2,West Virginia,26.05%,37.9%,5.8%,37.0
3,Kansas,26.02%,32%,5.7%,40.0
4,Oklahoma,25.59%,33.9%,5.4%,32.0
5,Washington,25.51%,31.3%,5.5%,31.0
6,Idaho,24.92%,35.9%,5.3%,49.0
7,Ohio,24.32%,35.2%,6.3%,24.0
8,Rhode Island,24.12%,25.1%,5.1%,11.0
9,Arizona,23.89%,33.2%,5.6%,48.0


Overall Mental Health Standing is each state's combined score in 14 categories (7 adult and 7 youth), such as "Adults with Any Mental Illness" and "Youth With Severe Major Depressive Episodes (MDE)".

Better standings (ranks 1–10) indicate less mental illness and greater access to care; worse standings (ranks 38–50) indicate a higher prevalence of mental illness and reduced access to care.

## 3. Suicide Rates by State 2024

In [10]:
# Download the "Suicide Rates by State" page and parse its HTML.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() aborts on an HTTP error instead of parsing an error page.
response3 = requests.get(
    "https://worldpopulationreview.com/state-rankings/suicide-rates-by-state",
    timeout=30,
)
response3.raise_for_status()
soup3 = BeautifulSoup(response3.text, "html.parser")
soup3.title  # shown as a quick check that the expected page was fetched

<title>Suicide Rates by State 2024</title>

In [11]:
# Collect every <table> element; the last one on the page is the one used
# as the per-state table, and its <tr> elements become the rows to parse.
tables3 = soup3.find_all(name="table")
state_table3 = tables3[-1]
rows3 = state_table3.find_all(name="tr")

In [12]:
# Build one record per table row: the <th> text (state name) followed by
# the stripped text of every <td> in that row.
data3 = []
for tr in rows3[1:]:  # the first row is skipped as the header
    header_cell = tr.find("th")
    value_cells = tr.find_all("td")
    if not (header_cell and value_cells):
        # Rows lacking a <th> or having no <td> cells are ignored.
        continue
    data3.append([header_cell.text.strip(), *(cell.text.strip() for cell in value_cells)])

In [13]:
# Column labels in the order each record was assembled: state first,
# then the remaining cells of the row.
# The count column was previously misspelled "sucides"; no later cell
# selects it by name.
suicide_columns = ["state", "suicide rate (per 100k)", "suicides"]
suicide = pd.DataFrame(data3, columns=suicide_columns)
suicide

Unnamed: 0,state,suicide rate (per 100k),sucides
0,Wyoming,32.3,190
1,Montana,32.0,350
2,Alaska,30.8,220
3,New Mexico,25.0,533
4,South Dakota,23.2,203
5,Colorado,22.8,1384
6,Oklahoma,22.1,877
7,Nevada,21.5,691
8,North Dakota,20.8,156
9,Arkansas,20.6,618


Suicide rate is the number of deaths by suicide per 100,000 residents.

In [14]:
# The scraped rate is text; convert it to float before doing arithmetic.
rate_per_100k = suicide["suicide rate (per 100k)"].astype(float)
suicide["suicide rate (per 100k)"] = rate_per_100k
# per-100k -> percent: rate / 100_000 * 100 == rate / 1000
suicide["suicide rate (%)"] = rate_per_100k.div(1000).round(4)
suicide

Unnamed: 0,state,suicide rate (per 100k),sucides,suicide rate (%)
0,Wyoming,32.3,190,0.0323
1,Montana,32.0,350,0.032
2,Alaska,30.8,220,0.0308
3,New Mexico,25.0,533,0.025
4,South Dakota,23.2,203,0.0232
5,Colorado,22.8,1384,0.0228
6,Oklahoma,22.1,877,0.0221
7,Nevada,21.5,691,0.0215
8,North Dakota,20.8,156,0.0208
9,Arkansas,20.6,618,0.0206


# Merge dataframes

In [15]:
# Join the happiness and mental-illness tables on the state name.
# how="inner" is the default, spelled out here so it is visible that states
# missing from either table are dropped. validate="one_to_one" makes pandas
# raise a MergeError if either table has a duplicated state, rather than
# silently multiplying rows.
df = happiness.merge(mental_illness, on="state", how="inner", validate="one_to_one")
df.head()

Unnamed: 0,state,total happiness score,emotional & physical well-being rank,community & environment rank,work environment rank,rates of mental illness,adults with anxiety or depression,adults with severe mental illness,overll mental health standing (youth & adults
0,Hawaii,66.31,1,7,17,17.86%,23.9%,4.2%,14
1,Maryland,62.6,5,4,24,17.8%,29.1%,4.5%,9
2,Minnesota,62.43,10,9,2,23.23%,26.8%,4.9%,10
3,Utah,62.41,29,1,1,29.68%,32.1%,6.3%,42
4,New Jersey,61.71,2,19,28,18.27%,24.4%,4.1%,2


In [16]:
# Add the suicide statistics to the combined table, again keyed on state name.
# how="inner" is the default, spelled out here so it is visible that states
# missing from either table are dropped. validate="one_to_one" makes pandas
# raise a MergeError if either table has a duplicated state, rather than
# silently multiplying rows.
df2 = df.merge(suicide, on="state", how="inner", validate="one_to_one")
df2.head()

Unnamed: 0,state,total happiness score,emotional & physical well-being rank,community & environment rank,work environment rank,rates of mental illness,adults with anxiety or depression,adults with severe mental illness,overll mental health standing (youth & adults,suicide rate (per 100k),sucides,suicide rate (%)
0,Hawaii,66.31,1,7,17,17.86%,23.9%,4.2%,14,13.7,202,0.0137
1,Maryland,62.6,5,4,24,17.8%,29.1%,4.5%,9,9.7,620,0.0097
2,Minnesota,62.43,10,9,2,23.23%,26.8%,4.9%,10,13.9,808,0.0139
3,Utah,62.41,29,1,1,29.68%,32.1%,6.3%,42,20.1,643,0.0201
4,New Jersey,61.71,2,19,28,18.27%,24.4%,4.1%,2,7.1,688,0.0071


In [17]:
# Print the merged frame's column names, non-null counts and dtypes.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 12 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   state                                          50 non-null     object 
 1   total happiness score                          50 non-null     object 
 2   emotional & physical well-being rank           50 non-null     object 
 3   community & environment rank                   50 non-null     object 
 4   work environment rank                          50 non-null     object 
 5   rates of mental illness                        50 non-null     object 
 6   adults with anxiety or depression              50 non-null     object 
 7   adults with severe mental illness              50 non-null     object 
 8   overll mental health standing (youth & adults  50 non-null     object 
 9   suicide rate (per 100k)                        50 non-nu

In [18]:
# Keep only the three headline measures per state, give two of them shorter
# labels, and convert the text values to floats.
selected_columns = [
    "state",
    "total happiness score",
    "rates of mental illness",
    "suicide rate (%)",
]
renamed_columns = {
    "total happiness score": "happiness score",
    "rates of mental illness": "mental illness rate (%)",
}

psychological_condition = (
    df2[selected_columns]
    .rename(columns=renamed_columns)
    .assign(
        **{
            "happiness score": lambda frame: frame["happiness score"].astype(float),
            # Drop the trailing "%" sign before converting to float.
            "mental illness rate (%)": lambda frame: (
                frame["mental illness rate (%)"].str.rstrip("%").astype(float)
            ),
        }
    )
)

psychological_condition

Unnamed: 0,state,happiness score,mental illness rate (%),suicide rate (%)
0,Hawaii,66.31,17.86,0.0137
1,Maryland,62.6,17.8,0.0097
2,Minnesota,62.43,23.23,0.0139
3,Utah,62.41,29.68,0.0201
4,New Jersey,61.71,18.27,0.0071
5,Idaho,61.6,24.92,0.0205
6,California,59.97,20.49,0.0101
7,Illinois,58.59,20.72,0.0111
8,Nebraska,58.19,23.41,0.015
9,Connecticut,58.15,18.77,0.01


Happiness scores were determined by evaluating 30 indicators across three main categories: 1) Emotional & Physical Well-Being, 2) Work Environment, and 3) Community & Environment.

In [19]:
# Write the final table to disk without the index column.
# The "utf-8-sig" codec prepends a UTF-8 byte-order mark to the file.
output_csv_path = "../Data/psychological_condition.csv"
psychological_condition.to_csv(output_csv_path, index=False, encoding="utf-8-sig")