In [41]:
import pandas as pd

# Load the CSV file
# Replace 'file.csv' with the path to your CSV file
file_path = 'worldmarriage_edited.csv'
data = pd.read_csv(file_path)

  data = pd.read_csv(file_path)


In [15]:
def calculate_rates(df, group_by_columns):
    """Calculates rates of MaritalStatus."""
    grouped = df.groupby(group_by_columns)['DataValue'].sum().reset_index()
    if len(group_by_columns) > 1:
        total = grouped.groupby(group_by_columns[:-1])['DataValue'].transform('sum')
    else:
        total = grouped['DataValue'].sum()
    grouped['Rate'] = grouped['DataValue'] / total * 100  # Convert to percentages
    if len(group_by_columns) > 1:
        return grouped.pivot(index=group_by_columns[:-1], columns='MaritalStatus', values='Rate')
    else:
        return grouped.set_index(group_by_columns)['Rate']

In [16]:
# Overall rates
overall_rates = calculate_rates(data, ['MaritalStatus'])
print("Overall Marital Status Rates:")
print(overall_rates)


Overall Marital Status Rates:
MaritalStatus
Consensual union                                  1.793444
Consensual union, not living together             0.044541
Currently not married                             0.305211
Currently not married nor in consensual union     0.019183
Divorced                                          3.491321
Divorced or Separated                             0.249871
Divorced or Separated or Widowed                  0.014957
Divorced or Widowed                               0.005183
Ever married                                      0.562306
Living together                                   0.961968
Marriage contract                                 0.000938
Married                                          47.621955
Married gaunna not performed                      0.000118
Married monogamous                                0.045427
Married or Living together                        0.817610
Married or in consensual union                    2.141617
Married or m

In [49]:
# Combine categories into broader groups
def combine_marital_status_rates(rates):
    # Initialize categories
    combined = {
        "Married": rates[rates.index.str.contains("married", case=False)].sum(),
        "Divorced": rates[rates.index.str.contains("divorced", case=False)].sum(),
        "Widowed": rates[rates.index.str.contains("widowed", case=False)].sum()
    }
    # Calculate "Other" as the remaining percentage
    combined["Other"] = 100 - sum(combined.values())
    return pd.Series(combined)

# Apply to overall rates
overall_combined_rates = combine_marital_status_rates(overall_rates)
print("Combined Overall Marital Status Rates:")
print(overall_combined_rates)


Combined Overall Marital Status Rates:
Married     56.673239
Divorced     3.819619
Widowed      7.956958
Other       31.550184
dtype: float64


In [50]:
# Rates for specific countries
countries_of_interest = [
    "Mexico", "United States of America", "Argentina", "United Kingdom", "Russian Federation", "Ukraine",
    "France", "Turkey", "State of Palestine", "Egypt", "Iran (Islamic Republic of)", "Azerbaijan",
    "Nigeria", "Republic of Korea", "China", "India", "Thailand", "Indonesia"
]  # Replace with your list of countries
country_rates = calculate_rates(data[data['Country or area'].isin(countries_of_interest)], 
                                ['Country or area', 'MaritalStatus'])

combined_country_rates = country_rates.apply(combine_marital_status_rates, axis=1)
print("\nCombined Marital Status Rates for Specific Countries:")
print(combined_country_rates)


Combined Marital Status Rates for Specific Countries:
                              Married  Divorced    Widowed      Other
Country or area                                                      
Argentina                   49.271553  2.775113   8.358394  39.594940
Azerbaijan                  70.477525  1.563610  10.859633  17.099232
China                        4.987555  0.079601   0.391239  94.541605
Egypt                       77.008092  1.039964  13.491101   8.460843
France                      56.637019  3.976218   8.519894  30.866868
India                       75.625763  1.019135  12.803192  10.551909
Indonesia                   71.932280  2.284918  13.210676  12.572126
Iran (Islamic Republic of)  69.662442  2.046784   7.207446  21.083328
Mexico                      49.491859  2.119148   8.064253  40.324740
Nigeria                     81.652162  1.396516   7.236282   9.715040
Republic of Korea           60.213971  1.594426  13.840622  24.350981
Russian Federation          71.3305

In [51]:
# Rates for specific age groups and countries
age_groups_of_interest = ['[15-19]', '[20-24]','[25-29]', '[30-34]', '[35-39]', 
                          '[40-44]', '[45-49]', '[50-54]', '[55-59]', '[60-64]', '[65+]']  # Replace with your list of age groups
age_country_rates = calculate_rates(
    data[(data['Country or area'].isin(countries_of_interest)) & 
         (data['AgeGroup'].isin(age_groups_of_interest))],
    ['Country or area', 'AgeGroup', 'MaritalStatus']
)

combined_age_country_rates = age_country_rates.apply(combine_marital_status_rates, axis=1)
print("\nCombined Marital Status Rates for Specific Age Groups and Countries:")
print(combined_age_country_rates)


Combined Marital Status Rates for Specific Age Groups and Countries:
                                     Married   Divorced    Widowed      Other
Country or area          AgeGroup                                            
Argentina                [15-19]    2.362205   0.087489   0.049994  97.500312
                         [20-24]   18.530735   0.509745   0.109945  80.849575
                         [25-29]   42.293083   1.309476   0.229908  56.167533
                         [30-34]   57.214279   2.289771   0.479952  40.015998
                         [35-39]   63.830851   3.239028   0.939718  31.990403
...                                      ...        ...        ...        ...
United States of America [45-49]   67.736453  14.057189   1.989602  16.216757
                         [50-54]   67.800000  14.900000   3.210000  14.090000
                         [55-59]   70.260621  13.223849   6.226348  10.289182
                         [60-64]   68.978571  12.128571  10.085714   8.8

In [25]:
# Overall rates by gender
gender_overall_rates = calculate_rates(data, ['Sex', 'MaritalStatus'])
gender_combined_rates = gender_overall_rates.apply(combine_marital_status_rates, axis=1)
print("Combined Marital Status Rates by Gender:")
print(gender_combined_rates)




Combined Marital Status Rates by Gender:
         Married  Divorced    Widowed       Other
Sex                                              
1970    0.000000  0.000000   0.000000  100.000000
1971    0.000000  0.000000   0.000000  100.000000
1980    0.000000  0.000000   0.000000  100.000000
1981    0.000000  0.000000   0.000000  100.000000
1986    0.000000  0.000000   0.000000  100.000000
1990    0.000000  0.000000   0.000000  100.000000
1991    0.000000  0.000000   0.000000  100.000000
1996    0.000000  0.000000   0.000000  100.000000
2000    0.000000  0.000000   0.000000  100.000000
2001    0.000000  0.000000   0.000000  100.000000
2006    0.000000  0.000000   0.000000  100.000000
2011    0.000000  0.000000   0.000000  100.000000
2016    0.000000  0.000000   0.000000  100.000000
Men    62.340968  3.263234   3.623565   30.772233
Women  56.268504  4.665692  12.673861   26.391943


In [26]:

# Rates for specific countries by gender
countries_of_interest = ['USA', 'Canada', 'India']  # Replace with your list of countries
gender_country_rates = calculate_rates(
    data[data['Country or area'].isin(countries_of_interest)],
    ['Country or area', 'Sex', 'MaritalStatus']
)

gender_combined_country_rates = gender_country_rates.apply(combine_marital_status_rates, axis=1)
print("\nCombined Marital Status Rates for Specific Countries by Gender:")
print(gender_combined_country_rates)




Combined Marital Status Rates for Specific Countries by Gender:
                         Married  Divorced    Widowed      Other
Country or area Sex                                             
Canada          Men    57.464984  3.802974   3.358511  35.373531
                Women  51.949868  5.180414  11.623313  31.246405
India           Men    79.277289  0.543292   6.027955  14.151465
                Women  72.441588  1.434076  18.711282   7.413053


In [40]:
# Function to combine marital statuses into broader categories
def combine_marital_status_rates_fixed(group):
    """Combines marital statuses into broader categories."""
    # Ensure 'MaritalStatus' and 'Rate' are present
    rates = group.set_index('MaritalStatus')['Rate']
    combined = {
        "Married": rates[rates.index.str.contains("married", case=False)].sum(),
        "Divorced": rates[rates.index.str.contains("divorced", case=False)].sum(),
        "Widowed": rates[rates.index.str.contains("widowed", case=False)].sum()
    }
    # Calculate "Other" as the remaining percentage
    combined["Other"] = 100 - sum(combined.values())
    return pd.Series(combined)

# Filter data for specific age groups and countries
filtered_data = data[
    (data['Country or area'].isin(countries_of_interest)) &
    (data['AgeGroup'].isin(age_groups_of_interest))
]

# Calculate rates for specific age groups, countries, and genders
gender_age_country_rates = calculate_rates(
    filtered_data,
    ['Country or area', 'AgeGroup', 'Sex', 'MaritalStatus']
).reset_index()

# Combine rates into broader categories for each gender, age group, and country
gender_combined_age_country_rates = (
    gender_age_country_rates.groupby(['Country or area', 'AgeGroup', 'Sex'], as_index=False)
    .apply(lambda group: combine_marital_status_rates_fixed(group))
)

print("\nCombined Marital Status Rates for Specific Age Groups, Countries, and Gender:")
print(gender_combined_age_country_rates)



KeyError: "None of ['MaritalStatus'] are in the columns"

In [53]:
# List of age groups of interest
age_groups_of_interest = ['[15-19]', '[20-24]', '[25-29]', '[30-34]', '[35-39]', 
                          '[40-44]', '[45-49]', '[50-54]', '[55-59]', '[60-64]', '[65+]']

# List of countries of interest
countries_of_interest = [
    "Mexico", "United States of America", "Argentina", "United Kingdom", "Russian Federation", "Ukraine",
    "France", "Turkey", "State of Palestine", "Egypt", "Iran (Islamic Republic of)", "Azerbaijan",
    "Nigeria", "Republic of Korea", "China", "India", "Thailand", "Indonesia"
]

# Filter the dataset for the countries and age groups of interest
filtered_data = data[
    (data["Country or area"].isin(countries_of_interest)) &
    (data["AgeGroup"].isin(age_groups_of_interest))
]

# Categorize marital statuses into 'Married', 'Divorced', 'Widowed', and 'Other'
def categorize_marital_status(status):
    if "married" in status.lower():
        return "Married"
    elif "divorced" in status.lower():
        return "Divorced"
    elif "widowed" in status.lower():
        return "Widowed"
    else:
        return "Other"

filtered_data["MaritalCategory"] = filtered_data["MaritalStatus"].apply(categorize_marital_status)

# Group by country, age group, and marital category to calculate rates
grouped_data = filtered_data.groupby(["Country or area", "AgeGroup", "MaritalCategory"]).agg(
    Total=("DataValue", "sum")
).reset_index()

# Calculate total population for each country and age group
total_population = grouped_data.groupby(["Country or area", "AgeGroup"])["Total"].transform("sum")
grouped_data["Rate"] = (grouped_data["Total"] / total_population) * 100  # Convert to percentage

# Pivot table to organize marital status rates
pivot_table = grouped_data.pivot_table(
    index=["Country or area", "AgeGroup"],
    columns="MaritalCategory",
    values="Rate",
    fill_value=0
).reset_index()

# Add "Other" category as 100% - sum of 'Married', 'Divorced', and 'Widowed'
pivot_table["Other"] = 100 - pivot_table[["Married", "Divorced", "Widowed"]].sum(axis=1)

# Calculate average rates for all countries
average_rates = pivot_table.drop(columns=["Country or area"]).groupby("AgeGroup").mean().reset_index()

# Save the results to CSVs
pivot_table.to_csv("marital_status_rates_by_country.csv", index=False)
average_rates.to_csv("average_marital_status_rates.csv", index=False)

# Output the result
print("Marital status rates by country and age group saved to 'marital_status_rates_by_country.csv'")
print("Average marital status rates across countries saved to 'average_marital_status_rates.csv'")

Marital status rates by country and age group saved to 'marital_status_rates_by_country.csv'
Average marital status rates across countries saved to 'average_marital_status_rates.csv'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data["MaritalCategory"] = filtered_data["MaritalStatus"].apply(categorize_marital_status)


In [57]:


# List of age groups of interest
age_groups_of_interest = ['[15-19]', '[20-24]', '[25-29]', '[30-34]', '[35-39]', 
                          '[40-44]', '[45-49]', '[50-54]', '[55-59]', '[60-64]', '[65+]']

# List of countries of interest
countries_of_interest = [
    "Mexico", "United States of America", "Argentina", "United Kingdom", "Russian Federation", "Ukraine",
    "France", "Turkey", "State of Palestine", "Egypt", "Iran (Islamic Republic of)", "Azerbaijan",
    "Nigeria", "Republic of Korea", "China", "India", "Thailand", "Indonesia"
]

# Filter the dataset for the countries and age groups of interest
filtered_data = data[
    (data["Country or area"].isin(countries_of_interest)) &
    (data["AgeGroup"].isin(age_groups_of_interest))
]

# Convert age groups to mean ages
def calculate_mean_age(age_group):
    if "+" in age_group:
        return 67.5  # Handle 65+ as 67.5
    start, end = map(int, age_group.strip("[]").split('-'))
    return (start + end) / 2

filtered_data["MeanAge"] = filtered_data["AgeGroup"].apply(calculate_mean_age)

# Categorize marital statuses into 'Married'
def categorize_marital_status(status):
    if "married" in status.lower():
        return "Married"
    return "Other"

filtered_data["MaritalCategory"] = filtered_data["MaritalStatus"].apply(categorize_marital_status)

# Filter for "Married" category and calculate mean percentage for each age group
married_data = filtered_data[filtered_data["MaritalCategory"] == "Married"]
grouped_data = married_data.groupby(["Country or area", "AgeGroup"]).agg(
    MeanAge=("MeanAge", "first"),  # Keep the mean age for the age group
    MarriedPercentage=("DataValue", "mean")  # Average percentage of married individuals
).reset_index()

# Map countries to continents
continent_mapping = {
    "Mexico": "North America", "United States of America": "North America",
    "Argentina": "South America", "United Kingdom": "Europe",
    "Russian Federation": "Europe", "Ukraine": "Europe",
    "France": "Europe", "Turkey": "Asia",
    "State of Palestine": "Asia", "Egypt": "Africa",
    "Iran (Islamic Republic of)": "Asia", "Azerbaijan": "Asia",
    "Nigeria": "Africa", "Republic of Korea": "Asia",
    "China": "Asia", "India": "Asia",
    "Thailand": "Asia", "Indonesia": "Asia"
}

grouped_data["Continent"] = grouped_data["Country or area"].map(continent_mapping)

# Save the result to a CSV
output_filename = "country_age_group_married_percentage.csv"
grouped_data.to_csv(output_filename, index=False)

# Output the result
print(f"Processed data saved to '{output_filename}'")


Processed data saved to 'country_age_group_married_percentage.csv'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data["MeanAge"] = filtered_data["AgeGroup"].apply(calculate_mean_age)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data["MaritalCategory"] = filtered_data["MaritalStatus"].apply(categorize_marital_status)


In [45]:
# List of age groups of interest
age_groups_of_interest = ['[15-19]', '[20-24]', '[25-29]', '[30-34]', '[35-39]', 
                          '[40-44]', '[45-49]', '[50-54]', '[55-59]', '[60-64]', '[65+]']

# List of countries of interest
countries_of_interest = [
    "Mexico", "United States of America", "Argentina", "United Kingdom", "Russian Federation", "Ukraine",
    "France", "Turkey", "State of Palestine", "Egypt", "Iran (Islamic Republic of)", "Azerbaijan",
    "Nigeria", "Republic of Korea", "China", "India", "Thailand", "Indonesia"
]

# Filter the dataset for the countries, age groups, and sexes of interest
filtered_data = data[
    (data["Country or area"].isin(countries_of_interest)) &
    (data["AgeGroup"].isin(age_groups_of_interest))
]

# Categorize marital statuses into 'Married', 'Divorced', 'Widowed', and 'Other'
def categorize_marital_status(status):
    if "married" in status.lower():
        return "Married"
    elif "divorced" in status.lower():
        return "Divorced"
    elif "widowed" in status.lower():
        return "Widowed"
    else:
        return "Other"

filtered_data["MaritalCategory"] = filtered_data["MaritalStatus"].apply(categorize_marital_status)

# Group by country, age group, sex, and marital category to calculate rates
grouped_data = filtered_data.groupby(["Country or area", "AgeGroup", "Sex", "MaritalCategory"]).agg(
    Total=("DataValue", "sum")
).reset_index()

# Calculate total population for each country, age group, and sex
total_population = grouped_data.groupby(["Country or area", "AgeGroup", "Sex"])["Total"].transform("sum")
grouped_data["Rate"] = (grouped_data["Total"] / total_population) * 100  # Convert to percentage

# Pivot table to organize marital status rates
pivot_table = grouped_data.pivot_table(
    index=["Country or area", "AgeGroup", "Sex"],
    columns="MaritalCategory",
    values="Rate",
    fill_value=0
).reset_index()

# Add "Other" category as 100% - sum of 'Married', 'Divorced', and 'Widowed'
pivot_table["Other"] = 100 - pivot_table[["Married", "Divorced", "Widowed"]].sum(axis=1)

# Calculate average rates for all countries by sex
average_rates = pivot_table.drop(columns=["Country or area"]).groupby(["AgeGroup", "Sex"]).mean().reset_index()

# Mapping for file-friendly country names
country_name_mapping = {
    "Mexico": "Mexico",
    "United States of America": "United States",
    "Argentina": "Argentina",
    "United Kingdom": "Britain",
    "Russian Federation": "Russia",
    "Ukraine": "Ukraine",
    "France": "France",
    "Turkey": "Turkey",
    "State of Palestine": "Palestine Ter.",
    "Egypt": "Egypt",
    "Iran (Islamic Republic of)": "Iran",
    "Azerbaijan": "Azerbaijan",
    "Nigeria": "Nigeria",
    "Republic of Korea": "South Korea",
    "China": "China",
    "India": "India",
    "Thailand": "Thailand",
    "Indonesia": "Indonesia"
}

# Replace country names in the dataset for consistency in output
pivot_table["Country or area"] = pivot_table["Country or area"].replace(country_name_mapping)

# Save the results to CSVs with updated naming conventions
pivot_table.to_csv("marital_status_rates_by_country_and_sex.csv", index=False)
average_rates.to_csv("average_marital_status_rates_by_sex.csv", index=False)

print("Marital status rates by country, age group, and sex saved to 'marital_status_rates_by_country_and_sex.csv'")
print("Average marital status rates by sex across countries saved to 'average_marital_status_rates_by_sex.csv'")

# Save the results to CSVs
pivot_table.to_csv("marital_status_rates_by_country_and_sex.csv", index=False)
average_rates.to_csv("average_marital_status_rates_by_sex.csv", index=False)

# Output the result
print("Marital status rates by country, age group, and sex saved to 'marital_status_rates_by_country_and_sex.csv'")
print("Average marital status rates by sex across countries saved to 'average_marital_sta'")

Marital status rates by country, age group, and sex saved to 'marital_status_rates_by_country_and_sex.csv'
Average marital status rates by sex across countries saved to 'average_marital_status_rates_by_sex.csv'
Marital status rates by country, age group, and sex saved to 'marital_status_rates_by_country_and_sex.csv'
Average marital status rates by sex across countries saved to 'average_marital_sta'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data["MaritalCategory"] = filtered_data["MaritalStatus"].apply(categorize_marital_status)
