In [2]:
import pandas as pd

# Load the cleaned dataset
file_path = 'data/wave7.csv'  # Update this if necessary
# low_memory=False lets pandas infer each column's dtype from the whole file
# rather than chunk by chunk. NOTE(review): the recorded output shows a warning
# pointing at this read_csv call -- presumably a mixed-type DtypeWarning; confirm.
df = pd.read_csv(file_path, low_memory=False)

# Step 1: Check for the country variable
# The country code is expected in the 'B_COUNTRY' column.
country_var = 'B_COUNTRY'
if country_var not in df.columns:
    raise ValueError("Country variable not found in the dataset. Check the codebook or dataset.")
# Report the column that was actually checked (the message previously named 'S003').
print(f"Country variable found: {country_var} (ISO Numeric Country Codes)")

# Step 2: Count respondents by country (value_counts sorts by count, descending)
country_counts = df[country_var].value_counts()

# Step 3: Display countries with respondent counts
print("\nRespondents by Country:")
print(country_counts)

# Save the country summary for reference
country_counts.to_csv('country_summary.csv', header=['Respondents'], index_label='Country_Code')
print("\nCountry summary saved as 'country_summary.csv'.")

# Optional: Filter dataset for a specific country for testing
selected_country_code = 840  # Replace with your chosen country code (e.g., 840 = USA)
df_selected_country = df[df[country_var] == selected_country_code]

# Display first few rows of the filtered dataset for the selected country
print(f"\nData for Selected Country (Code: {selected_country_code}):")
print(df_selected_country.head())


  df = pd.read_csv(file_path)


Country variable found: S003 (ISO Numeric Country Codes)

Respondents by Country:
B_COUNTRY
124    4018
360    3200
156    3036
826    2609
840    2596
       ... 
32     1003
152    1000
196    1000
858    1000
909     447
Name: count, Length: 66, dtype: int64

Country summary saved as 'country_summary.csv'.

Data for Selected Country (Code: 840):
                  version                        doi  A_WAVE  A_YEAR  A_STUDY  \
89769  6-0-0 (2024-04-30)  doi.org/10.14281/18241.24       7    2017        2   
89770  6-0-0 (2024-04-30)  doi.org/10.14281/18241.24       7    2017        2   
89771  6-0-0 (2024-04-30)  doi.org/10.14281/18241.24       7    2017        2   
89772  6-0-0 (2024-04-30)  doi.org/10.14281/18241.24       7    2017        2   
89773  6-0-0 (2024-04-30)  doi.org/10.14281/18241.24       7    2017        2   

       B_COUNTRY B_COUNTRY_ALPHA  C_COW_NUM C_COW_ALPHA  D_INTERVIEW  ...  \
89769        840             USA          2         USA    840071001  ...   
89770   

In [3]:
import pandas as pd

# Load the dataset
file_path = 'data/wave7.csv'
# low_memory=False lets pandas infer each column's dtype from the whole file
# rather than chunk by chunk. NOTE(review): the recorded output shows a warning
# pointing at this read_csv call -- presumably a mixed-type DtypeWarning; confirm.
df = pd.read_csv(file_path, low_memory=False)

# Define the country variable
country_var = 'B_COUNTRY'  # ISO numeric country codes

# List of countries to consider
countries_to_analyze = [124, 840, 156, 792, 360]  # Canada, USA, China, Turkey, Indonesia

# Step 1: Count respondents for selected countries
# reindex (instead of .loc) keeps the requested order and reports 0 for a code
# that has no rows, where .loc would raise KeyError.
selected_country_counts = (
    df[country_var]
    .value_counts()
    .reindex(countries_to_analyze, fill_value=0)
)
print("Respondents in Selected Countries:")
print(selected_country_counts)

# Step 2: Filter dataset for a specific country
selected_country_code = 840  # Example: USA
df_selected_country = df[df[country_var] == selected_country_code]

# Step 3: Display the first few rows for the selected country
print(f"\nData for Selected Country (Code: {selected_country_code}):")
print(df_selected_country.head())

# Step 4: Save the filtered dataset
filtered_file_path = f'filtered_country_{selected_country_code}.csv'
df_selected_country.to_csv(filtered_file_path, index=False)
print(f"\nFiltered dataset for country {selected_country_code} saved as '{filtered_file_path}'.")


  df = pd.read_csv(file_path)


Respondents in Selected Countries:
B_COUNTRY
124    4018
840    2596
156    3036
792    2415
360    3200
Name: count, dtype: int64

Data for Selected Country (Code: 840):
                  version                        doi  A_WAVE  A_YEAR  A_STUDY  \
89769  6-0-0 (2024-04-30)  doi.org/10.14281/18241.24       7    2017        2   
89770  6-0-0 (2024-04-30)  doi.org/10.14281/18241.24       7    2017        2   
89771  6-0-0 (2024-04-30)  doi.org/10.14281/18241.24       7    2017        2   
89772  6-0-0 (2024-04-30)  doi.org/10.14281/18241.24       7    2017        2   
89773  6-0-0 (2024-04-30)  doi.org/10.14281/18241.24       7    2017        2   

       B_COUNTRY B_COUNTRY_ALPHA  C_COW_NUM C_COW_ALPHA  D_INTERVIEW  ...  \
89769        840             USA          2         USA    840071001  ...   
89770        840             USA          2         USA    840071002  ...   
89771        840             USA          2         USA    840071003  ...   
89772        840             USA  

In [15]:
import pandas as pd

# Read the cleaned Wave 7 extract (one row per respondent, with a 'Country' column)
wave7_data = pd.read_csv('data/final_cleaned_wave7_data.csv')  # Replace with your actual file path

# Indicators picked from the earlier heatmap/analysis step
selected_variables = [
    'Trust_in_Political_Parties',
    'Confidence_in_Government',
    'Confidence_in_Parliament',
    'Social_Trust',
    'Importance_of_Religion',
    # NOTE(review): this column was previously annotated "Importance of Tradition",
    # which does not match its name -- confirm the intended variable.
    'Redistribution_as_Moral',
]

# Per-country mean of each selected indicator
by_country = wave7_data.groupby('Country')
summary_stats = by_country[selected_variables].mean()

# Collapse the indicators into one score per country (row-wise mean), highest first.
# NOTE(review): assumes the indicators share a comparable scale and direction -- TODO confirm.
country_scores = summary_stats.mean(axis='columns')
ranked_countries = country_scores.sort_values(ascending=False)

# Keep the five highest-scoring countries
top_5_countries = ranked_countries.iloc[:5]

# Show the result
print("Top 5 countries based on selected variables:", top_5_countries, sep="\n")

# Persist the ranking
top_5_countries.to_csv('top_5_countries.csv')


Top 5 countries based on selected variables:
Country
417    4.407500
762    4.328333
818    4.323472
586    4.310443
231    4.289837
dtype: float64


In [20]:
import pandas as pd

# Load the cleaned dataset with countries
file_path = 'data/final_cleaned_wave7_data.csv'  # Replace with your actual file path
df = pd.read_csv(file_path)

# Build the per-country grouping once and reuse it for every aggregate below.
by_country = df.groupby('Country')

# Step 1: Per-country mean of every column
grouped_by_country = by_country.mean()

# Step 2: Compute data completeness (percentage of non-missing data).
# Non-missing count per column divided by the group's row count; .div(axis=0)
# aligns on the Country index instead of relying on positional row order.
# The per-column percentages are then averaged into one figure per country.
non_missing_pct = by_country.count().div(by_country.size(), axis=0) * 100
data_completeness = non_missing_pct.mean(axis=1)

# Combine statistics and completeness into a single DataFrame
country_stats = grouped_by_country.assign(Data_Completeness=data_completeness)

# Sort by data completeness first, then by the mean level of each trust/confidence
# variable (all descending). Note: this ranks on mean levels, not on variability.
sort_columns = [
    'Data_Completeness',
    'Trust_in_Political_Parties',
    'Confidence_in_Government',
    'Confidence_in_Parliament',
    'Confidence_in_Judiciary',
    'Confidence_in_Police',
]
sorted_countries = country_stats.sort_values(by=sort_columns, ascending=False)

# Step 3: Take the first 4 countries of that ordering.
# Note: no regional-diversity criterion is applied here; if diversity matters,
# it has to be enforced separately.
top_4_countries = sorted_countries.head(4)

# Display the selected countries and their statistics
print("Selected 4 Countries for Analysis:")
print(top_4_countries)

# Save the selected countries to a file
top_4_countries.to_csv('selected_4_countries_for_analysis.csv')
print("Selected countries saved as 'selected_4_countries_for_analysis.csv'.")


Selected 4 Countries for Analysis:
         Trust_in_Political_Parties  Confidence_in_Government  \
Country                                                         
762                        3.979167                  2.862500   
586                        3.944361                  2.741353   
50                         3.920000                  2.807500   
818                        3.636667                  2.891667   

         Confidence_in_Parliament  Confidence_in_Judiciary  \
Country                                                      
762                      2.749167                 2.221667   
586                      2.604010                 2.523810   
50                       2.464167                 2.665833   
818                      2.930000                 2.580833   

         Confidence_in_Police  Social_Trust  Trust_in_Family  \
Country                                                        
762                  2.630833      1.794167         1.058333   
586      

In [21]:
import pandas as pd

# Define G8 country codes based on the provided table
g8_country_codes = [840, 124, 250, 276, 380, 392, 643, 826]  # USA, Canada, France, Germany, Italy, Japan, Russia, UK

# Load the CSV file
file_path = 'data/final_cleaned_wave7_data.csv'  # Replace with your actual dataset path
df = pd.read_csv(file_path)

# Filter the dataset for G8 countries
g8_filtered_data = df[df['Country'].isin(g8_country_codes)]

# isin() silently ignores codes that have no rows, so report any requested
# country that is absent from the data instead of dropping it unnoticed.
present_codes = set(g8_filtered_data['Country'].unique().tolist())
missing_codes = [code for code in g8_country_codes if code not in present_codes]
if missing_codes:
    print(f"Warning: no rows found for country codes {missing_codes}.")

# Save the filtered dataset to a new CSV file
g8_filtered_data.to_csv('data/g8_filtered_data.csv', index=False)

# Display confirmation and preview of the filtered data
print("Filtered dataset for G8 countries saved as 'g8_filtered_data.csv'.")
print("Preview of G8 filtered data:")
print(g8_filtered_data.head())


Filtered dataset for G8 countries saved as 'g8_filtered_data.csv'.
Preview of G8 filtered data:
       Trust_in_Political_Parties  Confidence_in_Government  \
10072                         1.0                       2.0   
10073                         5.0                       2.0   
10074                         1.0                       3.0   
10075                         2.0                       2.0   
10076                         1.0                       1.0   

       Confidence_in_Parliament  Confidence_in_Judiciary  \
10072                       2.0                      1.0   
10073                       1.0                      2.0   
10074                       3.0                      2.0   
10075                       3.0                      1.0   
10076                       1.0                      1.0   

       Confidence_in_Police  Social_Trust  Trust_in_Family  \
10072                   1.0           2.0              2.0   
10073                   1.0           2.