In [1]:
import pandas as pd

### Preprocess Wave 7 Data - Filter required variables 
This data will answer Question 1 and Question 2

In [2]:
# Read file as a DataFrame
wave7_csv = "data/wave_7.csv"
wave7 = pd.read_csv(wave7_csv, low_memory=False)

In [3]:
# Extract columns of interest
columns_to_extract = [
    'B_COUNTRY', 
    'A_YEAR', 
    'Q65', 
    'Q67', 
    'Q69', 
    'Q70', 
    'Q71', 
    'Q72', 
    'Q74', 
    'Q76', 
    'Q250', 
    'Q275', 
    'Q262', 
    'Q288', 
    'Q260', 
    'Q235', 
    'Q236', 
    'Q209', 
    'Q212', 
    ]
extracted_wave_7_data = wave7[columns_to_extract]
print(extracted_wave_7_data.head())

   B_COUNTRY  A_YEAR  Q65  Q67  Q69  Q70  Q71  Q72  Q74  Q76  Q250  Q275  \
0         20    2018   -4    1    1    1    1    1    1    1     8     3   
1         20    2018   -4    3    3    3    4    4    3    3    10     7   
2         20    2018   -4    4    2    2    3    3    3    3    10     7   
3         20    2018   -4    3    3    3    3    3    3    3     7     2   
4         20    2018   -4    3    2    2    2    3    3    3     8     2   

   Q262  Q288  Q260  Q235  Q236  Q209  Q212  
0    60     5     2     4     4     2     3  
1    47     9     1     4     4     1     2  
2    48     5     1     4     2     1     2  
3    62     4     2     3     2     2     2  
4    49     4     1     3     3     1     1  


In [4]:
# Rename the columns for better readability
extracted_wave_7_data.rename(columns = {
    'B_COUNTRY' : 'Country',
    'A_YEAR' : 'Year',
    'Q65' : 'C Armed forces',
    'Q69' : 'C Police',
    'Q70' : 'C Courts',
    'Q71' : 'C Government',
    'Q72' : 'C Political parties',
    'Q74' : 'C Civil services',
    'Q67' : 'C Television',
    'Q76' : 'C Elections',
    'Q250' : 'Importance of democracy',
    'Q275' : 'Highest educational level',
    'Q262' : 'Age',
    'Q288' : 'Scale of incomes',
    'Q260' : 'Sex',
    'Q235' : 'Strong Leader',
    'Q236' : 'Expert Non Govt Person',
    'Q209' : 'Signing a petition',
    'Q212' : 'Joining unofficial strikes',
}, inplace=True)

print(extracted_wave_7_data.head())

   Country  Year  C Armed forces  C Television  C Police  C Courts  \
0       20  2018              -4             1         1         1   
1       20  2018              -4             3         3         3   
2       20  2018              -4             4         2         2   
3       20  2018              -4             3         3         3   
4       20  2018              -4             3         2         2   

   C Government  C Political parties  C Civil services  C Elections  \
0             1                    1                 1            1   
1             4                    4                 3            3   
2             3                    3                 3            3   
3             3                    3                 3            3   
4             2                    3                 3            3   

   Importance of democracy  Highest educational level  Age  Scale of incomes  \
0                        8                          3   60              

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_wave_7_data.rename(columns = {


In [5]:
# Save the extracted data to a new file
output_path = "data/preprocessed/filtered_wave_7.csv"
extracted_wave_7_data.to_csv(output_path, index=False)

### Extract data for time series analysis

This data will answer question 3

In [6]:
# Read the time series data
time_series_csv = "data/time_series_1981_2022.csv"
time_series = pd.read_csv(time_series_csv, low_memory=False)

In [7]:
# Filter time series for only WVS
filtered_time_series = time_series[time_series['S001'] == 2]

# The result fase means that we have all data for WVS

print((filtered_time_series['S001'] == 1).any())

False


In [8]:
# Extract columns of interest
columns_to_extract_time_series = [
    'S001', 
    'S002VS', 
    'S003', 
    'COUNTRY_ALPHA', 
    'S020', 
    'E069_02', 
    'E069_06', 
    'E069_07', 
    'E069_08', 
    'E069_10', 
    'E069_11', 
    'E069_12', 
    'E069_17', 
    'E069_46', 
    'E069_49', 
    'E069_64', 
    'E069_65', 
    'E119',
    'X025',
    'X003',
    'X047_WVS'
]

extracted_time_series_data = filtered_time_series[columns_to_extract_time_series]
print(extracted_time_series_data.head())

   S001  S002VS  S003 COUNTRY_ALPHA  S020  E069_02  E069_06  E069_07  E069_08  \
0     2       3     8           ALB  1998        3        3        3        2   
1     2       3     8           ALB  1998        3        3        3        3   
2     2       3     8           ALB  1998        3        3        3        3   
3     2       3     8           ALB  1998        2        2        2        2   
4     2       3     8           ALB  1998        3        3        3        3   

   E069_10  ...  E069_12  E069_17  E069_46  E069_49  E069_64  E069_65  E119  \
0        3  ...        3        3       -4       -4       -4       -4     1   
1        3  ...        3        3       -4       -4       -4       -4     1   
2        3  ...        3        3       -4       -4       -4       -4     1   
3        3  ...        3        3       -4       -4       -4       -4     1   
4        3  ...        3        3       -4       -4       -4       -4     1   

   X025  X003  X047_WVS  
0     3    1

In [9]:
# Rename the columns for better readability
extracted_time_series_data.rename(columns = {
    'S001' : 'Study',
    'S002VS' : 'WVS wave',
    'S003' : 'Country',
    'COUNTRY_ALPHA': 'Country name',
    'S020': 'Survey year',
    'E069_02': 'C Armed Forces ',
    'E069_06': 'C Police',
    'E069_07': 'C Parliament',
    'E069_08': 'C Civil Services',
    'E069_10': 'C Television',
    'E069_11': 'C Government',
    'E069_12': 'C Political Parties',
    'E069_17':'C Courts',
    'E069_46': 'C NGOs',
    'E069_49': 'C TV News',
    'E069_64': 'C Elections',
    'E069_65': 'C International Criminal Court',
    'E119' : 'Government order vs. freedom',
    'X025':'Highest educational level',
    'X003':'Age',
    'X047_WVS':'Scale of incomes',
}, inplace=True)

print(extracted_time_series_data.head())

   Study  WVS wave  Country Country name  Survey year  C Armed Forces   \
0      2         3        8          ALB         1998                3   
1      2         3        8          ALB         1998                3   
2      2         3        8          ALB         1998                3   
3      2         3        8          ALB         1998                2   
4      2         3        8          ALB         1998                3   

   C Police  C Parliament  C Civil Services  C Television  ...  \
0         3             3                 2             3  ...   
1         3             3                 3             3  ...   
2         3             3                 3             3  ...   
3         2             2                 2             3  ...   
4         3             3                 3             3  ...   

   C Political Parties  C Courts  C NGOs  C TV News  C Elections  \
0                    3         3      -4         -4           -4   
1                    3

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extracted_time_series_data.rename(columns = {


In [10]:
# Save the extracted data to a new file
output_path = "data/preprocessed/filtered_time_series_1981_2022.csv"
extracted_time_series_data.to_csv(output_path, index=False)