In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
plt.style.use('ggplot')
warnings.filterwarnings("ignore")

import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

In [4]:
wave7 = pd.read_excel("Wave_7.xlsx").dropna()
wave7.head()

Unnamed: 0,Year of survey,Q260: Sex,Age interval,Q173: Religious person,Q275R: Highest educational level,Q288R: Income level
0,2018,2,1,2,3,2
1,2018,1,1,-2,1,2
2,2018,1,3,1,1,2
3,2018,2,2,-2,3,3
4,2018,2,2,-2,3,1


In [6]:
wave7.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2415 entries, 0 to 2414
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype
---  ------                            --------------  -----
 0   Year of survey                    2415 non-null   int64
 1   Q260: Sex                         2415 non-null   int64
 2   Age interval                      2415 non-null   int64
 3   Q173: Religious person            2415 non-null   int64
 4   Q275R: Highest educational level  2415 non-null   int64
 5   Q288R: Income level               2415 non-null   int64
dtypes: int64(6)
memory usage: 132.1 KB


In [120]:
columns = wave7.columns

for i in columns:
    print(i, wave7[i].unique())

Year of survey [2018]
Q260: Sex [2 1]
Age interval [ 1  3  2 -5]
Q173: Religious person [ 2 -2  1 -1 -5  3]
Q275R: Highest educational level [ 3  1  2 -2 -1]
Q288R: Income level [ 2  3  1 -2 -1]


In [121]:
wave7['Female'] = pd.get_dummies(wave7['Q260: Sex']).drop(columns=1).rename(columns={2: 'Female'})

wave7['Age interval'] = wave7['Age interval'].replace({1:1, 2:1, 3:0, -5:0})
wave7['Below 50'] = pd.get_dummies(wave7['Age interval']).drop(columns=0).rename(columns={0: 'Below 50'})

wave7['Q173: Religious person'] = wave7['Q173: Religious person'].replace({2:0, 3:0, -2:0, -5:0, -1:0})
wave7['Religious'] = pd.get_dummies(wave7['Q173: Religious person']).drop(columns=0).rename(columns={1: 'Religious'})

wave7['Q275R: Highest educational level'] = wave7['Q275R: Highest educational level'].replace({-2:1, -1:1})
wave7[['Lower', 'Middle', 'Higher']] = pd.get_dummies(wave7['Q275R: Highest educational level']).rename(columns = {1: 'Lower', 
                                                                                                                   2: 'Middle', 
                                                                                                                   3: 'Higher'})
wave7['Q288R: Income level'] = wave7['Q288R: Income level'].replace({-2:1, -1:1})
wave7['Wealthy'] = pd.get_dummies(wave7['Q288R: Income level']).drop(columns=[1 , 2]).rename(columns={3: 'Wealthy'})

In [122]:
new_wave7 = wave7[["Year of survey", "Female", "Below 50", "Religious",  "Lower", "Middle", "Higher", "Wealthy"]]
new_wave7.head()

Unnamed: 0,Year of survey,Female,Below 50,Religious,Lower,Middle,Higher,Wealthy
0,2018,1,1,0,0,0,1,0
1,2018,0,1,0,1,0,0,0
2,2018,0,0,1,1,0,0,0
3,2018,1,1,0,0,0,1,1
4,2018,1,1,0,0,0,1,0


In [123]:
y = new_wave7['Religious']
X = new_wave7[['Female', 'Below 50', 'Lower', 'Higher', 'Wealthy']]

model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:              Religious   R-squared (uncentered):                 -10.336
Model:                            OLS   Adj. R-squared (uncentered):            -10.359
Method:                 Least Squares   F-statistic:                             -439.5
Date:                Sat, 12 Jun 2021   Prob (F-statistic):                        1.00
Time:                        18:13:15   Log-Likelihood:                         -1834.7
No. Observations:                2415   AIC:                                      3679.
Df Residuals:                    2410   BIC:                                      3708.
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [124]:
wave6 = pd.read_excel("Wave_6.xlsx").dropna()
wave6.head()

Unnamed: 0,Year of survey,Q260: Sex,Age interval,Q173: Religious person,Q275R: Highest educational level,Q288R: Income level
0,2011,2,2,1,3,7
1,2011,1,2,1,9,1
2,2011,1,3,1,9,2
3,2011,2,1,1,9,9
4,2011,2,3,1,3,4


In [125]:
columns = wave6.columns

for i in columns:
    print(i, wave6[i].unique())

Year of survey [2011]
Q260: Sex [2 1]
Age interval [2 3 1]
Q173: Religious person [ 1  2 -2 -1  3]
Q275R: Highest educational level [3 9 6 7 8 4 5 1 2]
Q288R: Income level [ 7  1  2  9  4  5  3  6  8 -1 -2 10]


In [126]:
wave6['Female'] = pd.get_dummies(wave6['Q260: Sex']).drop(columns=1).rename(columns={2: 'Female'})

wave6['Age interval'] = wave6['Age interval'].replace({2:1, 3:0})
wave6['Below 50'] = pd.get_dummies(wave6['Age interval']).drop(columns=0).rename(columns={0: 'Below 50'})

wave6['Q173: Religious person'] = wave6['Q173: Religious person'].replace({2:0, 3:0, -2:0, -1:0})
wave6['Religious'] = pd.get_dummies(wave6['Q173: Religious person']).drop(columns=0).rename(columns={1: 'Religious'})

wave6['Q275R: Highest educational level'] = wave6['Q275R: Highest educational level'].replace({2:1, 3:1, 4:1, 5:1, 6:2, 7:2, 8:3, 9:3})
wave6[['Lower', 'Middle', 'Higher']] = pd.get_dummies(wave6['Q275R: Highest educational level']).rename(columns = {1: 'Lower', 
                                                                                                                   2: 'Middle', 
                                                                                                                   3: 'Higher'})

wave6['Q288R: Income level'] = wave6['Q288R: Income level'].replace({-2:1, -1:2, 2:1, 3:1, 4:1, 5:2, 6:2, 7:2, 8:2, 9:3, 10:3})
wave6['Wealthy'] = pd.get_dummies(wave6['Q288R: Income level']).drop(columns=[1 , 2]).rename(columns={3: 'Wealthy'})

In [141]:
new_wave6 = wave6[["Year of survey", "Female", "Below 50", "Religious",  "Lower", "Middle", "Higher", "Wealthy"]]
new_wave6.head()

Unnamed: 0,Year of survey,Female,Below 50,Religious,Lower,Middle,Higher,Wealthy
0,2011,1,1,1,1,0,0,0
1,2011,0,1,1,0,0,1,0
2,2011,0,0,1,0,0,1,0
3,2011,1,1,1,0,0,1,1
4,2011,1,0,1,1,0,0,0


In [142]:
last_two_waves = pd.concat(objs = [new_wave7, new_wave6], ignore_index = True)
last_two_waves.head()

Unnamed: 0,Year of survey,Female,Below 50,Religious,Lower,Middle,Higher,Wealthy
0,2018,1,1,0,0,0,1,0
1,2018,0,1,0,1,0,0,0
2,2018,0,0,1,1,0,0,0
3,2018,1,1,0,0,0,1,1
4,2018,1,1,0,0,0,1,0


In [143]:
last_two_waves['is_2018'] = pd.get_dummies(last_two_waves['Year of survey']).drop(columns=2011).rename(columns={2018: 'is_2018'})

In [146]:
last_two_waves # appended dataframe

Unnamed: 0,Year of survey,Female,Below 50,Religious,Lower,Middle,Higher,Wealthy,is_2018
0,2018,1,1,0,0,0,1,0,1
1,2018,0,1,0,1,0,0,0,1
2,2018,0,0,1,1,0,0,0,1
3,2018,1,1,0,0,0,1,1,1
4,2018,1,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...
4015,2011,1,1,1,0,1,0,0,0
4016,2011,0,1,1,0,1,0,0,0
4017,2011,0,1,0,0,1,0,0,0
4018,2011,0,1,0,0,1,0,0,0


In [144]:
y = last_two_waves['Religious']
X = last_two_waves[['is_2018', 'Female', 'Below 50', 'Lower', 'Higher', 'Wealthy']]

model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:              Religious   R-squared (uncentered):                 -10.162
Model:                            OLS   Adj. R-squared (uncentered):            -10.179
Method:                 Least Squares   F-statistic:                             -609.1
Date:                Sat, 12 Jun 2021   Prob (F-statistic):                        1.00
Time:                        18:21:18   Log-Likelihood:                         -2939.1
No. Observations:                4020   AIC:                                      5890.
Df Residuals:                    4014   BIC:                                      5928.
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------