In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive inside the Colab runtime at /gdrive; the data-loading cell
# reads its CSV files from '/gdrive/My Drive/...'.
# force_remount=True is passed so the mount is redone even if /gdrive is already mounted.
from google.colab import drive
drive.mount('/gdrive',force_remount=True)

Mounted at /gdrive


Link to the GSS 2006 Codebook - https://www.thearda.com/Archive/Files/Codebooks/GSS2006_CB.asp

Link to the WVS Codebook- https://www.thearda.com/Archive/Files/Codebooks/WVS2010_CB.asp

In [None]:

# Load the two survey extracts from the mounted Google Drive folder.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
# NOTE(review): the leftover warning tail in this cell's output looks like a
# pandas DtypeWarning (mixed types) -- confirm; the price is higher peak memory
# while parsing.
gss_2006 = pd.read_csv('/gdrive/My Drive/Data Analysis GR5015/GSS_2006.csv', low_memory=False)
wvs_data = pd.read_csv('/gdrive/My Drive/Data Analysis GR5015/WVS.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


### Question 1
***1. Run a multiple linear probability model.  Tell me how you think your independent variables will affect your dependent variable.  Interpret your results.  Were your expectations correct?  Why or why not?***


The dependent variable is built from WVS variable V78 - "Looking after the environment is important to this person; to care for nature and save life resources". The independent variables are age and socio-economic status (ses).

#### **Observations & Interpretations**
- In the USA, the estimated probability of placing high importance on the environment falls as socio-economic status rises and as age rises. Neither relationship is conclusive, however, because neither coefficient is statistically significant.

- From the coefficient we can say that a one-unit (one-category) increase in X (Independent Variable), i.e. ses, on average decreases the probability of Y (Dependent Variable), i.e. placing **High** importance on Environment, by 0.0022 (0.22 percentage points) while keeping all other variables constant.

- From the coefficient we can say that a one-unit (one-year) increase in X (Independent Variable), i.e. age, on average decreases the probability of Y (Dependent Variable), i.e. placing **High** importance on Environment, by 0.0007 (0.07 percentage points) while keeping all other variables constant.

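- The intercept of 0.1818 tells us that when both ses and age are 0, the predicted probability that someone places **High** importance on Environment is 0.1818 (about 18.18%). Because ses only takes values from 1 to 10 and respondents are adults, this is an extrapolation outside the observed data rather than a meaningful estimate.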

- Neither coefficient is statistically significant: the p-values are 0.574 and 0.132, both well above 0.05 (the rule-of-thumb threshold for rejecting the NULL hypothesis), so we cannot reject the hypothesis that each coefficient is zero.

#### **My expectation and if it worked out or not**

My intuition was that in the USA, given the consumerist and capitalist nature of its economy and society, being wealthier would have either no relationship or a negative relationship with placing high importance on the environment. My intuition would be the opposite for European countries and a few Asian countries!

Nothing conclusive came out of this model, so I cannot compare my intuition with the outcome to conclude whether or not my expectation was correct.




In [None]:
# Recode the WVS columns used in this notebook.
# NOTE: this cell transforms columns in place (7 - environment_imp,
# 5 - conf_env_org, overwriting donated_eco_org), so it is NOT idempotent:
# re-run the data-loading cell before re-running this one.

# Renaming columns for convenience and to add context
wvs_data = wvs_data.rename(columns={
    'V2': 'country',
    'V242': 'age',
    'V57': 'marital',
    'V240': 'sex',
    'V78': 'environment_imp',      # is looking after the environment important
    'V122': 'conf_env_org',        # how much confidence do they have in environmental organizations
    'V82': 'donated_eco_org',      # have they donated to eco orgs
    'V239': 'ses',                 # where they place themselves in their country's income distribution
    'V32': 'mem_charitable_orgs',  # are they a member of charitable orgs
})

# Reverse-code so that a higher value means more importance / more confidence
wvs_data['environment_imp'] = 7 - wvs_data['environment_imp']
wvs_data['conf_env_org'] = 5 - wvs_data['conf_env_org']

# Every indicator below is 1 for the first condition, 0 for the second,
# and NaN when neither condition holds (e.g. a missing answer).
choices = [1, 0]

# Encoding Marital status variable: code 1 -> 1, any higher code -> 0
conditions = [
    wvs_data['marital'] == 1,
    wvs_data['marital'] > 1,
]
wvs_data['married'] = np.select(conditions, choices, default=np.nan)

# Encoding Sex variable: code 2 -> 1 (female), any other answered code -> 0.
# NaN != 2 evaluates to True, so without the notna() guard respondents with a
# missing sex would silently be coded 0 instead of NaN.
conditions = [
    wvs_data['sex'] == 2,
    wvs_data['sex'].notna() & (wvs_data['sex'] != 2),
]
wvs_data['female'] = np.select(conditions, choices, default=np.nan)

# Only keep observations that answered about their SES level. The explicit
# copy makes the new columns below land on an independent frame rather than
# on something pandas may treat as a slice of the original.
wvs_data = wvs_data.dropna(subset=['ses']).copy()

# Encoding Environmental Importance: reversed score >= 6 -> 1, lower -> 0.
# NOTE(review): the earlier comment here said both 'Like me' and 'Very much
# like me' should map to 1. If V78 is a 1-6 scale, 7 - x puts only the single
# strongest answer at 6, and covering the top two answers would need >= 5.
# The threshold is left unchanged so the reported regression results still
# hold -- confirm against the codebook.
conditions = [
    wvs_data['environment_imp'] >= 6,
    wvs_data['environment_imp'] < 6,
]
wvs_data['high_environment_imp'] = np.select(conditions, choices, default=np.nan)

# Encoding Donation to eco orgs: code 1 -> 1, any higher code -> 0
# (overwrites the raw column with the 0/1 indicator)
conditions = [
    wvs_data['donated_eco_org'] == 1,
    wvs_data['donated_eco_org'] > 1,
]
wvs_data['donated_eco_org'] = np.select(conditions, choices, default=np.nan)

# Encoding Active member of charitable orgs: code 2 -> 1, lower codes -> 0
conditions = [
    wvs_data['mem_charitable_orgs'] == 2,
    wvs_data['mem_charitable_orgs'] < 2,
]
wvs_data['active_mem_char_orgs'] = np.select(conditions, choices, default=np.nan)

In [None]:
# Frequency of each ses category among the remaining rows
wvs_data['ses'].value_counts()

5.0     15553
6.0     11300
4.0      9875
7.0      8808
3.0      8284
1.0      5167
2.0      5013
8.0      4965
9.0      1379
10.0     1081
Name: ses, dtype: int64

In [None]:
# Running a LPM model while subsetting for USA data 
lm1 = smf.ols(formula = 'high_environment_imp ~ ses + age', subset=((wvs_data['country']==840)), data = wvs_data).fit()
print (lm1.summary())

                             OLS Regression Results                             
Dep. Variable:     high_environment_imp   R-squared:                       0.001
Model:                              OLS   Adj. R-squared:                  0.000
Method:                   Least Squares   F-statistic:                     1.362
Date:                  Fri, 02 Apr 2021   Prob (F-statistic):              0.256
Time:                          20:33:33   Log-Likelihood:                -761.13
No. Observations:                  2157   AIC:                             1528.
Df Residuals:                      2154   BIC:                             1545.
Df Model:                             2                                         
Covariance Type:              nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.1818      0.029