In [None]:
import os
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
import seaborn as sns
# Disable pandas' chained-assignment check for the rest of the notebook
pd.options.mode.chained_assignment = None

# Load the survey extract; categorical columns stay as their raw codes
df4 = pd.read_stata(
    'GSS2004.dta',
    convert_categoricals=False,
    convert_missing=False,
)
df4

In [None]:
# Drop respondents with a missing answer in any of the screening columns.
# NOTE(review): this screens on 'rincome' and 'educ1', while the cells below
# analyse 'rincom98' and 'educ' -- confirm these are the intended columns.
required_cols = ['rincome', 'spkhome1', 'born', 'parborn', 'granborn', 'educ1', 'citizen']
df4 = df4.dropna(subset=required_cols)
df4.describe()

### INCOME

In [None]:
# Summary statistics of the income variable used throughout the notebook
df4.loc[:, 'rincom98'].describe()

### IMMIGRANT
Born in US(0), Born outside US(1)

In [None]:
# Immigrant indicator: 0 where born == 1 (born in the US), 1 for every other code
df4['imm'] = (df4['born'] != 1).astype(int)
df4['imm'].describe()

In [None]:
# Counts of non-immigrants (0) vs. immigrants (1)
sns.histplot(df4['imm'])

In [None]:
# OLS of income on the immigrant indicator
imm_income = (
    sm.ols("rincom98 ~ imm", data=df4)
    .fit()
)
imm_income.summary()

**Little correlation between immigration status and personal income**

In [None]:
# Mean income per immigrant group with the fitted regression line
sns.lmplot(
    data=df4,
    x="imm",
    y="rincom98",
    x_estimator=np.mean,
).set(xlabel='immigrant', ylabel='income')

In [None]:
# Income distribution for non-immigrants vs. immigrants
sns.boxplot(
    data=df4,
    x="imm",
    y="rincom98",
).set(xlabel='immigrant', ylabel='income')

In [None]:
# Mean income per immigrant group with capped error bars.
# The previous cell referenced an undefined `tips` frame and passed `capsize`
# (a barplot option) to boxplot, so it could not run on a fresh kernel.
sns.barplot(x="imm", y="rincom98", data=df4, capsize=.2).set(xlabel='immigrant', ylabel='income')


### GENERATION: 
First_gen(1), Second_gen(2), Native(3)

In [None]:
#function to sort population into "generations"
def label_gen(row):
    """Classify one respondent row into a generation code.

    Parameters
    ----------
    row : mapping
        Must provide the keys 'born' and 'parborn'.

    Returns
    -------
    int
        1 (first generation)  -- born == 2 and parborn in {1, 2, 3, 8}
        2 (second generation) -- born == 1 and parborn in {1, 2, 3, 8}
        3 (native)            -- every other combination

    Notes
    -----
    born == 1 is treated as "born in the US" elsewhere in this notebook.
    NOTE(review): the parborn codes {1, 2, 3, 8} are presumably the
    "at least one parent born abroad" answers -- confirm against the codebook.
    Respondents with born == 2 whose parborn is outside that set also fall
    through to 3; verify that this is intended.
    """
    has_foreign_parent = row['parborn'] in (1, 2, 3, 8)
    if has_foreign_parent:
        if row['born'] == 2:
            return 1
        if row['born'] == 1:
            return 2
    # Everything else is labelled native
    return 3

In [None]:
# Label every respondent with a generation code and show the distribution
df4['gen'] = df4.apply(label_gen, axis=1).astype(int)
sns.histplot(df4['gen'])

In [None]:
# OLS of income on the generation code.
# NOTE(review): this reuses the name `imm_income` from the immigrant
# regression above and overwrites that result -- consider a distinct name.
imm_income = (
    sm.ols("rincom98 ~ gen", data=df4)
    .fit()
)
imm_income.summary()

In [None]:
# Mean income per generation code with the fitted regression line
sns.lmplot(
    data=df4,
    x="gen",
    y="rincom98",
    x_estimator=np.mean,
).set(xlabel='generation', ylabel='income')


### ENGLISH:
Speaks English at home (1), speaks another language at home (0)

In [None]:
# English-at-home indicator: 1 where spkhome1 == 1, otherwise 0
df4['engl'] = (df4['spkhome1'] == 1).astype(int)
df4['engl'].describe()

In [None]:
# Counts of other-language (0) vs. English-at-home (1) respondents
sns.histplot(df4['engl'])

In [None]:
# OLS of income on the English-at-home indicator
income_lang = (
    sm.ols("rincom98 ~ engl", data=df4)
    .fit()
)
income_lang.summary()

In [None]:
# Mean income for each English-at-home group with the fitted regression line
sns.lmplot(
    data=df4,
    x='engl',
    y='rincom98',
    x_estimator=np.mean,
).set(xlabel='english', ylabel='income')

In [None]:
# Income distribution by English-at-home indicator
sns.boxplot(
    data=df4,
    x='engl',
    y='rincom98',
).set(xlabel='english', ylabel='income')

**Some correlation between speaking English at home and personal income**

In [None]:
# OLS of the English-at-home indicator on the immigrant indicator
imm_lang = (
    sm.ols("engl ~ imm", data=df4)
    .fit()
)
imm_lang.summary()

In [None]:
# Share of immigrants within each English-at-home group, with fitted line
sns.lmplot(
    data=df4,
    x='engl',
    y='imm',
    x_estimator=np.mean,
).set(xlabel='english', ylabel='immigrant')

In [None]:
# English-at-home indicator split by immigrant status.
# `data=df4` is required: without it seaborn cannot resolve the column names.
sns.boxplot(x='imm', y='engl', data=df4).set(xlabel='immigrant', ylabel='english')

In [None]:
# Respondents who speak English at home (engl == 1).
# The earlier filter used `!= 1`, which selected the opposite group from the
# one the variable name describes.
engl_spkrs = df4[df4['engl'] == 1]
engl_spkrs.describe()

### CITIZENSHIP:
Non-Citizen(0), Citizen(1)

In [None]:
# Citizenship indicator: 1 where citizen == 1, otherwise 0
df4['citzn'] = np.where(df4['citizen'] == 1, 1, 0)
df4['citzn'].describe()

In [None]:
# Counts of non-citizens (0) vs. citizens (1)
sns.histplot(df4['citzn'])

In [None]:
# Subset of respondents flagged as citizens
citizens = df4.loc[df4['citzn'] == 1]
citizens.describe()

In [None]:
# OLS of income on the citizenship indicator
cit_income = (
    sm.ols('rincom98 ~ citzn', data=df4)
    .fit()
)
cit_income.summary()

In [None]:
# Mean income per citizenship group with the fitted regression line
sns.lmplot(
    data=df4,
    x='citzn',
    y='rincom98',
    x_estimator=np.mean,
).set(xlabel='citizen', ylabel='income')

In [None]:
# Income distribution for non-citizens vs. citizens
sns.boxplot(
    data=df4,
    x='citzn',
    y='rincom98',
).set(xlabel='citizen', ylabel='income')

In [None]:
# OLS of the citizenship indicator on the immigrant indicator
citzn_imm = (
    sm.ols("citzn ~ imm", data=df4)
    .fit()
)
citzn_imm.summary()

In [None]:
# Share of citizens within each immigrant group, with the fitted line.
# x/y now match both the axis labels and the `citzn ~ imm` regression; the
# earlier call plotted citzn on x while labelling that axis "immigrant".
sns.lmplot(x="imm", y="citzn", data=df4, x_estimator=np.mean).set(xlabel='immigrant', ylabel='citizen')

In [None]:
# Subset of respondents flagged as immigrants
imm_group = df4.loc[df4['imm'] == 1]


### EDUCATION:
Educational attainment by years in school

In [None]:
# Summary statistics of the education variable
df4.loc[:, 'educ'].describe()

In [None]:
# Distribution of educational attainment
sns.histplot(df4['educ']).set(xlabel='education')

In [None]:
# OLS of income on education
educ_income = (
    sm.ols('rincom98 ~ educ', data=df4)
    .fit()
)
educ_income.summary()

In [None]:
# Mean income at each education level with the fitted regression line
sns.lmplot(
    data=df4,
    x="educ",
    y="rincom98",
    x_estimator=np.mean,
).set(xlabel='education', ylabel='income')

In [None]:
# Mean education per immigrant group with the fitted regression line
sns.lmplot(
    data=df4,
    x="imm",
    y="educ",
    x_estimator=np.mean,
).set(xlabel='immigrant', ylabel='education')

In [None]:
# Income ~ education, restricted to respondents with spkhome1 == 1
engl_spkrs = df4.loc[df4['spkhome1'] == 1]
engl_edu_incm = (
    sm.ols('rincom98 ~ educ', data=engl_spkrs)
    .fit()
)
engl_edu_incm.summary()

In [None]:
# Education vs. income for the English-at-home subset
sns.lmplot(
    data=engl_spkrs,
    x='educ',
    y='rincom98',
    x_estimator=np.mean,
).set(xlabel='Education(English=1)', ylabel='income')

In [None]:
# Income ~ education, restricted to respondents with spkhome1 != 1.
# Mirrors the English-speaker regression; the cell previously only drew an
# empty `sns.histplot()` and never fitted the model its comment describes.
non_engl_spkrs = df4[df4['spkhome1'] != 1]
non_engl_edu_incm = sm.ols(formula='rincom98 ~ educ', data=non_engl_spkrs).fit()
non_engl_edu_incm.summary()

In [None]:
# Education vs. income for respondents who do not speak English at home.
# Axis labels added so the figure matches its English-speaker counterpart.
sns.lmplot(x='educ', y='rincom98', data=non_engl_spkrs, x_estimator=np.mean).set(xlabel='Education(English=0)', ylabel='income')

### MAIN REGRESSION: 
Income as a function of Education, English, Citizen, Immigrant

In [None]:
# Regression of income on four variables: education, English at home,
# citizenship and immigrant status
regr = (
    sm.ols("rincom98 ~ educ + engl + citzn + imm", data=df4)
    .fit()
)
regr.summary()

In [None]:
# Respondents with rincom98 <= 11, and their citizenship summary.
# NOTE(review): 11 is presumably the bottom-quartile cut-off -- confirm
# against the rincom98 summary statistics.
df5 = df4.loc[df4['rincom98'] <= 11]
df5['citzn'].describe()

In [None]:
# Respondents with rincom98 >= 19, and their citizenship summary.
# NOTE(review): 19 looks like an upper-income cut-off -- confirm the intent.
df6 = df4.loc[df4['rincom98'] >= 19]
df6['citzn'].describe()

In [None]:
# Split the sample into immigrants (imm == 1) and everyone else
imm = df4.loc[df4['imm'] == 1]
non_imm = df4.loc[df4['imm'] != 1]

In [None]:
# Summary statistics for the non-immigrant subset
non_imm.describe()

In [None]:
# Income bracket distribution, immigrants vs. natives (each group normalised separately)
sns.histplot(
    data=df4,
    x="rincom98",
    hue="imm",
    binwidth=1,
    stat="density",
    common_norm=False,
).set(xlabel='income', ylabel='Density')

In [None]:
# Income bracket distribution by English-at-home indicator (each group normalised separately)
sns.histplot(
    data=df4,
    x="rincom98",
    hue="engl",
    binwidth=1,
    stat="density",
    common_norm=False,
).set(xlabel='income', ylabel='Density')

In [None]:
# Educational attainment distribution, immigrants vs. natives (each group normalised separately)
sns.histplot(
    data=df4,
    x="educ",
    hue="imm",
    binwidth=1,
    stat="density",
    common_norm=False,
).set(xlabel='education', ylabel='Density')

In [None]:
# Income bracket distribution, citizens vs. non-citizens (each group normalised separately)
sns.histplot(
    data=df4,
    x="rincom98",
    hue="citzn",
    binwidth=1,
    stat="density",
    common_norm=False,
).set(xlabel='income', ylabel='Density')