In [1]:
import matplotlib.pyplot as plt
from math import exp
from scipy.stats import norm
from scipy import stats
import seaborn as sns
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline
# Notebook-wide display settings: do not truncate DataFrame columns or rows
# when a frame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Default size used for matplotlib figures created in this notebook.
plt.rcParams.update({"figure.figsize": [10, 5]})

# Suppress every warning for the remainder of the session.
import warnings
warnings.filterwarnings(action="ignore")

In [2]:
# Load the input CSV into `df`; the path is relative, so it is resolved
# against the notebook's current working directory.
df = pd.read_csv('unix_pny/ny_pop.csv')

In [3]:
def paul_pipeline(df):
    """Recode and impute person-level survey columns of ``df`` in place.

    Each step either forces a fixed code for rows below an age threshold
    (``AGEP``), derives a code from employment (``ESR``) or military
    (``MIL``) status, or replaces remaining NaN values with a sentinel code.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column read below (``AGEP``, ``CIT``, ``CITWP``,
        ``COW``, ``ESR``, ``MIL``, ``ST``, ...). The columns ``CIT_STATUS``,
        ``BORN_CIT`` and ``MIGSP`` are created (or overwritten).

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in; it is mutated, not copied.

    Notes
    -----
    Fixes relative to the previous version:

    * ``MARHT`` was imputed from ``MARHD`` (copy-paste slip), which turned it
      into a copy of ``MARHD``; it is now imputed from itself.
    * ``VPS`` used ``(MIL != 1) | (MIL != 2)``, which is true for every row,
      so the whole column became 16. It now requires MIL to be neither 1 nor
      2, and the age rule takes precedence (as for the ``MLP*`` columns).
    * The ``JWTR`` rule tested ``JWMNP.isna()`` after ``JWMNP`` had already
      been filled, so code 13 could never be assigned. The missing-value mask
      is now taken before ``JWMNP`` is imputed.
    """
    # AGEP, ESR and MIL are only read, never reassigned, so they can be
    # captured once and reused for every rule.
    age = df['AGEP']
    esr = df['ESR']
    mil = df['MIL']
    unemployed = esr == 3

    def _fill_missing(column, value):
        # Replace NaN in `column` with `value`; other entries are untouched.
        df[column] = np.where(df[column].isna(), value, df[column])

    def _set_below_age(column, min_age, value=0):
        # Force `value` for every row whose AGEP is below `min_age`.
        df[column] = np.where(age < min_age, value, df[column])

    # --- Citizenship -------------------------------------------------------
    df['CIT_STATUS'] = np.where(df['CIT'] != 5, 1, 0)
    df['BORN_CIT'] = np.where(
        df['CITWP'].isna() & (df['CIT'] != 4) & (df['CIT'] != 5), 1, 0)
    # NOTE(review): CIT_STATUS is always 0 or 1, so this replaces every CITWP
    # value with the inverse of CIT_STATUS -- confirm that is intended.
    df['CITWP'] = np.select(
        [df['CIT_STATUS'] == 1, df['CIT_STATUS'] == 0], [0, 1], df['CITWP'])

    # --- Class of worker ---------------------------------------------------
    df['COW'] = np.select(
        [age < 15, (age >= 15) & df['COW'].isna()], [0, 10], df['COW'])

    # --- Disability flags: 0 for children under 5 --------------------------
    for column in ('DDRS', 'DPHY', 'DREM'):
        df[column] = np.select([age < 5], [0], df[column])

    # --- English ability ---------------------------------------------------
    _set_below_age('ENG', 5, value=1)
    _fill_missing('ENG', 5)

    # --- Journey to work ---------------------------------------------------
    # Capture the mask BEFORE JWMNP is imputed: the JWTR rule below needs to
    # know which rows originally had no travel time.
    jwmnp_missing = df['JWMNP'].isna()
    df['JWMNP'] = np.select(
        [unemployed & jwmnp_missing, jwmnp_missing], [0, 201], df['JWMNP'])
    _fill_missing('JWRIP', 0)
    df['JWTR'] = np.select(
        [age < 15, unemployed & jwmnp_missing], [0, 13], df['JWTR'])
    _fill_missing('JWTR', 14)

    # --- Marital history: 0 under 15, then a per-column missing code -------
    for column, missing_code in (('MARHD', 3), ('MARHM', 3),
                                 ('MARHT', 4), ('MARHYP', 1)):
        _set_below_age(column, 15)
        _fill_missing(column, missing_code)

    # --- Mobility ----------------------------------------------------------
    _set_below_age('MIG', 1)

    # --- Military service periods: same rule for every MLP* column ---------
    mlp_conditions = [age < 17, mil == 4, mil == 3]
    mlp_codes = [2, 3, 4]
    for column in ('MLPCD', 'MLPE', 'MLPFG', 'MLPH', 'MLPI', 'MLPJ', 'MLPK'):
        df[column] = np.select(mlp_conditions, mlp_codes, df[column])

    # --- Looking for work --------------------------------------------------
    df['NWLK'] = np.select(
        [age < 16, (esr != 3) & esr.notna()], [0, 4], df['NWLK'])
    _fill_missing('NWLK', 5)

    _fill_missing('SCH', 0)
    df['DECADE'] = np.where(df['BORN_CIT'] == 1, 0, df['DECADE'])
    _fill_missing('ESP', 0)

    # --- Departure time for work -------------------------------------------
    # NOTE(review): every row with a known ESR other than 3 or 6 has its JWDP
    # replaced by 4, discarding the original value -- confirm intent.
    df['JWDP'] = np.select(
        [unemployed, (esr != 3) & esr.notna() & (esr != 6)], [0, 4],
        df['JWDP'])
    _fill_missing('JWDP', 0)

    # NOTE(review): MIGSP is rebuilt from ST for everyone aged 1+, so any
    # pre-existing MIGSP values are dropped -- confirm intent.
    df['MIGSP'] = np.where(age < 1, 0, df['ST'])
    _set_below_age('MSP', 15)
    _fill_missing('NOP', 0)
    _fill_missing('OC', 0)

    # --- Occupation --------------------------------------------------------
    _set_below_age('OCCP', 16)
    _fill_missing('OCCP', 1)

    _fill_missing('RC', 3)
    _fill_missing('SCIENGP', 0)

    # --- Veteran period of service -----------------------------------------
    # Under-17 rows get 0; rows whose MIL is neither 1 nor 2 (including a
    # missing MIL) get 16; everything else keeps its VPS value.
    df['VPS'] = np.select(
        [age < 17, (mil != 1) & (mil != 2)], [0, 16], df['VPS'])

    return df
    

In [5]:
# Run the cleaning/imputation steps through paul_pipeline rather than keeping
# a second, hand-copied version of its body in this cell, so the two cannot
# drift apart. paul_pipeline mutates the frame it receives and returns that
# same object, so rebinding `df` keeps the previous in-place semantics.
df = paul_pipeline(df)