In [None]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from lec_utils import * 
# from save_data import * 

# Your Title Here

**Name(s)**: Pheobe Yi and Omkar nayak 

**Website Link**: In progress 

## Step 1: Introduction

In [8]:
base_path = Path("/Users/yipho/eecs398/portfolio/datanew")
output_path = Path("/Users/yipho/eecs398/portfolio/rawdata") 
output_path.mkdir(parents=True, exist_ok=True) 

years_to_process = range(2000, 2024)

def load_and_save_icpsr_data(base_path, output_path, years_to_process):
    for year in years_to_process:
        print(f"Processing year {year}...")

        icpsr_folders = list(base_path.glob(f"ICPSR_*{year}")) 
        print(icpsr_folders)
        
        if not icpsr_folders:
            print(f"No ICPSR folder found for year {year}. Skipping...")
            continue

        year_data = []  

        for folder in icpsr_folders:
            study_number = folder.name.split("_")[1].split("-")[0]  

            form1_path = folder / f"DS0001/{study_number}-0001-Data.dta"
            form6_path = folder / f"DS0006/{study_number}-0006-Data.dta"

            for form_path, form_name in zip([form1_path, form6_path], ["Form 1", "Form 5"]):
                if form_path.exists():
                    print(f"Found {form_name} data for year {year}: {form_path}")
                    try:
                        df = pd.read_stata(form_path)

                        df["Year"] = year
                        df["Form"] = form_name

                        year_data.append(df)
                    except Exception as e:
                        print(f"Error loading {form_path}: {e}")
                else:
                    print(f"{form_name} data not found for year {year}.")


        if year_data:
            year_df = pd.concat(year_data, axis=0)
            output_file = output_path / f"ICPSR_data_{year}.csv"
            year_df.to_csv(output_file, index=False)
            print(f"Saved data for year {year} to {output_file}")
        else:
            print(f"No data found for year {year}.")

load_and_save_icpsr_data(base_path, output_path, years_to_process)

Processing year 2000...
[PosixPath('/Users/yipho/eecs398/portfolio/datanew/ICPSR_03184-2000')]
Found Form 1 data for year 2000: /Users/yipho/eecs398/portfolio/datanew/ICPSR_03184-2000/DS0001/03184-0001-Data.dta
Found Form 5 data for year 2000: /Users/yipho/eecs398/portfolio/datanew/ICPSR_03184-2000/DS0006/03184-0006-Data.dta
Saved data for year 2000 to /Users/yipho/eecs398/portfolio/rawdata/ICPSR_data_2000.csv
Processing year 2001...
[]
No ICPSR folder found for year 2001. Skipping...
Processing year 2002...
[]
No ICPSR folder found for year 2002. Skipping...
Processing year 2003...
[]
No ICPSR folder found for year 2003. Skipping...
Processing year 2004...
[]
No ICPSR folder found for year 2004. Skipping...
Processing year 2005...
[]
No ICPSR folder found for year 2005. Skipping...
Processing year 2006...
[]
No ICPSR folder found for year 2006. Skipping...
Processing year 2007...
[]
No ICPSR folder found for year 2007. Skipping...
Processing year 2008...
[]
No ICPSR folder found for y

In [3]:
# read in the dataset and perform the basic understanding of the dataset
df1 =  pd.read_stata("datanew/ICPSR_39172-2023/DS0001/39172-0001-Data.dta")
df6 =  pd.read_stata("datanew/ICPSR_39172-2023/DS0006/39172-0006-Data.dta")

In [4]:
# concat stuff together
dfcat = df1.merge(df6, on='RESPONDENT_ID', how='inner')


## Step 2: Data Cleaning and Exploratory Data Analysis

In [5]:
cols_interest = ['RESPONDENT_ID', 'V1_x', 'V2150','V49_x', 'V2167', 'V2157', 'V2155', 'V2156', 'V5313', 'V5321']
dfmain = dfcat[cols_interest]
dfmain.head()

Unnamed: 0,RESPONDENT_ID,V1_x,V2150,V49_x,...,V2155,V2156,V5313,V5321
0,50001,2023,FEMALE:(2),THREE+:(3),...,MARKED:(1),MARKED:(1),AGREE:(5),AGREE:(5)
1,50002,2023,FEMALE:(2),THREE+:(3),...,NT MARKD:(0),MARKED:(1),AGREE:(5),MOST AGR:(4)
2,50003,2023,MALE:(1),TWO:(2),...,NT MARKD:(0),MARKED:(1),AGREE:(5),AGREE:(5)
3,50004,2023,MALE:(1),TWO:(2),...,MARKED:(1),MARKED:(1),MOST AGR:(4),DISAGREE:(1)
4,50005,2023,MALE:(1),THREE+:(3),...,MARKED:(1),MARKED:(1),NEITHER:(3),DISAGREE:(1)


In [6]:
cols_clean = ['V49_x','V2150', 'V2167', 'V2157', 'V2155', 'V2156', 'V5313', 'V5321']

def extract_number(column):
    pattern = r".+:\s*\((\d+)\)"
    return column.apply(lambda x: int(re.match(pattern, str(x)).group(1)) if re.match(pattern, str(x)) else None)

In [7]:
for col in cols_clean:
    dfmain[col] = extract_number(dfcat[col])

dfmain.head()

Unnamed: 0,RESPONDENT_ID,V1_x,V2150,V49_x,...,V2155,V2156,V5313,V5321
0,50001,2023,2.0,3.0,...,1.0,1.0,5.0,5.0
1,50002,2023,2.0,3.0,...,0.0,1.0,5.0,4.0
2,50003,2023,1.0,2.0,...,0.0,1.0,5.0,5.0
3,50004,2023,1.0,2.0,...,1.0,1.0,4.0,1.0
4,50005,2023,1.0,3.0,...,1.0,1.0,3.0,1.0


In [8]:
#drop 6,8,-9 for 2167 Pol leaning 
dfmain = dfmain[dfmain['V2167'] != 6]
dfmain = dfmain[dfmain['V2167'] != 8]
dfmain = dfmain[dfmain['V2167'] != -9]
#drop nan
dfmain = dfmain.dropna(subset=['V2167'])
#rescale and regularize 
dfmain['V2167'] = dfmain['V2167'] - 1

In [9]:
# for sex, drop -9, 3 and 4 (missing other and refused to answer)
dfmain = dfmain[dfmain['V2150'] != -9]
dfmain = dfmain[dfmain['V2150'] != 4]
dfmain = dfmain[dfmain['V2150'] != 3]

dfmain = dfmain.dropna(subset=['V2150'])

dfmain['V2150'] = dfmain['V2150'] - 1
# 0 for male, 1 for female now 
dfmain['V2150'].value_counts()

V2150
1.0    370
0.0    319
Name: count, dtype: int64

In [10]:
pol_blfs = ["Very Conservative", "Conservative", "Moderate", "Liberal", "Very Liberal"]
counts = dfmain['V2167'].value_counts().sort_index()


colors = ['red', 'pink', 'darkgray', 'lightblue', 'blue']


fig = px.bar(
    x=counts.index,
    y=counts.values,
    title='Political Beliefs Distribution',
    labels={'x': 'Political Beliefs', 'y': 'Count'},
    color=counts.index, 
    color_discrete_sequence=colors  
)

# Update x-axis labels
fig.update_layout(
    xaxis_title="Political Beliefs",
    yaxis_title="Count",
    xaxis=dict(
        tickmode='array',
        tickvals=counts.index,
        ticktext=pol_blfs
    )
)
# Show the chart
fig.show()


In [11]:
print("fuck this man why isn't it working")

fuck this man why isn't it working


## Step 3: Framing a Prediction Problem

## Step 4: Baseline Model

In [12]:
# TODO

## Step 5: Final Model

In [13]:
# TODO