In [1]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from lec_utils import * 
# from save_data import * 

# Your Title Here

**Name(s)**: Phoebe Yi and Omkar Nayak

**Website Link**: In progress 

## Step 1: Introduction

In [None]:
# Where the downloaded ICPSR study folders live, and where the per-year CSVs are written.
# NOTE(review): both are absolute paths on one machine — anyone else running this
# notebook has to edit them; consider deriving them from a configurable data directory.
base_path = Path("/Users/yipho/eecs398/portfolio/datanew")
output_path = Path("/Users/yipho/eecs398/portfolio/rawdata")

# Make sure the output directory exists (creating parents as needed; no error if present).
output_path.mkdir(exist_ok=True, parents=True)

# Survey years to scan: 2000 through 2023 (the range end is exclusive).
years_to_process = range(2000, 2024)

def rename_case_id_to_respondent_id(df):
    """Rename the ``CASEID`` column to ``RESPONDENT_ID`` if it is present.

    The rename is done in place, so the frame passed in is itself modified;
    the same object is returned for convenience. A frame without a ``CASEID``
    column is returned untouched.
    """
    if "CASEID" not in df.columns:
        return df

    df.rename(columns={"CASEID": "RESPONDENT_ID"}, inplace=True)
    return df

def _merge_form1_and_form6(folder, year):
    """Load Form 1 and Form 6 from one ICPSR study folder and inner-join them.

    Returns the merged DataFrame with a ``Year`` column added, or ``None`` when
    either Stata file is missing, the ``RESPONDENT_ID`` key is absent from one
    of the forms, or loading/merging raises (the reason is printed each time).
    """
    # Folder names look like "ICPSR_<study>-<year>"; the data files are named after <study>.
    study_number = folder.name.split("_")[1].split("-")[0]

    form1_path = folder / f"DS0001/{study_number}-0001-Data.dta"
    form6_path = folder / f"DS0006/{study_number}-0006-Data.dta"

    # Both forms are required; report whichever one is absent and give up on this folder.
    if not (form1_path.exists() and form6_path.exists()):
        if not form1_path.exists():
            print(f"Form 1 data not found for year {year}: {form1_path}")
        if not form6_path.exists():
            print(f"Form 6 data not found for year {year}: {form6_path}")
        return None

    # Best-effort: a folder that fails to load or merge is reported and skipped
    # so that the remaining folders and years are still processed.
    try:
        df1 = pd.read_stata(form1_path)
        print(f"Loaded Form 1 with shape: {df1.shape}")
        df1 = rename_case_id_to_respondent_id(df1)

        df6 = pd.read_stata(form6_path)
        print(f"Loaded Form 6 with shape: {df6.shape}")
        df6 = rename_case_id_to_respondent_id(df6)

        if "RESPONDENT_ID" not in df1.columns or "RESPONDENT_ID" not in df6.columns:
            print(f"'RESPONDENT_ID' column missing in Form 1 or Form 6 for year {year}. Skipping merge.")
            return None

        # Inner join keeps only respondents present in both forms. Columns that
        # exist in both frames get pandas' default "_x"/"_y" suffixes.
        df_merged = df1.merge(df6, on="RESPONDENT_ID", how="inner")
        print(f"Merged data shape: {df_merged.shape}")

        df_merged["Year"] = year
        return df_merged
    except Exception as e:
        print(f"Error processing Form 1 and Form 6 for {year}: {e}")
        return None


def load_and_save_icpsr_data(base_path, output_path, years_to_process):
    """Merge Form 1 and Form 6 of each year's ICPSR study and save one CSV per year.

    For every year, folders under ``base_path`` matching ``ICPSR_*<year>`` are
    scanned. Each folder's Form 1 and Form 6 Stata files are inner-joined on
    ``RESPONDENT_ID``; the merged frames for the year are concatenated and
    written to ``output_path / "ICPSR_data_<year>.csv"`` without the index.
    Years with no folder or nothing to merge are reported and skipped.

    Parameters
    ----------
    base_path : str or pathlib.Path
        Directory containing the ICPSR study folders.
    output_path : str or pathlib.Path
        Directory for the per-year CSV files; created if it does not exist.
    years_to_process : iterable of int
        Years to look for.

    Returns
    -------
    None
        Progress is reported with ``print``; results are written to disk.
    """
    base_path = Path(base_path)
    output_path = Path(output_path)
    # Do not depend on another cell having created the output directory.
    output_path.mkdir(parents=True, exist_ok=True)

    for year in years_to_process:
        print(f"Processing year {year}...")

        # Sorted so that, when several folders match one year, the row order
        # of the saved CSV does not depend on filesystem listing order.
        icpsr_folders = sorted(base_path.glob(f"ICPSR_*{year}"))
        print(icpsr_folders)

        if not icpsr_folders:
            print(f"No ICPSR folder found for year {year}. Skipping...")
            continue

        merged_data = []
        for folder in icpsr_folders:
            df_merged = _merge_form1_and_form6(folder, year)
            if df_merged is not None:
                merged_data.append(df_merged)

        if merged_data:
            year_df = pd.concat(merged_data, axis=0)
            output_file = output_path / f"ICPSR_data_{year}.csv"
            year_df.to_csv(output_file, index=False)
            print(f"Saved merged data for year {year} to {output_file}")
        else:
            print(f"No merged data found for year {year}.")


# Build the per-year merged CSVs under output_path for every year in years_to_process.
# This reads the Stata files from disk, so it only needs re-running when the raw data changes.
load_and_save_icpsr_data(base_path, output_path, years_to_process)
#god bless Kerby Shedden 

Processing year 2000...
[PosixPath('/Users/yipho/eecs398/portfolio/datanew/ICPSR_03184-2000')]
Found Form 1 and Form 6 data for year 2000.
Loaded Form 1 with shape: (13286, 108)
Renamed 'CASEID' to 'RESPONDENT_ID'.
Loaded Form 6 with shape: (2197, 310)
Renamed 'CASEID' to 'RESPONDENT_ID'.
Merged data shape: (2197, 417)
Saved merged data for year 2000 to /Users/yipho/eecs398/portfolio/rawdata/ICPSR_data_2000.csv
Processing year 2001...
[]
No ICPSR folder found for year 2001. Skipping...
Processing year 2002...
[]
No ICPSR folder found for year 2002. Skipping...
Processing year 2003...
[]
No ICPSR folder found for year 2003. Skipping...
Processing year 2004...
[]
No ICPSR folder found for year 2004. Skipping...
Processing year 2005...
[]
No ICPSR folder found for year 2005. Skipping...
Processing year 2006...
[]
No ICPSR folder found for year 2006. Skipping...
Processing year 2007...
[]
No ICPSR folder found for year 2007. Skipping...
Processing year 2008...
[]
No ICPSR folder found for 

## Step 2: Data Cleaning and Exploratory Data Analysis

In [None]:
# Respondent id plus the survey variables kept for the analysis. The "_x" suffix
# marks a column that existed in both merged forms (pandas' default merge suffix).
cols_interest = ['RESPONDENT_ID', 'V1_x', 'V2150','V49_x', 'V2167', 'V2157', 'V2155', 'V2156', 'V5313', 'V5321']

# NOTE(review): `dfcat` is not created by any cell visible in this notebook —
# presumably the concatenation of the saved per-year CSVs; confirm it is defined
# before this cell so the notebook survives Restart & Run All.
# .copy() gives dfmain its own data, so the column assignments in later cells
# modify dfmain itself rather than a slice of dfcat (avoids SettingWithCopyWarning).
dfmain = dfcat[cols_interest].copy()
dfmain.head()

In [None]:
# Columns whose values are converted to numeric codes with extract_number.
cols_clean = ['V49_x','V2150', 'V2167', 'V2157', 'V2155', 'V2156', 'V5313', 'V5321']

def extract_number(column):
    """Extract the integer code from labelled values such as ``"LABEL:(3)"``.

    Every element is converted with ``str`` and matched from its start against
    the shape "some text, a colon, optional whitespace, digits in parentheses".

    Parameters
    ----------
    column : pandas.Series
        Values to parse; non-string elements are stringified first.

    Returns
    -------
    pandas.Series
        Same index as ``column``. Holds the parenthesised number as an ``int``
        where the value matched and a missing value (``None``) where it did not.

    Notes
    -----
    Only unsigned digits are accepted inside the parentheses, so a negative
    code such as ``"(-9)"`` does not match and comes back as missing.
    """
    # Compile once and match once per element; the pattern itself is unchanged.
    code_pattern = re.compile(r".+:\s*\((\d+)\)")

    def _code_or_none(value):
        found = code_pattern.match(str(value))
        return int(found.group(1)) if found else None

    return column.apply(_code_or_none)

In [None]:
# Replace every labelled column with its numeric code, always parsing from the
# raw values in dfcat so the cell gives the same result when re-run.
# assign() builds a new frame instead of writing column-by-column into dfmain,
# which avoids SettingWithCopyWarning when dfmain is a slice of dfcat.
dfmain = dfmain.assign(**{col: extract_number(dfcat[col]) for col in cols_clean})

dfmain.head()

In [None]:
# V2167 (political leaning): drop rows coded 6, 8 or -9 in a single pass ...
dfmain = dfmain[~dfmain['V2167'].isin([6, 8, -9])]
# ... and rows where the code is missing.
dfmain = dfmain.dropna(subset=['V2167'])
# Shift the remaining codes down by one so they start at 0.
# assign() returns a new frame, avoiding SettingWithCopyWarning on the filtered frame.
# NOTE: this cell is not idempotent — running it again subtracts 1 a second time.
dfmain = dfmain.assign(V2167=dfmain['V2167'] - 1)

In [None]:
# V2150 (sex): drop rows coded -9, 3 or 4 (per the original author's note:
# missing, other, and refused to answer) in a single pass ...
dfmain = dfmain[~dfmain['V2150'].isin([-9, 4, 3])]
# ... and rows where the code is missing.
dfmain = dfmain.dropna(subset=['V2150'])

# Shift the remaining codes down by one: 0 for male, 1 for female.
# assign() returns a new frame, avoiding SettingWithCopyWarning on the filtered frame.
# NOTE: this cell is not idempotent — running it again subtracts 1 a second time.
dfmain = dfmain.assign(V2150=dfmain['V2150'] - 1)
dfmain['V2150'].value_counts()

In [None]:
# Label and colour for each V2167 code after the shift to 0-4 (list position = code).
pol_blfs = ["Very Conservative", "Conservative", "Moderate", "Liberal", "Very Liberal"]
colors = ['red', 'pink', 'darkgray', 'lightblue', 'blue']

counts = dfmain['V2167'].value_counts().sort_index()

# Translate the numeric codes into their labels. Plotly Express treats a numeric
# `color` as a continuous scale and ignores discrete colours, so the bars must be
# coloured by the (string) label. Using the labels for x as well keeps every bar
# correctly named even if one category has no respondents.
belief_labels = [pol_blfs[int(code)] for code in counts.index]

fig = px.bar(
    x=belief_labels,
    y=counts.values,
    title='Political Beliefs Distribution',
    labels={'x': 'Political Beliefs', 'y': 'Count', 'color': 'Political Beliefs'},
    color=belief_labels,
    # Pin each label to its colour instead of relying on order of appearance.
    color_discrete_map=dict(zip(pol_blfs, colors)),
)

fig.update_layout(
    xaxis_title="Political Beliefs",
    yaxis_title="Count",
    # Keep the bars in conservative-to-liberal order.
    xaxis=dict(
        categoryorder='array',
        categoryarray=pol_blfs
    )
)
# Show the chart
fig.show()


In [None]:
# NOTE(review): leftover scratch cell — it only prints a message and does no analysis.
# TODO: delete this cell before the notebook is published.
print("fuck this man why isn't it working")

## Step 3: Framing a Prediction Problem

## Step 4: Baseline Model

In [None]:
# TODO

## Step 5: Final Model

In [None]:
# TODO