In [20]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from lec_utils import * 
# from save_data import * 

# Your Title Here

**Name(s)**: Pheobe Yi and Omkar nayak 

**Website Link**: In progress 

## Step 1: Introduction

In [3]:
base_path = Path("/Users/yipho/eecs398/portfolio/allyears")
output_path = Path("/Users/yipho/eecs398/portfolio/raw_data") 
output_path.mkdir(parents=True, exist_ok=True) 

years_to_process = range(2000, 2024)

def rename_case_id_to_respondent_id(df):
    if "CASEID" in df.columns and "RESPONDENT_ID" not in df.columns:
        df.rename(columns={"CASEID": "RESPONDENT_ID"}, inplace=True)
    return df

def load_and_save_icpsr_data(base_path, output_path, years_to_process):
    for year in years_to_process:
        print(f"Processing year {year}...")

        icpsr_folders = list(base_path.glob(f"ICPSR_*{year}")) 
        print(icpsr_folders)
        
        if not icpsr_folders:
            print(f"No ICPSR folder found for year {year}. Skipping...")
            continue

        merged_data= []  

        for folder in icpsr_folders:
            study_number = folder.name.split("_")[1].split("-")[0]  

            form1_path = folder / f"DS0001/{study_number}-0001-Data.dta"
            form6_path = folder / f"DS0006/{study_number}-0006-Data.dta"

            # Load and merge Form 1 and Form 6 if both exist
            if form1_path.exists() and form6_path.exists():
                # print(f"Found Form 1 and Form 6 data for year {year}.")
                try:
                    # Load Form 1
                    df1 = pd.read_stata(form1_path)
                    print(f"Loaded Form 1 with shape: {df1.shape}")
                    df1 = rename_case_id_to_respondent_id(df1)

                    # Load Form 6
                    df6 = pd.read_stata(form6_path)
                    print(f"Loaded Form 6 with shape: {df6.shape}")
                    df6 = rename_case_id_to_respondent_id(df6)

  
                    if "RESPONDENT_ID" in df1.columns and "RESPONDENT_ID" in df6.columns:
                        df_merged = df1.merge(df6, on="RESPONDENT_ID", how="inner")
                        print(f"Merged data shape: {df_merged.shape}")


                        df_merged["Year"] = year

                        merged_data.append(df_merged)
                    else:
                        print(f"'RESPONDENT_ID' column missing in Form 1 or Form 6 for year {year}. Skipping merge.")
                except Exception as e:
                    print(f"Error processing Form 1 and Form 6 for {year}: {e}")
            else:
                if not form1_path.exists():
                    print(f"Form 1 data not found for year {year}: {form1_path}")
                if not form6_path.exists():
                    print(f"Form 6 data not found for year {year}: {form6_path}")

        if merged_data:
            year_df = pd.concat(merged_data, axis=0)  
            output_file = output_path / f"ICPSR_data_{year}.csv"
            year_df.to_csv(output_file, index=False)
            print(f"Saved merged data for year {year} to {output_file}")
        else:
            print(f"No merged data found for year {year}.")


load_and_save_icpsr_data(base_path, output_path, years_to_process)
#god bless Kerby Shedden 

Processing year 2000...
[PosixPath('/Users/yipho/eecs398/portfolio/allyears/ICPSR_03184-2000')]
Loaded Form 1 with shape: (13286, 108)
Loaded Form 6 with shape: (2197, 310)
Merged data shape: (2197, 417)
Saved merged data for year 2000 to /Users/yipho/eecs398/portfolio/raw_data/ICPSR_data_2000.csv
Processing year 2001...
[PosixPath('/Users/yipho/eecs398/portfolio/allyears/ICPSR_03425-2001')]
Loaded Form 1 with shape: (13304, 108)
Loaded Form 6 with shape: (2215, 311)
Merged data shape: (2215, 418)
Saved merged data for year 2001 to /Users/yipho/eecs398/portfolio/raw_data/ICPSR_data_2001.csv
Processing year 2002...
[PosixPath('/Users/yipho/eecs398/portfolio/allyears/ICPSR_03753-2002')]
Loaded Form 1 with shape: (13544, 108)
Loaded Form 6 with shape: (2257, 312)
Merged data shape: (2257, 419)
Saved merged data for year 2002 to /Users/yipho/eecs398/portfolio/raw_data/ICPSR_data_2002.csv
Processing year 2003...
[PosixPath('/Users/yipho/eecs398/portfolio/allyears/ICPSR_04019-2003')]
Loaded 

## Step 2: Data Cleaning and Exploratory Data Analysis

In [47]:
base_path = Path("/Users/macbook/Desktop/EECS389/398_mtf/raw_data")
output_path = Path("/Users/macbook/Desktop/EECS389/398_mtf/unprocessed_data") 
output_path.mkdir(parents=True, exist_ok=True)

variable_mapping = {
    "POL_BELIEFS": {
        (2000, 2023): "V5167",
    },
    "SEX" : {
        (2000, 2023): "V5150",
    },
    "NUM_SIBS": {
        (2000, 2023): "V49_x",
    }, 
    "BR_SR_inhouse": {
        (2000, 2011): "V157",
        (2012, 2023): "V2157",
    },
    "FATHR_PRES": {
        (2000, 2023): "V5155",
    },
    "MOTHR_PRES": {
        (2000, 2023): "V5156",
    },
    "LONELY": {
        (2000, 2023): "V5313", 
    },
    "WISH_MORE_FRNDS": {
        (2000, 2023): "V5321",  
    },
    "USLLY_FRNDS": {
        (2000, 2023): "V5324",  
    },
}

def get_variable_for_year(variable_name, year):
    for year_range, var in variable_mapping[variable_name].items():
        if year_range[0] <= year <= year_range[1]:
            return var
    return None

def rename_variables(df, year):
    renamed_columns = {}

    for logic_name, year_mapping in variable_mapping.items():
        column_name = get_variable_for_year(logic_name, year)
        if column_name and column_name in df.columns:
            renamed_columns[column_name] = logic_name

    df = df.rename(columns=renamed_columns)
    print(f"Renamed columns for year {year}: {renamed_columns}")
    return df

def clean_and_process_data(df, year):
    df = rename_variables(df, year)

    cols_interest = [
        "RESPONDENT_ID",
        "V1_x", 
        "SEX",  
        "POL_BELIEFS",  
        "NUM_SIBS", 
        "BR_SR_inhouse", 
        "FATHR_PRES",  
        "MOTHR_PRES",  
        "LONELY",  
        "WISH_MORE_FRNDS",  
        "USLLY_FRNDS", 
    ]

    cols_interest = [col for col in cols_interest if col in df.columns]  
    dfmain = df[cols_interest]

    # Clean variables where needed
    cols_clean = ["NUM_SIBS", "SEX", "POL_BELIEFS", "BR_SR_inhouse", "FATHR_PRES", "MOTHR_PRES", "LONELY", "WISH_MORE_FRNDS", "USLLY_FRNDS"]
    cols_clean = [col for col in cols_clean if col in df.columns]


    # def extract_number(column):
    #     pattern = r".+:\s*\((\d+)\)"
    #     return column.apply(lambda x: int(re.match(pattern, str(x)).group(1)) if re.match(pattern, str(x)) else None)
    
    # for col in cols_clean:
    #     dfmain[col] = extract_number(dfmain[col])
    def extract_number(column):
    
        pattern1 = r".+:\s*\((-?\d+)\)"  
        pattern2 = r"(\d+)"            

        def parse_value(value):
            value_str = str(value).strip()
            
            if re.match(pattern1, value_str):
                return int(re.match(pattern1, value_str).group(1))
            elif re.match(pattern2, value_str):
                return int(re.match(pattern2, value_str).group(1))
            else:
                return None

        return column.apply(parse_value)
    
    for col in cols_clean:
        dfmain[col] = extract_number(dfmain[col])
    
  
    if "POL_BELIEFS" in dfmain.columns:
        dfmain = dfmain.dropna(subset=["POL_BELIEFS"])

    # Drop invalid values for SEX
    if "SEX" in dfmain.columns:
        dfmain = dfmain.dropna(subset=["SEX"])

    if 'NUM_SIBS' in dfmain.columns:
        dfmain = dfmain.dropna(subset=['NUM_SIBS'])

    if 'BR_SR_inhouse' in dfmain.columns:
        dfmain = dfmain.dropna(subset=['BR_SR_inhouse'])

    if 'FATHR_PRES' in dfmain.columns:
        dfmain = dfmain.dropna(subset=['FATHR_PRES'])
    
    if 'MOTHR_PRES' in dfmain.columns:
        dfmain = dfmain.dropna(subset=['MOTHR_PRES'])

    #if lonely missing 
    if 'LONELY' in dfmain.columns:
        dfmain = dfmain.dropna(subset=['LONELY'])
    #if wish more friends missing
    if 'WISH_MORE_FRNDS' in dfmain.columns:
        dfmain = dfmain.dropna(subset=['WISH_MORE_FRNDS'])
    #if usually friends missing
    if 'USLLY_FRNDS' in dfmain.columns:
        dfmain = dfmain.dropna(subset=['USLLY_FRNDS'])

    return dfmain

# Process each CSV file based on year
def process_raw_data(base_path, output_path):
    for csv_file in base_path.glob("ICPSR_data_*.csv"):
        try:
            # Extract year from file name
            year = int(csv_file.stem.split("_")[-1])
            # print(f"Processing file for year {year}: {csv_file}")

            # Load data
            df = pd.read_csv(csv_file)
            # print(f"Loaded data with shape: {df.shape}")

            # Process data
            df_processed = clean_and_process_data(df, year)
            # print(f"Processed data shape: {df_processed.shape}")

            # Save processed data
            output_file = output_path / f"data_{year}.csv"
            df_processed.to_csv(output_file, index=False)
            # print(f"Saved processed data for year {year} to {output_file}")

        except Exception as e:
            print(f"Error processing file {csv_file}: {e}")


process_raw_data(base_path, output_path)

Renamed columns for year 2013: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2007: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2006: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2012: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2004: {'V5167': 'POL_BELIEFS', 'V

In [53]:
df23 = pd.read_csv("/unprocessed/data_2023.csv")
df23['POL_BELIEFS'].value_counts()

FileNotFoundError: [Errno 2] No such file or directory: '/unprocessed/data_2023.csv'

In [None]:
base_path = Path("/Users/yipho/eecs398/portfolio/raw_data")
output_path = Path("/Users/yipho/eecs398/portfolio/processed_data") 
output_path.mkdir(parents=True, exist_ok=True)

variable_mapping = {
    "POL_BELIEFS": {
        (2000, 2023): "V5167",
    },
    "SEX" : {
        (2000, 2023): "V5150",
    },
    "NUM_SIBS": {
        (2000, 2023): "V49_x",
    }, 
    "BR_SR_inhouse": {
        (2000, 2011): "V157",
        (2012, 2023): "V2157",
    },
    "FATHR_PRES": {
        (2000, 2023): "V5155",
    },
    "MOTHR_PRES": {
        (2000, 2023): "V5156",
    },
    "LONELY": {
        (2000, 2023): "V5313", 
    },
    "WISH_MORE_FRNDS": {
        (2000, 2023): "V5321",  
    },
    "USLLY_FRNDS": {
        (2000, 2023): "V5324",  
    },
}

def get_variable_for_year(variable_name, year):
    for year_range, var in variable_mapping[variable_name].items():
        if year_range[0] <= year <= year_range[1]:
            return var
    return None

def rename_variables(df, year):
    renamed_columns = {}

    for logic_name, year_mapping in variable_mapping.items():
        column_name = get_variable_for_year(logic_name, year)
        if column_name and column_name in df.columns:
            renamed_columns[column_name] = logic_name

    df = df.rename(columns=renamed_columns)
    print(f"Renamed columns for year {year}: {renamed_columns}")
    return df

def clean_and_process_data(df, year):
    df = rename_variables(df, year)

    cols_interest = [
        "RESPONDENT_ID",
        "V1_x", 
        "SEX",  
        "POL_BELIEFS",  
        "NUM_SIBS", 
        "BR_SR_inhouse", 
        "FATHR_PRES",  
        "MOTHR_PRES",  
        "LONELY",  
        "WISH_MORE_FRNDS",  
        "USLLY_FRNDS", 
    ]

    cols_interest = [col for col in cols_interest if col in df.columns]  
    dfmain = df[cols_interest]

    # Clean variables where needed
    cols_clean = ["NUM_SIBS", "SEX", "POL_BELIEFS", "BR_SR_inhouse", "FATHR_PRES", "MOTHR_PRES", "LONELY", "WISH_MORE_FRNDS", "USLLY_FRNDS"]
    cols_clean = [col for col in cols_clean if col in df.columns]


    # def extract_number(column):
    #     pattern = r".+:\s*\((\d+)\)"
    #     return column.apply(lambda x: int(re.match(pattern, str(x)).group(1)) if re.match(pattern, str(x)) else None)
    
    # for col in cols_clean:
    #     dfmain[col] = extract_number(dfmain[col])
    def extract_number(column):
    
        pattern1 = r".+:\s*\((\d+)\)"  
        pattern2 = r"-?\d+"            

        def parse_value(value):
            value_str = str(value).strip()
            
            if re.match(pattern1, value_str):
                return int(re.match(pattern1, value_str).group(1))
            elif re.match(pattern2, value_str):
                return int(re.match(pattern2, value_str).group(1))
            else:
                return None

        return column.apply(parse_value)
    
    for col in cols_clean:
        dfmain[col] = extract_number(dfmain[col])
    
  
    if "POL_BELIEFS" in dfmain.columns:
        dfmain = dfmain[dfmain["POL_BELIEFS"].isin([6, 8, -9]) == False]
        dfmain = dfmain.dropna(subset=["POL_BELIEFS"])
        dfmain["POL_BELIEFS"] = dfmain["POL_BELIEFS"] - 1  # Rescale to start from 0

    # Drop invalid values for SEX
    if "SEX" in dfmain.columns:
        dfmain = dfmain[dfmain["SEX"].isin([-9, 3, 4]) == False]
        dfmain = dfmain.dropna(subset=["SEX"])
        dfmain["SEX"] = dfmain["SEX"] - 1  # 0 for male, 1 for female

    if 'NUM_SIBS' in dfmain.columns:
        dfmain = dfmain.dropna(subset=['NUM_SIBS'])

    if 'BR_SR_inhouse' in dfmain.columns:
        dfmain = dfmain.dropna(subset=['BR_SR_inhouse'])

    if 'FATHR_PRES' in dfmain.columns:
        dfmain = dfmain.dropna(subset=['FATHR_PRES'])
    
    if 'MOTHR_PRES' in dfmain.columns:
        dfmain = dfmain.dropna(subset=['MOTHR_PRES'])

    #if lonely missing 
    if 'LONELY' in dfmain.columns:
        dfmain = dfmain.dropna(subset=['LONELY'])
        dfmain['LONELY'] = dfmain['LONELY'] - 1
    #if wish more friends missing
    if 'WISH_MORE_FRNDS' in dfmain.columns:
        dfmain = dfmain.dropna(subset=['WISH_MORE_FRNDS'])
        dfmain['WISH_MORE_FRNDS'] = dfmain['WISH_MORE_FRNDS'] - 1
    #if usually friends missing
    if 'USLLY_FRNDS' in dfmain.columns:
        dfmain = dfmain.dropna(subset=['USLLY_FRNDS'])
        dfmain['USLLY_FRNDS'] = dfmain['USLLY_FRNDS'] - 1

    return dfmain

# Process each CSV file based on year
def process_raw_data(base_path, output_path):
    for csv_file in base_path.glob("ICPSR_data_*.csv"):
        try:
            # Extract year from file name
            year = int(csv_file.stem.split("_")[-1])
            # print(f"Processing file for year {year}: {csv_file}")

            # Load data
            df = pd.read_csv(csv_file)
            # print(f"Loaded data with shape: {df.shape}")

            # Process data
            df_processed = clean_and_process_data(df, year)
            # print(f"Processed data shape: {df_processed.shape}")

            # Save processed data
            output_file = output_path / f"data_{year}.csv"
            df_processed.to_csv(output_file, index=False)
            # print(f"Saved processed data for year {year} to {output_file}")

        except Exception as e:
            print(f"Error processing file {csv_file}: {e}")


process_raw_data(base_path, output_path)

Renamed columns for year 2013: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2007: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2006: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2012: {'V5167': 'POL_BELIEFS', 'V5150': 'SEX', 'V49_x': 'NUM_SIBS', 'V2157': 'BR_SR_inhouse', 'V5155': 'FATHR_PRES', 'V5156': 'MOTHR_PRES', 'V5313': 'LONELY', 'V5321': 'WISH_MORE_FRNDS', 'V5324': 'USLLY_FRNDS'}
Renamed columns for year 2004: {'V5167': 'POL_BELIEFS', 'V

In [None]:
base_path = Path("/Users/yipho/eecs398/portfolio/processed_data")
output_path = Path("/Users/yipho/eecs398/portfolio/df_data") 
output_path.mkdir(parents=True, exist_ok=True)

def avg_loneliness(df, year): 
    #scaling from 0 to 1 
    #divide all values by 4
    df['LONELY'] = df['LONELY']/4
    #add all values together then divide by the number of values
    return df['LONELY'].mean()

def avg_pol_beliefs(df, year): 
    #scaling from 0 to 1
    #divide all values by 4
    df['POL_BELIEFS'] = df['POL_BELIEFS']/4
    #add all values together then divide by the number of values
    return df['POL_BELIEFS'].mean()


def avg_sibling_count(df, year): 
    return df["NUM_SIBS"].mean()

def avg_wish_frnds(df,year):
    df['WISH_MORE_FRNDS'] = df['WISH_MORE_FRNDS']/4
    return df['WISH_MORE_FRNDS'].mean()

def avg_uslly_frnds(df,year):
    df['USLLY_FRNDS'] = df['USLLY_FRNDS']/4
    return df['USLLY_FRNDS'].mean()

def avg_fathr_pres(df,year):
    return df['FATHR_PRES'].mean()

def avg_mothr_pres(df,year):
    return df['MOTHR_PRES'].mean()


#boolean to a numeric value for father and pres 


In [25]:
df23 = pd.read_csv("processed_data/data_2023.csv")
pivot_table = pd.crosstab(df23["LONELY"], df23["POL_BELIEFS"])
pivot_table

POL_BELIEFS,0.0,1.0,2.0,3.0,4.0
LONELY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,16,35,32,9,4
1.0,7,28,53,40,17
2.0,17,34,46,45,17
3.0,11,34,39,46,18
4.0,9,21,33,36,9


## SECOND PLOT, BIVAR ANALYSIS 1 

In [26]:
fig = px.density_heatmap(
    df23,
    x='POL_BELIEFS',
    y='LONELY',
    color_continuous_scale='Viridis',
    title="Bivariate Analysis of Political Beliefs and Loneliness (Heatmap Example)",
)
fig.show()
fig.write_html("/Users/macbook/Desktop/EECS389/398_mtf/plot2HM.html", include_plotlyjs='cdn')


FileNotFoundError: [Errno 2] No such file or directory: '/Users/macbook/Desktop/EECS389/398_mtf/plot2HM.html'

## THIRD PLOT, BIVAR ANALYSIS 2 

In [None]:
print(df23["USLLY_FRNDS"].value_counts())
print(df23.loc[df23['SEX'] == 1, 'USLLY_FRNDS'].value_counts())
print(df23.loc[df23['SEX'] == 0, 'USLLY_FRNDS'].value_counts())

USLLY_FRNDS
4.0    264
3.0    247
2.0     68
1.0     43
0.0     34
Name: count, dtype: int64
USLLY_FRNDS
3.0    147
4.0    134
2.0     35
1.0     20
0.0     18
Name: count, dtype: int64
USLLY_FRNDS
4.0    130
3.0    100
2.0     33
1.0     23
0.0     16
Name: count, dtype: int64


In [40]:
fig = px.box(df23, x = "SEX", y = "USLLY_FRNDS", title = "Boxplot", labels = {"SEX": "Sex", "USLLY_FRNDS": "Consistent Group of Friends?"})
fig.show()

In [None]:
fig.write_html("/Users/macbook/Desktop/EECS389/398_mtf/plot3BS.html", include_plotlyjs='cdn')

# Interesting Aggregates! 

In [None]:

pivot_table = pd.crosstab(df23['LONELY'], df23['WISH_MORE_FRNDS'])
pivot_table


WISH_MORE_FRNDS,0.0,1.0,2.0,3.0,4.0
LONELY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,48,18,12,10,8
1.0,37,45,14,32,17
2.0,23,29,48,39,20
3.0,21,28,15,50,34
4.0,12,11,7,25,53


## Step 3: Framing a Prediction Problem

## Step 4: Baseline Model

In [21]:
# import all the necessary tools 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics  

### Building the baseline Model

In [None]:
# TODO
def getresbasic(year):
    # 1. Load the processed data for all years
    try:
        dfpred = pd.read_csv(f"processed_data/data_{year}.csv")
    except:
        print(f"Error loading data for year {year}")
        return None
    
    # get the right data that we need 
    goal = dfpred["POL_BELIEFS"]
    Pred = dfpred[["BR_SR_inhouse","LONELY","WISH_MORE_FRNDS"]]


    # 2. Split the data
    X_train, X_test, y_train, y_test = train_test_split(Pred,goal , random_state=100,  test_size=0.20, shuffle=True) 
    
    # 3. Train the model
    rf = RandomForestClassifier(n_estimators = 100) 
    rf.fit(X_train, y_train)
    
    # 4. Predict stuff
    y_pred = rf.predict(X_test)
    
    # 5. Evaluate the model
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print("F1 Score:",metrics.f1_score(y_test, y_pred, average='weighted'))
    
    # Return nothing because we won't use anything form this model 
    return None 
    

## Step 5: Final Model

In [54]:
#Build the Final Model 


def getresfinal(year):
    # try to load in the cleaned data
    path =  f'/Users/macbook/Desktop/EECS389/398_mtf/unprocessed_data/data_{year}.csv'
    try:
        dfpred = pd.read_csv(path)
    except:
        print(f"Error loading data for year {year}")
        return None
    
    #subset all the invalid POL_BELIEFS data 
    
    dfprednew =  dfpred[dfpred["POL_BELIEFS"].isin([6, 8, -9]) == True]
    dfpred = dfpred[dfpred["POL_BELIEFS"].isin([6, 8, -9]) == False]


    # Update the columns to be used in the model
    #Parents Collumn 
    def make_parents(df):
        df["PARENTS_PRES"] = df["MOTHR_PRES"] + df["FATHR_PRES"]
        df = df.drop(columns=["MOTHR_PRES", "FATHR_PRES"])
        return df   
    
    dfpred = make_parents(dfpred)

    maingoal = dfpred["POL_BELIEFS"]
    Pred = dfpred[["BR_SR_inhouse","LONELY","WISH_MORE_FRNDS",
                   "USLLY_FRNDS","NUM_SIBS","PARENTS_PRES","SEX"]]
    

    # 2. Split the data
    X_train, X_test, y_train, y_test = train_test_split(Pred, maingoal, random_state=100, test_size=0.20, shuffle=True) 
    

    # Set Grid Search Parameters
    
    param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, 40, 50]}
    rf = RandomForestClassifier(random_state=78)
    
    # 3. Train the model
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10, scoring='accuracy') 
    grid_search.fit(X_train, y_train)
    
    best_rf = grid_search.best_estimator_
    best_rf.fit(Pred, maingoal)
    
    y_pred = best_rf.predict(X_test)
    

    Accuracy = metrics.accuracy_score(y_test, y_pred)
    F1 = metrics.f1_score(y_test, y_pred, average='weighted')
    
    print("Accuracy:", Accuracy)
    print("F1 Score:", F1)
    

    # Predict for the invalid data
    dfprednew = make_parents(dfprednew)
    Pred = dfprednew[["BR_SR_inhouse","LONELY","WISH_MORE_FRNDS",
                   "USLLY_FRNDS","NUM_SIBS","PARENTS_PRES","SEX"]]
    
    y_pred = best_rf.predict(Pred)
    # Add the new y_pred to the original goal 
    Total = pd.concat([maingoal, pd.Series(y_pred)], axis=0)

    # Find the avg total    
    return Total.mean(), year, Accuracy, F1

In [35]:
mean = getresfinal(2023)

Accuracy: 0.7832167832167832
F1 Score: 0.7813237317843981
3    440
4    337
2    281
5    114
1    109
Name: count, dtype: int64


In [36]:
mean

3.0515222482435598

In [58]:
meanarr = []
yeararr = []
Accarr = []
F1arr = []

for year in range(2000, 2024):
    mean, year, Acc, F1 = getresfinal(year)
    meanarr.append(mean)
    yeararr.append(year)
    Accarr.append(Acc)
    F1arr.append(F1)

meanarr = np.array(meanarr)
yeararr = np.array(yeararr)
Accarr = np.array(Accarr)   
F1arr = np.array(F1arr)


meanarr = (meanarr-1)/5


FinalRes = pd.DataFrame({"Year": yeararr, "Mean": meanarr, "Accuracy": Accarr, "F1": F1arr})

Accuracy: 0.7718446601941747
F1 Score: 0.7546057380599628
Accuracy: 0.7763157894736842
F1 Score: 0.7685274950720324
Accuracy: 0.7824074074074074
F1 Score: 0.7744711289091202
Accuracy: 0.7093023255813954
F1 Score: 0.6899018930215167
Accuracy: 0.7132352941176471
F1 Score: 0.6937923739100481
Accuracy: 0.6610169491525424
F1 Score: 0.6492236221130263
Accuracy: 0.7269372693726938
F1 Score: 0.7200514516712876
Accuracy: 0.660377358490566
F1 Score: 0.635968047335692
Accuracy: 0.7441860465116279
F1 Score: 0.7313797933948777
Accuracy: 0.7023411371237458
F1 Score: 0.6903098837009995
Accuracy: 0.7252747252747253
F1 Score: 0.7193170385563132
Accuracy: 0.7619047619047619
F1 Score: 0.7542132362644147
Accuracy: 0.7666666666666667
F1 Score: 0.7569512163288253
Accuracy: 0.8098159509202454
F1 Score: 0.8050237142603105
Accuracy: 0.7337662337662337
F1 Score: 0.7318459294471293
Accuracy: 0.7307692307692307
F1 Score: 0.7320196120895421
Accuracy: 0.7707006369426752
F1 Score: 0.7693731193104693
Accuracy: 0.7630

In [62]:
reFinRes = FinalRes.reset_index().set_index("Year")
reFinRes

Unnamed: 0_level_0,index,Mean,Accuracy,F1
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000,0,0.41,0.77,0.75
2001,1,0.41,0.78,0.77
2002,2,0.43,0.78,0.77
...,...,...,...,...
2021,21,0.39,0.76,0.76
2022,22,0.45,0.78,0.78
2023,23,0.41,0.78,0.78


In [None]:
fig = px.line(reFinRes, x = reFinRes.index, y = "Mean", title = "Mean of Political Beliefs Over Time",
              labels = {"Mean": "Mean of Political Beliefs"})
fig.update_yaxes(range=[2, 4])
fig.update_yaxes(tickvals=[2, 3, 4], ticktext=["Conservative", "Moderate", "Liberal"])
fig.show()

# gotta split by sex 
