# Data Processing #

In [26]:
# Hard-coded settings used while testing the notebook.
# Columns of the happiness dataset that serve as model inputs.
features = [
    "GDP per capita",
    "Social support",
    "Healthy life expectancy",
    "Freedom to make life choices",
    "Generosity",
    "Perceptions of corruption",
]
target = "Score"  # column the model is fitted to predict
title = "testing testing"  # placeholder; not referenced by the cells shown here
testSize = 0.1  # held-out fraction passed to train_test_split
randomState = 4  # fixed seed, for reproducible output
numFolds = 10  # number of splits given to KFold

In [27]:
# Read the happiness CSV and separate it into model inputs and the target.
import pandas as pd

data = pd.read_csv("caraml/caraml/media/happiness.csv")

# X holds the columns named in `features`; y holds the single `target` column.
X = data[features]
y = data[target]

# Report the dimensions of X, then preview its first three rows.
print(X.shape)
X.head(3)

(156, 6)


Unnamed: 0,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1.34,1.587,0.986,0.596,0.153,0.393
1,1.383,1.573,0.996,0.592,0.252,0.41
2,1.488,1.582,1.028,0.603,0.271,0.341


# Data Analysis #

In [34]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

# Shuffled K-fold splitter; the fixed seed keeps fold membership reproducible.
kf = KFold(n_splits=numFolds, shuffle=True, random_state=randomState)

##### cross validation and model evaluation #####
fold_scores = []  # R^2 measured on the held-out part of each fold
for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
    ## pre-processing phase
    # KFold.split yields positional row numbers, so rows are selected with
    # .iloc. Label-based .loc only agrees with it while the frame carries a
    # default 0..n-1 index and would pick wrong rows (or raise) otherwise.
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    ## fit a linear regression on the training rows
    model = LinearRegression().fit(X_train, y_train)

    # score() predicts on X_test internally and returns R^2 against y_test,
    # so no separate predict() call is needed.
    r_squared = model.score(X_test, y_test)
    fold_scores.append(r_squared)

    print(f'r squared for fold {fold} is {r_squared}')

# Average over the folds that were actually evaluated rather than relying on
# the loop variable surviving the loop.
score = sum(fold_scores)
avg_score = score / len(fold_scores)

print(f'average score is {avg_score}')

r squared for fold 0 is 0.6743605659171716
r squared for fold 1 is 0.7498664860472705
r squared for fold 2 is 0.7404867135145543
r squared for fold 3 is 0.8702585621373055
r squared for fold 4 is 0.7948691048560934
r squared for fold 5 is 0.907648376092676
r squared for fold 6 is 0.6239831559349951
r squared for fold 7 is 0.6245190260881304
r squared for fold 8 is 0.659597734651052
r squared for fold 9 is 0.6523650881191467
average score is 0.7297954813358395


# Double Checking #
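Below, I sanity-check the cross-validation loop against a single `train_test_split` run that uses the same `random_state` and a test fraction of `testSize` (0.1, i.e. one fold out of ten). The R² printed below should match one of the per-fold values above up to floating-point rounding — in the run shown, it matches fold 0.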

In [32]:
# check single fold calculations to validate this
# LinearRegression is imported here as well so this cell does not depend on
# another cell having been executed first.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# One shuffled hold-out split using the same seed as the K-fold splitter.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testSize, random_state=randomState)

# create and fit model
model = LinearRegression().fit(X_train, y_train)

# score() predicts on X_test internally and returns R^2 against y_test
r_squared = model.score(X_test, y_test)

print(f'r squared is {r_squared}')

r squared is 0.6743605659171727
