In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter


In [2]:
# Load the data
file_path = Path('happiness_joined.csv')
happiness_joined_df = pd.read_csv(file_path,encoding="ISO-8859-1")#,index_col=0)
# happiness_joined_df = df.loc[:, columns].copy()

# df.reset_index(inplace=True, drop=True)

happiness_joined_df.head()


Unnamed: 0,country_code,country_name,democracy_index,consumer_price_index,gender_ratio_males_per100_female,infant_mortality_per1000_births,life_expectancy,per_capita_gdp_dollars,population_density,safe_drinking_water_access_pct,seats_held_by_women_pct,unemployment_rate,life_ladder
0,8,Albania,58.9,119.1,103.7,8.0,78.4,"$5,303.00",105.1,70.7,29.3,12.8,5.0
1,12,Algeria,40.1,151.4,102.1,21.2,76.6,"$3,976.00",18.1,72.4,25.8,11.5,4.75
2,51,Armenia,55.4,129.2,88.8,10.8,74.9,"$4,623.00",103.9,86.9,24.2,16.6,5.49
3,40,Austria,82.9,118.1,97.0,3.2,81.4,"$49,701.00",108.7,98.9,37.2,4.8,7.2
4,31,Azerbaijan,27.5,156.9,99.7,20.8,72.8,"$4,782.00",121.6,88.3,16.8,6.0,5.17


In [3]:
happiness_joined_df[['country_code','democracy_index','consumer_price_index','gender_ratio_males_per100_female','infant_mortality_per1000_births','life_expectancy','per_capita_gdp_dollars','population_density','safe_drinking_water_access_pct','seats_held_by_women_pct','unemployment_rate']] = happiness_joined_df[['country_code','democracy_index','consumer_price_index','gender_ratio_males_per100_female','infant_mortality_per1000_births','life_expectancy','per_capita_gdp_dollars','population_density','safe_drinking_water_access_pct','seats_held_by_women_pct','unemployment_rate']].replace({'\$': '', ',': ''}, regex=True).astype(float)
#
happiness_joined_df.head()
# # REF: https://pbpython.com/currency-cleanup.html

Unnamed: 0,country_code,country_name,democracy_index,consumer_price_index,gender_ratio_males_per100_female,infant_mortality_per1000_births,life_expectancy,per_capita_gdp_dollars,population_density,safe_drinking_water_access_pct,seats_held_by_women_pct,unemployment_rate,life_ladder
0,8.0,Albania,58.9,119.1,103.7,8.0,78.4,5303.0,105.1,70.7,29.3,12.8,5.0
1,12.0,Algeria,40.1,151.4,102.1,21.2,76.6,3976.0,18.1,72.4,25.8,11.5,4.75
2,51.0,Armenia,55.4,129.2,88.8,10.8,74.9,4623.0,103.9,86.9,24.2,16.6,5.49
3,40.0,Austria,82.9,118.1,97.0,3.2,81.4,49701.0,108.7,98.9,37.2,4.8,7.2
4,31.0,Azerbaijan,27.5,156.9,99.7,20.8,72.8,4782.0,121.6,88.3,16.8,6.0,5.17


In [4]:
# data types
happiness_joined_df.dtypes

country_code                        float64
country_name                         object
democracy_index                     float64
consumer_price_index                float64
gender_ratio_males_per100_female    float64
infant_mortality_per1000_births     float64
life_expectancy                     float64
per_capita_gdp_dollars              float64
population_density                  float64
safe_drinking_water_access_pct      float64
seats_held_by_women_pct             float64
unemployment_rate                   float64
life_ladder                         float64
dtype: object

In [5]:
# What data is missing?
# find null values
for column in happiness_joined_df.columns:
    print(f"Column{column} has {happiness_joined_df[column].isnull().sum()} null values")

Columncountry_code has 0 null values
Columncountry_name has 0 null values
Columndemocracy_index has 0 null values
Columnconsumer_price_index has 0 null values
Columngender_ratio_males_per100_female has 0 null values
Columninfant_mortality_per1000_births has 0 null values
Columnlife_expectancy has 0 null values
Columnper_capita_gdp_dollars has 0 null values
Columnpopulation_density has 0 null values
Columnsafe_drinking_water_access_pct has 0 null values
Columnseats_held_by_women_pct has 0 null values
Columnunemployment_rate has 0 null values
Columnlife_ladder has 0 null values


In [6]:
# to check for duplicates
print(f"Duplicate entries: {happiness_joined_df.duplicated().sum()}")

Duplicate entries: 0


## Splitting the Dataset into the Independent Feature Matrix:

In [7]:
happiness_joined_df.columns.values.tolist()

['country_code',
 'country_name',
 'democracy_index',
 'consumer_price_index',
 'gender_ratio_males_per100_female',
 'infant_mortality_per1000_births',
 'life_expectancy',
 'per_capita_gdp_dollars',
 'population_density',
 'safe_drinking_water_access_pct',
 'seats_held_by_women_pct',
 'unemployment_rate',
 'life_ladder']

In [8]:
# Use get_dummies() to create variables for text features.
# YOUR CODE HERE
# X = happiness_joined_df.copy()
# X = pd.get_dummies(X, columns = ["Algorithm", "ProofType"])
# X

In [9]:
# select features 
X = happiness_joined_df[['country_code','democracy_index','consumer_price_index','gender_ratio_males_per100_female','infant_mortality_per1000_births','life_expectancy','per_capita_gdp_dollars','population_density','safe_drinking_water_access_pct','seats_held_by_women_pct','unemployment_rate']]
y = happiness_joined_df["life_ladder"]
# ref: https://www.pluralsight.com/guides/importing-and-splitting-data-into-dependent-and-independent-features-for-ml
# ref: https://towardsdatascience.com/multiple-regression-as-a-machine-learning-algorithm-a98a6b9f307b

In [10]:
print(happiness_joined_df.shape)

(91, 13)


## Modeling

In [14]:
from sklearn import linear_model
# instantiate model
model = linear_model.LinearRegression()
# fit model
model.fit(X, y)

# fit
model.fit(X_train, y_train)
# predict
y_pred = model.predict(X_test)

In [13]:
# Normal train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [15]:
# Instantiate a linear SVM model
from sklearn.svm import SVC
model = SVC(kernel='linear')

In [16]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(68, 11)
(23, 11)
(68,)
(23,)


In [17]:
from sklearn.preprocessing import StandardScaler
# 3) Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# 4) Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [18]:
print(np.mean(X_train_scaled[:,0]))
print(np.mean(X_test_scaled[:,0]))
print(np.std(X_train_scaled[:,0]))
print(np.std(X_test_scaled[:,0]))

3.7347576012206556e-17
0.3803815653424447
1.0
0.937109311161882


In [19]:
# Resample the training data with the BalancedRandomForestClassifier
# YOUR CODE HERE
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=78) 

In [20]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

ValueError: Unknown label type: 'continuous'

In [23]:
# model evaluation
from sklearn.metrics import r2_score
score = r2_score(y_test, y_pred)
print(score)
# this is R^2 value!!!!

0.30200177624084346
