In [0]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)


In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#data=pd.read_csv("/Users/pc/Desktop/ML/Data/worldhappiness2019.csv")
#regiondata=pd.read_csv("/Users/pc/Desktop/ML/Data/region.csv")
import io
from google.colab import files
uploaded = files.upload()
data = pd.read_csv(io.BytesIO(uploaded['worldhappiness2019.csv']))
regiondata = pd.read_csv(io.BytesIO(uploaded['region.csv']))


mergedata=pd.merge(data, regiondata, how='left', left_on='Country or region', right_on='name')
# Check for missing values (there won't be any given that I have already cleaned up the region data)
mergedata.loc[pd.isnull(mergedata).iloc[:,9]].to_csv("missing.csv",index=False)

# clean up final region data
X=mergedata.drop(['Happiness_level'],axis=1)
X=X.drop(['name'],axis=1)
X=X.drop(['Country or region'],axis=1)
X=X.drop(['sub-region'],axis=1)

X
df = mergedata

Saving region.csv to region.csv
Saving worldhappiness2019.csv to worldhappiness2019.csv


In [3]:
# Set up training and test data
from sklearn.model_selection import train_test_split

# transfer text labels to integers 

y=df['Happiness_level']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_train.columns.tolist())

(117, 7)
(117,)
['GDP per capita', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption', 'region']


In [4]:
# integer-label y 
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
lbl_enc = preprocessing.LabelEncoder()
lbl_enc.fit_transform(y_train.values)
lbl_enc.fit_transform(y_test.values)


array([2, 0, 0, 0, 2, 3, 2, 4, 0, 4, 3, 0, 3, 4, 3, 3, 2, 2, 4, 0, 2, 3,
       3, 3, 1, 2, 0, 1, 1, 2, 1, 4, 1, 4, 2, 4, 4, 0, 1])

In [0]:
# Preprocess data using Column Transformer and save fit preprocessor to ".pkl" file
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# create the preprocessing pipelines for both numeric and categorical data.

numeric_features=X.columns.tolist()

numeric_features.remove('region')

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['region']

#Replacing missing values with Modal value and then one hot encoding.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# final preprocessor object set up with ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


#Fit your preprocessor object
prediction_input_preprocessor=preprocessor.fit(X_train) 

import pickle
pickle.dump(prediction_input_preprocessor, open( "preprocessor.pkl", "wb" ) )

In [6]:
# Check shape for keras input:
prediction_input_preprocessor.transform(X_train).shape

(117, 11)

In [11]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, cross_val_predict
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
from warnings import filterwarnings
filterwarnings('ignore')
kfold = KFold(n_splits = 5)
from sklearn.ensemble import RandomForestClassifier

param_grid_rf = {'n_estimators' : [10,90,50],'max_depth':[5,10,15,50], 'min_samples_leaf': [2,4,6]}
rf_grid=GridSearchCV(RandomForestClassifier(criterion = 'gini',oob_score=True),param_grid_rf).fit(prediction_input_preprocessor.transform(X_train), lbl_enc.fit_transform(y_train.values))

test = rf_grid.score(prediction_input_preprocessor.transform(X_test), lbl_enc.fit_transform(y_test.values))
print("Training set score: {:.5f}".format(rf_grid.score(prediction_input_preprocessor.transform(X_train), lbl_enc.fit_transform(y_train.values))))
print("Test set score: {:.5f}".format(test)) 
print("Best Parameter: {}".format(rf_grid.best_params_))
CV= np.mean(cross_val_score(rf_grid, prediction_input_preprocessor.transform(X_train), lbl_enc.fit_transform(y_train.values), cv=kfold)) 
print("Mean Cross Validation, KFold: {:.5f}".format(CV))


Training set score: 0.82906
Test set score: 0.48718
Best Parameter: {'max_depth': 50, 'min_samples_leaf': 4, 'n_estimators': 50}
Mean Cross Validation, KFold: 0.55761


In [12]:
for name, importance in zip(X_train.columns, rf_grid.best_estimator_.feature_importances_):
    print(name, importance)

GDP per capita 0.2127188758733646
Social support 0.1629483886817683
Healthy life expectancy 0.19181565087820757
Freedom to make life choices 0.14542788047580613
Generosity 0.053896752869614664
Perceptions of corruption 0.1329867607225606
region 0.04119225778579521


In [13]:
# using predict_classes() for multi-class data to return predicted class index.

print(rf_grid.predict(prediction_input_preprocessor.transform(X_test)))

prediction_index=rf_grid.predict(prediction_input_preprocessor.transform(X_test))

#Now lets run some code to get keras to return the label rather than the index...

# get labels from one hot encoded y_train data
labels=pd.get_dummies(y_train).columns

# Function to use to return label from column index location
def index_to_label(labels,index_n): 
    return labels[index_n]
    
# Example: return label at predicted index location 1
index_to_label(labels,1)

# Iterate through all predicted indices using map method

predicted_labels=list(map(lambda x: labels[x], prediction_index))
print(predicted_labels)

[0 0 0 1 4 1 2 0 1 2 1 0 3 2 1 1 4 4 4 1 2 3 1 3 1 2 1 1 0 2 0 2 1 4 2 4 4
 4 1]
['Average', 'Average', 'Average', 'High', 'Very Low', 'High', 'Low', 'Average', 'High', 'Low', 'High', 'Average', 'Very High', 'Low', 'High', 'High', 'Very Low', 'Very Low', 'Very Low', 'High', 'Low', 'Very High', 'High', 'Very High', 'High', 'Low', 'High', 'High', 'Average', 'Low', 'Average', 'Low', 'High', 'Very Low', 'Low', 'Very Low', 'Very Low', 'Very Low', 'High']


In [14]:
# Now we can extract some evaluative metrics to use for model submission

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
import pandas as pd
from math import sqrt

def model_eval_metrics(y_true, y_pred,classification="TRUE"):
     if classification=="TRUE":
        accuracy_eval = accuracy_score(y_true, y_pred)
        f1_score_eval = f1_score(y_true, y_pred,average="macro",zero_division=0)
        precision_eval = precision_score(y_true, y_pred,average="macro",zero_division=0)
        recall_eval = recall_score(y_true, y_pred,average="macro",zero_division=0)
        mse_eval = 0
        rmse_eval = 0
        mae_eval = 0
        r2_eval = 0
        metricdata = {'accuracy': [accuracy_eval], 'f1_score': [f1_score_eval], 'precision': [precision_eval], 'recall': [recall_eval], 'mse': [mse_eval], 'rmse': [rmse_eval], 'mae': [mae_eval], 'r2': [r2_eval]}
        finalmetricdata = pd.DataFrame.from_dict(metricdata)
     else:
        accuracy_eval = 0
        f1_score_eval = 0
        precision_eval = 0
        recall_eval = 0
        mse_eval = mean_squared_error(y_true, y_pred)
        rmse_eval = sqrt(mean_squared_error(y_true, y_pred))
        mae_eval = mean_absolute_error(y_true, y_pred)
        r2_eval = r2_score(y_true, y_pred)
        metricdata = {'accuracy': [accuracy_eval], 'f1_score': [f1_score_eval], 'precision': [precision_eval], 'recall': [recall_eval], 'mse': [mse_eval], 'rmse': [rmse_eval], 'mae': [mae_eval], 'r2': [r2_eval]}
        finalmetricdata = pd.DataFrame.from_dict(metricdata)
     return finalmetricdata

model_eval_metrics( y_test,predicted_labels,classification="TRUE")


Unnamed: 0,accuracy,f1_score,precision,recall,mse,rmse,mae,r2
0,0.487179,0.490948,0.572253,0.494444,0,0,0,0
