In [None]:
import warnings
warnings.simplefilter(action='ignore', 
                      category=FutureWarning)      # suppress warnings
import numpy as np                                 # linear algebra
import pandas as pd                                # data analysis
import matplotlib.pyplot as plt                    # visualization - plt means we don't have to use the full pyplot every time we use it.
%matplotlib inline
import seaborn as sns                              # visualization
import scipy.stats as scipystats                   # statistics  
import statsmodels.formula.api as smf              # statistics
from statsmodels.api import add_constant           # statistics
from sklearn.feature_selection import SelectKBest  # feature selection
from sklearn.feature_selection import f_regression # feature selection

# Render every float shown by pandas with a single decimal place.
pd.set_option(
    'display.float_format',
    lambda value: '%.1f' % value,
)
# Increase the font size used by seaborn charts.
sns.set(font_scale=1.5)

print("Setup Complete")

In [None]:
# Location of the 2019 CSV, relative to the notebook's working directory.
file_path = "../input/world-happiness/2019.csv"
# Load the whole file into a DataFrame and preview the first five rows.
data = pd.read_csv(filepath_or_buffer=file_path)
data.head(n=5)

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Overlay count histograms of three columns on one axes.
# `histplot` replaces the deprecated `distplot`; the legend is drawn explicitly
# because the `label=` arguments are otherwise never shown.
for column in ['Score', 'Social support', 'Generosity']:
    sns.histplot(data[column], label=column, kde=False)
plt.xlabel('Value')  # the axes is shared, so a single column name would mislead
plt.legend();

In [None]:
# Set the style first: seaborn styles only apply to figures created afterwards.
sns.set_style("darkgrid")
# Scatter plot with a fitted linear regression line. The stray positional `alpha`
# (a syntax error after keyword arguments) is removed; point transparency is
# already passed through `scatter_kws`.
sns.lmplot(x="Score", y="Social support", data=data, scatter_kws={'alpha': 0.15});


In [None]:
# Same pair of columns as the linear fit, but with a LOWESS (locally weighted)
# smoother drawn in red so any non-linearity is visible.
# NOTE(review): the original call had no x/y and referenced an undefined `df`;
# the columns were chosen to match the previous plot - confirm the intended pair.
sns.regplot(
    x="Score",
    y="Social support",
    data=data,
    lowess=True,
    scatter_kws={'alpha': 0.15},
    line_kws={'color': 'red'},
);

In [None]:

# Choose the style sheet before creating the figure; it has no effect on
# figures that already exist.
plt.style.use('fivethirtyeight')

# Create the axes explicitly. Calling `plt.axes()` after plotting can add a new,
# empty axes instead of returning the one that holds the line.
fig, ax = plt.subplots()
ax.set_facecolor('#2f3952') #Change background colour

# Plot the columns named by the title and axis labels (the original plotted
# 'Social support' against 'Perceptions of corruption' under these labels).
ax.plot(
    data['Perceptions of corruption'],
    data['Generosity'],
    color='#c3fdff',
    marker='.',
    linestyle='--',
    label='Generosity',  # gives the legend an entry to show
)
ax.set_title('Perceptions of corruption vs generosity')
ax.set_xlabel('Corruption')
ax.set_ylabel('Generosity')
ax.legend()

# Last expression of the cell: lists the style sheets matplotlib provides.
plt.style.available
#plt.savefig('plot.png') #- save it to current directory

In [None]:
# Raise the resolution before the figure exists: rcParams are read when a figure
# is created, so setting the dpi afterwards leaves this figure unchanged.
plt.rcParams['figure.dpi'] = 200
# Empty 2 x 3 grid of axes.
fig, axes = plt.subplots(2, 3, figsize=(8, 5))
plt.tight_layout()



In [None]:
# Boolean Series indexed by column name: True where the column dtype is 'object'.
# TODO (original note): take out country or region.
s = data.dtypes == 'object'
# Names of the object-typed columns, in their original column order.
object_cols = [column for column, is_object in s.items() if is_object]

print("Categorical variables:")
print(object_cols)

In [None]:
# Replace each categorical column with integer codes (label encoding).
# This overwrites the columns of `data` in place.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
for col in object_cols:
    # The encoder is re-fitted for every column, so after the loop it only
    # holds the classes of the last column processed.
    encoded = label_encoder.fit_transform(data[col])
    data[col] = encoded

In [None]:
# Correlation of every column with 'Score', sorted in ascending order.
data.corr().loc[:, 'Score'].sort_values()

GDP seems to be the factor most highly correlated with the happiness score.
Let's visualise this on a heatmap.

In [None]:
# Keep only the rows whose overall rank is 10 or better.
top_10 = data[data['Overall rank'].le(10)]
top_10


In [None]:
# Heatmap of the factor values for the ten top-ranked rows.
# The rank and the label-encoded categorical columns are identifiers rather than
# measurements; if left in the matrix their larger values dominate the colour
# scale and the real factors become indistinguishable. The rank is used as the
# row label instead, and the encoded columns are dropped.
top_10_factors = top_10.set_index('Overall rank').drop(columns=object_cols)
plt.figure(figsize=(12, 8))  # room for the annotations at the enlarged font scale
sns.heatmap(top_10_factors, linewidths=0.1, cbar=True, annot=True, square=True, fmt='.1f');

In [None]:
# Joint 2D kernel density plot of the total score against social support.
sns.jointplot(
    x='Score',
    y='Social support',
    data=data,
    kind="kde",
)


In [None]:
# Show the five rows with the highest generosity.
# Grouping by 'Generosity' (a continuous value) creates a near-singleton group per
# row, so `.groupby(...).head()` returned almost the entire frame.
# NOTE(review): assumes the goal was to inspect the most generous rows - confirm.
data.sort_values('Generosity', ascending=False).head()

MORE VISUALISATION


In [None]:
#Select the prediction target and the feature columns used by the models below
from sklearn.model_selection import train_test_split

# Target: the happiness score.
y = data.Score
# Predictors fed to the models.
happiness_features = ['Generosity', 'Social support', 'Perceptions of corruption']
X = data[happiness_features]
# Call describe() - without the parentheses the cell only shows the bound method.
X.describe()


In [None]:
# Baseline model: a single decision tree regressor.
from sklearn.tree import DecisionTreeRegressor

# A fixed seed keeps the fitted tree the same from run to run.
basic_model = DecisionTreeRegressor(random_state=1)
# Fit on every row; the fitted estimator is the cell's displayed value.
basic_model.fit(X=X, y=y)

In [None]:
# Show the first rows of the features next to the model's predictions for them.
first_rows = X.head()
print("Making predictions for happiness scores:")
print(first_rows)
print("The predictions are")
print(basic_model.predict(first_rows))

In [None]:
#Basic Validation using in-sample score
from sklearn.metrics import mean_absolute_error

# Predict on the same rows the model was trained on. This in-sample error is an
# optimistic estimate; the hold-out split in the next cell gives a fairer one.
predicted_happiness_scores = basic_model.predict(X)
# Use the variable defined above (the original referenced a misspelled name and
# raised NameError).
mean_absolute_error(y, predicted_happiness_scores)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the rows for validation; the seed fixes the split.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

# Seed the tree as well (same seed as the baseline model above); an unseeded
# tree can break ties between equally good splits differently on each run,
# which makes the validation score unrepeatable.
basic_model = DecisionTreeRegressor(random_state=1)

basic_model.fit(train_X, train_y)

# get predicted happiness scores on validation data
basic_val_predictions = basic_model.predict(val_X)
basic_score = mean_absolute_error(val_y, basic_val_predictions)
print(basic_score)

In [None]:
# Random forest fitted on the training split and scored on the validation split.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# fit() returns the estimator itself, so construction and fitting can be chained.
rf_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)
happiness_preds = rf_model.predict(val_X)
rf_score = mean_absolute_error(y_true=val_y, y_pred=happiness_preds)
print(rf_score)

In [None]:
#Calculate max leaf nodes to optimise model
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a size-limited decision tree and return its validation error.

    Parameters
    ----------
    max_leaf_nodes : passed to ``DecisionTreeRegressor(max_leaf_nodes=...)``.
    train_X, train_y : features and target the tree is fitted on.
    val_X, val_y : features and target the fitted tree is scored on.

    Returns
    -------
    The mean absolute error between ``val_y`` and the tree's predictions
    for ``val_X``.
    """
    # Seed fixed at 0 so repeated calls with the same inputs give the same tree.
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0).fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [None]:
# Compare the validation error for several tree-size limits.
for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    # The MAE is a float: '%d' would truncate it to an integer and hide any
    # difference between the candidates, so print it with three decimals.
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %.3f" %(max_leaf_nodes, my_mae))

In [None]:
#Using Pipelines before using Cross-Validation and XGBoosting to optimise predictions
#First we bundle preprocessing and modelling

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

#NUMERICAL DATA
#numerical_transformer = SimpleImputer(strategy='constant')
#CATEGORICAL DATA
#categorical_transformer = Pipeline(steps=[
#    ('imputer', SimpleImputer(strategy='most_frequent')),
#    ('one hot', OneHotEncoder())
#])

#PREPROCESSING
#preprocessor = ColumnTransformer(
#transformers=[
#    ('num', numerical_transformer, numerical_cols),
#    ('cat', categorical_transformer, categorical_cols)
#])

In [None]:
#Define Model
#from sklearn.ensemble import RandomForestRegressor
#model = RandomForestRegressor(n_estimators=100, random_state=0)

In [None]:
#Bundle preprocessing and modelling into pipeline

from sklearn.metrics import mean_absolute_error

#Preprocessing and model bundled
#my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
 #                            ('model', model)
  #                           ])

#fit model to training data (names match the train/validation split made earlier)
#my_pipeline.fit(train_X, train_y)

#Get Preds
#preds = my_pipeline.predict(val_X)

#Evaluate
#score = mean_absolute_error(val_y, preds)
#print(score)