In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Human Freedom Index CSV into a DataFrame and preview the first rows
df=pd.read_csv('../input/the-human-freedom-index/hfi_cc_2020.csv')
df.head()

# Data Preprocessing

In [None]:
# Remove column 'Unnamed: 0' (the row index that was written into the CSV).
# Dropping by name with errors='ignore' keeps this cell idempotent: the previous
# positional, in-place drop removed one more real column on every re-run.
df=df.drop(columns=['Unnamed: 0'],errors='ignore')
df.head()

In [None]:
# Print the column dtypes and non-null counts of the frame
df.info()

In [None]:
# Data type of each column
df.dtypes

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Keep the column labels as a plain list; later cells slice it with [4:]
column_names=df.columns.tolist()
print(column_names)

In [None]:
# Summary statistics of the numeric columns
df.describe()

## Handle Missing Data

In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Fill NaN values of an attribute with the mean of that attribute for the same country
nan_columns = df.isnull().any()
columns_with_nan=df.columns[nan_columns].tolist()
print(columns_with_nan)

for column in columns_with_nan:
    # Per-row mean of the column within the row's country (NaN if the country has no values at all)
    country_mean=df.groupby('countries')[column].transform('mean')
    # Assign the result back instead of df[column].fillna(..., inplace=True): the in-place
    # call on a selected column is chained assignment, which pandas 2.x warns about and
    # which does not modify df under Copy-on-Write.
    df[column]=df[column].fillna(country_mean)

In [None]:
# Re-count the missing values per column after the country-wise fill
df.isnull().sum()

In [None]:
# Some countries have no values at all for certain columns, so their country mean is NaN too;
# show which of the originally incomplete columns still contain NaN
df[columns_with_nan].isnull().any()

In [None]:
# Fill the remaining NaN values with the overall column mean
for column in columns_with_nan:
    mean_value=df[column].mean()
    # Assign back rather than using inplace=True on a selected column (chained
    # assignment: warns in pandas 2.x and is a no-op under Copy-on-Write)
    df[column]=df[column].fillna(mean_value)

In [None]:
# Verify whether any of the originally incomplete columns still contains NaN
df[columns_with_nan].isnull().any()

## Data Overview

In [None]:
# Names of the columns whose dtype is 'object' (the categorical/text columns)
categorical_columns=[name for name, dtype in df.dtypes.items() if dtype=='object']
print(categorical_columns)

In [None]:
# Distinct country and region names, ordered by how often each occurs
country_names=df['countries'].value_counts().index.tolist()
region_names=df['region'].value_counts().index.tolist()

In [None]:
# Number of distinct countries in the dataset
print(len(country_names))

In [None]:
# Number of rows per country (author's note: the counts are the same for ISO_code)
df['countries'].value_counts()

In [None]:
# Number of rows per region
df['region'].value_counts()

In [None]:
# Print the value frequencies of every column after the first four
# (column_names[4:] are treated as the numerical attributes)
for column in column_names[4:]:
    print(f"Column: {column}")
    print(df[column].value_counts())
    print()

## Target Value Statistics

In [None]:
# Summary statistics of the human freedom score
df['hf_score'].describe()

In [None]:
# Summary statistics of the personal freedom score
df['pf_score'].describe()

In [None]:
# Summary statistics of the economic freedom score
df['ef_score'].describe()

# Exploratory Data Analysis

## Target Trends

In [None]:
# Set the default figure size (width, height) used by subsequent plots
# NOTE(review): 100x50 is very large and applies to every later figure that does not set its own size
sns.set(rc={'figure.figsize':(100,50)})

In [None]:
# Plot hf_score trend over the years, one line per country
sns.lineplot(data=df, x="year", y="hf_score",hue="countries")
# One legend call only: every plt.legend() call replaces the previous legend, so the
# font size and the location must be passed together or the earlier setting is lost.
plt.legend(fontsize="xx-large", loc='upper right')

In [None]:
# Plot hf_score trend over the years, one line per region
sns.lineplot(data=df, x="year", y="hf_score",hue="region")
# Single legend call; an earlier plt.legend(fontsize=25) was immediately replaced by this one
plt.legend(fontsize="x-large")

In [None]:
# Plot pf_score trend over the years, one line per country
sns.lineplot(data=df, x="year", y="pf_score",hue="countries")
# One legend call only: every plt.legend() call replaces the previous legend, so the
# font size and the location must be passed together or the earlier setting is lost.
plt.legend(fontsize="xx-large", loc='upper right')

In [None]:
# Plot pf_score trend over the years, one line per region
sns.lineplot(data=df, x="year", y="pf_score",hue="region")
# Single legend call; an earlier plt.legend(fontsize=25) was immediately replaced by this one
plt.legend(fontsize="x-large")

In [None]:
# Plot ef_score trend over the years, one line per country
sns.lineplot(data=df, x="year", y="ef_score",hue="countries")
# One legend call only: every plt.legend() call replaces the previous legend, so the
# font size and the location must be passed together or the earlier setting is lost.
plt.legend(fontsize="xx-large", loc='upper right')

In [None]:
# Plot ef_score trend over the years, one line per region
sns.lineplot(data=df, x="year", y="ef_score",hue="region")
# Single legend call; an earlier plt.legend(fontsize=25) was immediately replaced by this one
plt.legend(fontsize="x-large")

In [None]:
# Rank, country and score, ordered from the best (lowest) hf_rank downwards
df.loc[:, ['hf_rank','countries','hf_score']].sort_values('hf_rank')

## Attribute Analysis

In [None]:
# Bar plot of each attribute per region, with one bar per year
for column in column_names[4:]:
    # sns.catplot is figure-level and creates its own figure, so no plt.figure() call
    # precedes it: that call only left an extra empty 100x50 figure per iteration.
    sns.catplot(x="region", y=column, hue="year", kind="bar", aspect=100/20, data=df)
    plt.title(column+" (avg) vs year")
    plt.ylabel(column)
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Overall Top 20 and Bottom 20 countries (by per-country average) for each attribute
for column in column_names[4:]:
    # Compute the ranked per-country averages once and reuse them for both panels
    country_avg=df.groupby(['countries'])[column].mean().sort_values(ascending=False)

    fig, (ax1, ax2) = plt.subplots(1,2, figsize=(23,5))
    country_avg.head(20).plot.bar(ax=ax1)
    ax1.set_title(column+" (avg) Top 20")
    country_avg.tail(20).plot.bar(ax=ax2)
    ax2.set_title(column+" (avg) Bottom 20")

    # Label both panels explicitly; plt.ylabel/plt.xticks only act on the current
    # (last created) axes, which left the Top 20 panel without a y label.
    for ax in (ax1, ax2):
        ax.set_ylabel(column)
        ax.tick_params(axis='x', labelrotation=90)
    plt.show()


## Attribute Correlation 

In [None]:
# Indicators to correlate: every column after the first four, minus the ranks and the quartile
not_to_check={'hf_rank','hf_quartile','pf_rank','ef_rank'}
attributes=sorted(set(column_names[4:]).difference(not_to_check))

In [None]:
# Pairwise correlation of the selected indicators, drawn as an annotated heatmap
correlation=df[attributes].corr()
fig, ax = plt.subplots(figsize=(100,50))
sns.heatmap(correlation,annot=True,ax=ax)

In [None]:
# Flatten the correlation matrix into one column of pairwise values, sorted from highest to lowest.
# Entries that are not < 1 (e.g. the diagonal) are masked to NaN first; drop_duplicates then
# removes repeated values such as the mirrored (a, b)/(b, a) entries.
# NOTE(review): drop_duplicates compares the values only, so two different pairs with exactly
# the same correlation would also collapse into one row - confirm this is acceptable.
corr_values=pd.DataFrame(correlation[correlation < 1].unstack().transpose().sort_values( ascending=False).drop_duplicates())
corr_values

In [None]:
# Obtain strong positive correlations (correlation >= 0.75)
# NOTE(review): an earlier version of this comment said >= 0.5; the code filters at 0.75 - confirm the intended threshold
positive_corr=corr_values[corr_values[0]>=0.75]
positive_corr

In [None]:
# Obtain moderate and strong negative correlations (correlation <= -0.5)
negative_corr=corr_values[corr_values[0]<=-0.5]
negative_corr

In [None]:
# Pairplot between the above attributes, to check dependency
#plt.figure(figsize=(20,20))
#sns.pairplot(df[attributes])
#plt.show()

# Target Prediction

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [None]:
# personal_attr: every pf_* column (personal freedom indicators and pf_score), without pf_rank
personal_attr=df.columns[df.columns.str.startswith('pf_')].tolist()
personal_attr.remove('pf_rank')

# economic_attr: every ef_* column (economic freedom indicators and ef_score), without ef_rank
economic_attr=df.columns[df.columns.str.startswith('ef_')].tolist()
economic_attr.remove('ef_rank')

# total_attr: both groups plus hf_score, i.e. all numerical attributes
# except hf_rank, ef_rank, pf_rank, hf_quartile, and year
total_attr=personal_attr+economic_attr+['hf_score']

## Personal Freedom Data

In [None]:
# Personal-freedom subset: the pf_* columns selected in personal_attr (includes the target pf_score)
pf=df[personal_attr]
pf.head()

In [None]:
# Count missing values per column in the personal-freedom subset
pf.isnull().sum()

In [None]:
# Sweep the training fraction from 0.65 to 0.85 (100 steps) and record the R^2 score
# of a linear regression on the train and test partitions for each fraction.
size=np.linspace(0.65, 0.85, 100)

scaler=StandardScaler()

train_accuracies=[]
test_accuracies=[]

for t_size in size:
    Train, Test=train_test_split(pf,train_size=t_size,random_state=42)

    # Target is pf_score; everything left in Train/Test after the pop is a feature
    Y_train=Train.pop('pf_score')
    X_train=Train
    Y_test=Test.pop('pf_score')
    X_test=Test

    # Fit the scaler on the training features only and reuse those statistics for the
    # test features. Re-fitting on the test split (fit_transform) scaled the two
    # partitions differently and let test-set statistics leak into the evaluation.
    X_train[X_train.columns]=scaler.fit_transform(X_train[X_train.columns])
    X_test[X_test.columns]=scaler.transform(X_test[X_test.columns])

    linear_model=LinearRegression()
    linear_model.fit(X_train,Y_train)
    train_accuracies.append(linear_model.score(X_train,Y_train))
    test_accuracies.append(linear_model.score(X_test,Y_test))

In [None]:
# Report the sweep entry with the highest training score
best_train_idx=train_accuracies.index(max(train_accuracies))
print("Maximum Train accuracy: ",train_accuracies[best_train_idx])
print("for train size",size[best_train_idx])
print("Corresponding Test accuracy: ",test_accuracies[best_train_idx])

In [None]:
# Report the sweep entry with the highest test score
best_test_idx=test_accuracies.index(max(test_accuracies))
print("Maximum Test accuracy: ",test_accuracies[best_test_idx])
print("for train size",size[best_test_idx])
print("Corresponding Train accuracy: ",train_accuracies[best_test_idx])

In [None]:
# Final split with train_size=0.65 (author's note: this gave the highest accuracy in the sweep)
# NOTE(review): the value is hard-coded - re-check it against the printed sweep results
Train, Test=train_test_split(pf,train_size=0.65,random_state=42)

In [None]:
# Separate Dependent (pf_score) and Independent variables.
# Select/drop instead of pop(): pop() removed the column from Train/Test in place,
# so running this cell a second time raised KeyError. drop() also returns new frames,
# leaving Train and Test untouched.
Y_train=Train['pf_score']
X_train=Train.drop(columns='pf_score')

Y_test=Test['pf_score']
X_test=Test.drop(columns='pf_score')

In [None]:
# Bring the independent features to a common scale with StandardScaler,
# fitted on the training features
scaler=StandardScaler()

feature_cols=X_train.columns
X_train[feature_cols]=scaler.fit_transform(X_train[feature_cols])
X_train.describe()

In [None]:
# Scale the test features with the statistics learned from the training features.
# transform(), not fit_transform(): re-fitting on the test split would scale it
# differently from the data the model is trained on.
X_test[X_test.columns]=scaler.transform(X_test[X_test.columns])
X_test.describe()

In [None]:
# Build the Linear Regression model: fit pf_score on the scaled training features
linear_model=LinearRegression()
linear_model.fit(X_train,Y_train)

In [None]:
# Check the R^2 score (goodness of fit) on the training data
linear_model.score(X_train,Y_train)

In [None]:
# Check the R^2 score (goodness of fit) on the test data
linear_model.score(X_test,Y_test)

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-square error of the fitted model on the training data
y_train_predict=linear_model.predict(X_train)
train_mse=mean_squared_error(Y_train,y_train_predict)
rms_err=np.sqrt(train_mse)
print(f"Train RMSE: {rms_err}")

# Root-mean-square error of the fitted model on the test data
y_test_predict=linear_model.predict(X_test)
test_mse=mean_squared_error(Y_test,y_test_predict)
rms_err=np.sqrt(test_mse)
print(f"Test RMSE: {rms_err}")

In [None]:
# Predicted vs. actual pf_score on the test data; the line is the diagonal
# spanning the range of the predictions
plt.figure(figsize=(10,10))
plt.scatter(Y_test,y_test_predict)
lowest, highest = min(y_test_predict), max(y_test_predict)
plt.plot([lowest,highest],[lowest,highest])
plt.xlabel('Actual')
plt.ylabel('Prediction')

In [None]:
# Equation Parameters: the fitted coefficients and the intercept of the linear model
print("Coeffecients:",linear_model.coef_)
print("Intercept",linear_model.intercept_)

In [None]:
# Residual diagnostics on the training data.
# 1) The distribution of the residuals should be normal - true (author's observation)
plt.figure(figsize=(10,5))
target_prediction = linear_model.predict(X_train)
residual = Y_train - target_prediction
# histplot with a KDE overlay on a density scale replaces the deprecated sns.distplot
sns.histplot(residual, kde=True, stat="density")
plt.xlabel('Residuals')
plt.title("Residual Analysis",fontsize=20)
plt.show()

# 2) Residuals must not be related to the predictions - true (author's observation)
sns.scatterplot(x=residual,y=target_prediction)
plt.xlabel('Residuals')
plt.ylabel('Predicted')
plt.title("Residual Analysis",fontsize=20)
plt.show()

In [None]:
# Model Equation: the intercept followed by one "coefficient * feature" term per column
terms=[str(weight)+' * '+str(feature) for weight, feature in zip(linear_model.coef_, X_train.columns)]
model=' + '.join([str(linear_model.intercept_)]+terms)
print("pf_score =",model)


## Economic Freedom Data

In [None]:
# Economic-freedom subset: the ef_* columns selected in economic_attr (includes the target ef_score)
ef=df[economic_attr]
ef.head()

In [None]:
# Count missing values per column in the economic-freedom subset
ef.isnull().sum()

In [None]:
# Sweep the training fraction from 0.65 to 0.85 (100 steps) and record the R^2 score
# of a linear regression on the train and test partitions for each fraction.
size=np.linspace(0.65, 0.85, 100)

scaler=StandardScaler()

train_accuracies=[]
test_accuracies=[]

for t_size in size:
    Train, Test=train_test_split(ef,train_size=t_size,random_state=42)

    # Target is ef_score; everything left in Train/Test after the pop is a feature
    Y_train=Train.pop('ef_score')
    X_train=Train
    Y_test=Test.pop('ef_score')
    X_test=Test

    # Fit the scaler on the training features only and reuse those statistics for the
    # test features. Re-fitting on the test split (fit_transform) scaled the two
    # partitions differently and let test-set statistics leak into the evaluation.
    X_train[X_train.columns]=scaler.fit_transform(X_train[X_train.columns])
    X_test[X_test.columns]=scaler.transform(X_test[X_test.columns])

    linear_model=LinearRegression()
    linear_model.fit(X_train,Y_train)
    train_accuracies.append(linear_model.score(X_train,Y_train))
    test_accuracies.append(linear_model.score(X_test,Y_test))

In [None]:
# Report the sweep entry with the highest training score
best_train_idx=train_accuracies.index(max(train_accuracies))
print("Maximum Train accuracy: ",train_accuracies[best_train_idx])
print("for train size",size[best_train_idx])
print("Corresponding Test accuracy: ",test_accuracies[best_train_idx])

In [None]:
# Report the sweep entry with the highest test score
best_test_idx=test_accuracies.index(max(test_accuracies))
print("Maximum Test accuracy: ",test_accuracies[best_test_idx])
print("for train size",size[best_test_idx])
print("Corresponding Train accuracy: ",train_accuracies[best_test_idx])

In [None]:
# Final split with train_size=0.65 (author's note: this gave the highest accuracy in the sweep)
# NOTE(review): the value is hard-coded - re-check it against the printed sweep results
Train, Test=train_test_split(ef,train_size=0.65,random_state=42)

In [None]:
# Separate Dependent (ef_score) and Independent variables.
# Select/drop instead of pop(): pop() removed the column from Train/Test in place,
# so running this cell a second time raised KeyError. drop() also returns new frames,
# leaving Train and Test untouched.
Y_train=Train['ef_score']
X_train=Train.drop(columns='ef_score')

Y_test=Test['ef_score']
X_test=Test.drop(columns='ef_score')

In [None]:
# Bring the independent features to a common scale with StandardScaler,
# fitted on the training features
scaler=StandardScaler()

feature_cols=X_train.columns
X_train[feature_cols]=scaler.fit_transform(X_train[feature_cols])
X_train.describe()

In [None]:
# Scale the test features with the statistics learned from the training features.
# transform(), not fit_transform(): re-fitting on the test split would scale it
# differently from the data the model is trained on.
X_test[X_test.columns]=scaler.transform(X_test[X_test.columns])
X_test.describe()

In [None]:
# Build the Linear Regression model: fit ef_score on the scaled training features
linear_model=LinearRegression()
linear_model.fit(X_train,Y_train)

In [None]:
# Check the R^2 score (goodness of fit) on the training data
linear_model.score(X_train,Y_train)

In [None]:
# Check the R^2 score (goodness of fit) on the test data
linear_model.score(X_test,Y_test)

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-square error of the fitted model on the training data
y_train_predict=linear_model.predict(X_train)
train_mse=mean_squared_error(Y_train,y_train_predict)
rms_err=np.sqrt(train_mse)
print(f"Train RMSE: {rms_err}")

# Root-mean-square error of the fitted model on the test data
y_test_predict=linear_model.predict(X_test)
test_mse=mean_squared_error(Y_test,y_test_predict)
rms_err=np.sqrt(test_mse)
print(f"Test RMSE: {rms_err}")

In [None]:
# Predicted vs. actual ef_score on the test data; the line is the diagonal
# spanning the range of the predictions
plt.figure(figsize=(10,10))
plt.scatter(Y_test,y_test_predict)
lowest, highest = min(y_test_predict), max(y_test_predict)
plt.plot([lowest,highest],[lowest,highest])
plt.xlabel('Actual')
plt.ylabel('Prediction')

In [None]:
# Equation Parameters: the fitted coefficients and the intercept of the linear model
print("Coeffecients:",linear_model.coef_)
print("Intercept",linear_model.intercept_)

In [None]:
# Residual diagnostics on the training data.
# 1) The distribution of the residuals should be normal - true (author's observation)
plt.figure(figsize=(10,5))
target_prediction = linear_model.predict(X_train)
residual = Y_train - target_prediction
# histplot with a KDE overlay on a density scale replaces the deprecated sns.distplot
sns.histplot(residual, kde=True, stat="density")
plt.xlabel('Residuals')
plt.title("Residual Analysis",fontsize=20)
plt.show()

# 2) Residuals must not be related to the predictions - true (author's observation)
sns.scatterplot(x=residual,y=target_prediction)
plt.xlabel('Residuals')
plt.ylabel('Predicted')
plt.title("Residual Analysis",fontsize=20)
plt.show()

In [None]:
# Model Equation: the intercept followed by one "coefficient * feature" term per column
terms=[str(weight)+' * '+str(feature) for weight, feature in zip(linear_model.coef_, X_train.columns)]
model=' + '.join([str(linear_model.intercept_)]+terms)
print("ef_score =",model)

## Human Freedom Data

### Based on scores

In [None]:
# Subset with the overall score (hf_score, the target) and the two aggregate scores pf_score and ef_score
hf=df[['hf_score','pf_score','ef_score']]
hf.head()

In [None]:
# Pairwise correlation between the three scores
hf.corr()

In [None]:
# Count missing values per column in the score subset
hf.isnull().sum()

In [None]:
# Sweep the training fraction from 0.65 to 0.85 (100 steps) and record the R^2 score
# of a linear regression on the train and test partitions for each fraction.
size=np.linspace(0.65, 0.85, 100)

scaler=StandardScaler()

train_accuracies=[]
test_accuracies=[]

for t_size in size:
    Train, Test=train_test_split(hf,train_size=t_size,random_state=42)

    # Target is hf_score; everything left in Train/Test after the pop is a feature
    Y_train=Train.pop('hf_score')
    X_train=Train
    Y_test=Test.pop('hf_score')
    X_test=Test

    # Fit the scaler on the training features only and reuse those statistics for the
    # test features. Re-fitting on the test split (fit_transform) scaled the two
    # partitions differently and let test-set statistics leak into the evaluation.
    X_train[X_train.columns]=scaler.fit_transform(X_train[X_train.columns])
    X_test[X_test.columns]=scaler.transform(X_test[X_test.columns])

    linear_model=LinearRegression()
    linear_model.fit(X_train,Y_train)
    train_accuracies.append(linear_model.score(X_train,Y_train))
    test_accuracies.append(linear_model.score(X_test,Y_test))

In [None]:
# Report the sweep entry with the highest training score
best_train_idx=train_accuracies.index(max(train_accuracies))
print("Maximum Train accuracy: ",train_accuracies[best_train_idx])
print("for train size",size[best_train_idx])
print("Corresponding Test accuracy: ",test_accuracies[best_train_idx])

In [None]:
# Report the sweep entry with the highest test score
best_test_idx=test_accuracies.index(max(test_accuracies))
print("Maximum Test accuracy: ",test_accuracies[best_test_idx])
print("for train size",size[best_test_idx])
print("Corresponding Train accuracy: ",train_accuracies[best_test_idx])

In [None]:
# Final split with train_size=0.65 (author's note: this gave the highest accuracy in the sweep)
# NOTE(review): the value is hard-coded - re-check it against the printed sweep results
Train, Test=train_test_split(hf,train_size=0.65,random_state=42)

In [None]:
# Separate Dependent (hf_score) and Independent variables.
# Select/drop instead of pop(): pop() removed the column from Train/Test in place,
# so running this cell a second time raised KeyError. drop() also returns new frames,
# leaving Train and Test untouched.
Y_train=Train['hf_score']
X_train=Train.drop(columns='hf_score')

Y_test=Test['hf_score']
X_test=Test.drop(columns='hf_score')

In [None]:
# Bring the independent features to a common scale with StandardScaler,
# fitted on the training features
scaler=StandardScaler()

feature_cols=X_train.columns
X_train[feature_cols]=scaler.fit_transform(X_train[feature_cols])
X_train.describe()

In [None]:
# Scale the test features with the statistics learned from the training features.
# transform(), not fit_transform(): re-fitting on the test split would scale it
# differently from the data the model is trained on.
X_test[X_test.columns]=scaler.transform(X_test[X_test.columns])
X_test.describe()

In [None]:
# Build the Linear Regression model: fit hf_score on the scaled pf_score and ef_score
linear_model=LinearRegression()
linear_model.fit(X_train,Y_train)

In [None]:
# Check the R^2 score (goodness of fit) on the training data
linear_model.score(X_train,Y_train)

In [None]:
# Check the R^2 score (goodness of fit) on the test data
linear_model.score(X_test,Y_test)

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-square error of the fitted model on the training data
y_train_predict=linear_model.predict(X_train)
train_mse=mean_squared_error(Y_train,y_train_predict)
rms_err=np.sqrt(train_mse)
print(f"Train RMSE: {rms_err}")

# Root-mean-square error of the fitted model on the test data
y_test_predict=linear_model.predict(X_test)
test_mse=mean_squared_error(Y_test,y_test_predict)
rms_err=np.sqrt(test_mse)
print(f"Test RMSE: {rms_err}")

In [None]:
# Predicted vs. actual hf_score on the test data; the line is the diagonal
# spanning the range of the predictions
plt.figure(figsize=(10,10))
plt.scatter(Y_test,y_test_predict)
lowest, highest = min(y_test_predict), max(y_test_predict)
plt.plot([lowest,highest],[lowest,highest])
plt.xlabel('Actual')
plt.ylabel('Prediction')

In [None]:
# Equation Parameters: the fitted coefficients and the intercept of the linear model
print("Coeffecients:",linear_model.coef_)
print("Intercept",linear_model.intercept_)

In [None]:
# Residual diagnostics on the training data.
# 1) The distribution of the residuals should be normal - false (author's observation)
plt.figure(figsize=(10,5))
target_prediction = linear_model.predict(X_train)
residual = Y_train - target_prediction
# histplot with a KDE overlay on a density scale replaces the deprecated sns.distplot
sns.histplot(residual, kde=True, stat="density")
plt.xlabel('Residuals')
plt.title("Residual Analysis",fontsize=20)
plt.show()

# 2) Residuals must not be related to the predictions - false (author's observation)
sns.scatterplot(x=residual,y=target_prediction)
plt.xlabel('Residuals')
plt.ylabel('Predicted')
plt.title("Residual Analysis",fontsize=20)
plt.show()

### Based on parameters

In [None]:
# Exclude the aggregate scores pf_score and ef_score so that hf_score is predicted
# from the individual indicators only. Filtering instead of list.remove() keeps the
# cell re-runnable: remove() raised ValueError once the names were already gone.
total_attr=[col for col in total_attr if col not in ('pf_score','ef_score')]

In [None]:
# Rebuild hf from the columns listed in total_attr (includes the target hf_score)
hf=df[total_attr]
hf.head()

In [None]:
# Count missing values per column in the indicator subset
hf.isnull().sum()

In [None]:
# Sweep the training fraction from 0.65 to 0.85 (100 steps) and record the R^2 score
# of a linear regression on the train and test partitions for each fraction.
size=np.linspace(0.65, 0.85, 100)

scaler=StandardScaler()

train_accuracies=[]
test_accuracies=[]

for t_size in size:
    Train, Test=train_test_split(hf,train_size=t_size,random_state=42)

    # Target is hf_score; everything left in Train/Test after the pop is a feature
    Y_train=Train.pop('hf_score')
    X_train=Train
    Y_test=Test.pop('hf_score')
    X_test=Test

    # Fit the scaler on the training features only and reuse those statistics for the
    # test features. Re-fitting on the test split (fit_transform) scaled the two
    # partitions differently and let test-set statistics leak into the evaluation.
    X_train[X_train.columns]=scaler.fit_transform(X_train[X_train.columns])
    X_test[X_test.columns]=scaler.transform(X_test[X_test.columns])

    linear_model=LinearRegression()
    linear_model.fit(X_train,Y_train)
    train_accuracies.append(linear_model.score(X_train,Y_train))
    test_accuracies.append(linear_model.score(X_test,Y_test))

In [None]:
# Report the sweep entry with the highest training score
best_train_idx=train_accuracies.index(max(train_accuracies))
print("Maximum Train accuracy: ",train_accuracies[best_train_idx])
print("for train size",size[best_train_idx])
print("Corresponding Test accuracy: ",test_accuracies[best_train_idx])

In [None]:
# Report the sweep entry with the highest test score
best_test_idx=test_accuracies.index(max(test_accuracies))
print("Maximum Test accuracy: ",test_accuracies[best_test_idx])
print("for train size",size[best_test_idx])
print("Corresponding Train accuracy: ",train_accuracies[best_test_idx])

In [None]:
# Final split with train_size=0.65 (author's note: this gave the highest accuracy in the sweep)
# NOTE(review): the value is hard-coded - re-check it against the printed sweep results
Train, Test=train_test_split(hf,train_size=0.65,random_state=42)

In [None]:
# Separate Dependent (hf_score) and Independent variables.
# Select/drop instead of pop(): pop() removed the column from Train/Test in place,
# so running this cell a second time raised KeyError. drop() also returns new frames,
# leaving Train and Test untouched.
Y_train=Train['hf_score']
X_train=Train.drop(columns='hf_score')

Y_test=Test['hf_score']
X_test=Test.drop(columns='hf_score')

In [None]:
# Bring the independent features to a common scale with StandardScaler,
# fitted on the training features
scaler=StandardScaler()

feature_cols=X_train.columns
X_train[feature_cols]=scaler.fit_transform(X_train[feature_cols])
X_train.describe()

In [None]:
# Scale the test features with the statistics learned from the training features.
# transform(), not fit_transform(): re-fitting on the test split would scale it
# differently from the data the model is trained on.
X_test[X_test.columns]=scaler.transform(X_test[X_test.columns])
X_test.describe()

In [None]:
# Build the Linear Regression model: fit hf_score on the scaled training features
linear_model=LinearRegression()
linear_model.fit(X_train,Y_train)

In [None]:
# Check the R^2 score (goodness of fit) on the training data
linear_model.score(X_train,Y_train)

In [None]:
# Check the R^2 score (goodness of fit) on the test data
linear_model.score(X_test,Y_test)

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-square error of the fitted model on the training data
y_train_predict=linear_model.predict(X_train)
train_mse=mean_squared_error(Y_train,y_train_predict)
rms_err=np.sqrt(train_mse)
print(f"Train RMSE: {rms_err}")

# Root-mean-square error of the fitted model on the test data
y_test_predict=linear_model.predict(X_test)
test_mse=mean_squared_error(Y_test,y_test_predict)
rms_err=np.sqrt(test_mse)
print(f"Test RMSE: {rms_err}")

In [None]:
# Predicted vs. actual hf_score on the test data; the line is the diagonal
# spanning the range of the predictions
plt.figure(figsize=(10,10))
plt.scatter(Y_test,y_test_predict)
lowest, highest = min(y_test_predict), max(y_test_predict)
plt.plot([lowest,highest],[lowest,highest])
plt.xlabel('Actual')
plt.ylabel('Prediction')

In [None]:
# Equation Parameters: the fitted coefficients and the intercept of the linear model
print("Coeffecients:",linear_model.coef_)
print("Intercept",linear_model.intercept_)

In [None]:
# Residual diagnostics on the training data.
# 1) The distribution of the residuals should be normal - true (author's observation)
plt.figure(figsize=(10,5))
target_prediction = linear_model.predict(X_train)
residual = Y_train - target_prediction
# histplot with a KDE overlay on a density scale replaces the deprecated sns.distplot
sns.histplot(residual, kde=True, stat="density")
plt.xlabel('Residuals')
plt.title("Residual Analysis",fontsize=20)
plt.show()

# 2) Residuals must not be related to the predictions - true (author's observation)
sns.scatterplot(x=residual,y=target_prediction)
plt.xlabel('Residuals')
plt.ylabel('Predicted')
plt.title("Residual Analysis",fontsize=20)
plt.show()

In [None]:
# Model Equation: the intercept followed by one "coefficient * feature" term per column
terms=[str(weight)+' * '+str(feature) for weight, feature in zip(linear_model.coef_, X_train.columns)]
model=' + '.join([str(linear_model.intercept_)]+terms)
print("hf_score =",model)