# Importing the Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# missing value treatment
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer

# normal or not normal distribution
from scipy.stats import kstest

# scaling
from sklearn.preprocessing import StandardScaler

# country to continent (as numeric classes)
import pycountry_convert as pc

# importing metrics
from sklearn.metrics import r2_score

# vif check
from statsmodels.stats.outliers_influence import variance_inflation_factor

# train-test split
from sklearn.model_selection import train_test_split

# hyperparameter tuning
from sklearn.model_selection import cross_val_score
import optuna

# for stacking regressor model
from mlxtend.regressor import StackingCVRegressor

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Read the Data

In [2]:
life_expectancy_data = pd.read_csv(r'C:\Users\Shahbaz\Desktop\socool\exam\regression\Life Expectancy Data.csv')
data = life_expectancy_data.copy()

In [3]:
data.head()

Unnamed: 0,Country,Year,Status,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling,Life expectancy
0,Afghanistan,2015,Developing,263.0,62,0.01,71.279624,65.0,1154,19.1,83,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1,65.0
1,Afghanistan,2014,Developing,271.0,64,0.01,73.523582,62.0,492,18.6,86,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0,59.9
2,Afghanistan,2013,Developing,268.0,66,0.01,73.219243,64.0,430,18.1,89,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9,59.9
3,Afghanistan,2012,Developing,272.0,69,0.01,78.184215,67.0,2787,17.6,93,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8,59.5
4,Afghanistan,2011,Developing,275.0,71,0.01,7.097109,68.0,3013,17.2,97,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5,59.2


In [4]:
def accurate_columns(data):
    new_columns = []
    for col in data.columns:
        col = col.strip()
        if col[0].islower():
            col = col.capitalize()
        new_columns.append(col)
    data.columns = new_columns
    return data

In [5]:
data = accurate_columns(data)

In [6]:
data.head()

Unnamed: 0,Country,Year,Status,Adult Mortality,Infant deaths,Alcohol,Percentage expenditure,Hepatitis B,Measles,BMI,Under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness 1-19 years,Thinness 5-9 years,Income composition of resources,Schooling,Life expectancy
0,Afghanistan,2015,Developing,263.0,62,0.01,71.279624,65.0,1154,19.1,83,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1,65.0
1,Afghanistan,2014,Developing,271.0,64,0.01,73.523582,62.0,492,18.6,86,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0,59.9
2,Afghanistan,2013,Developing,268.0,66,0.01,73.219243,64.0,430,18.1,89,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9,59.9
3,Afghanistan,2012,Developing,272.0,69,0.01,78.184215,67.0,2787,17.6,93,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8,59.5
4,Afghanistan,2011,Developing,275.0,71,0.01,7.097109,68.0,3013,17.2,97,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5,59.2


### Handling missing values

In [7]:
def fill_num_columns(data, target):
    data = data.dropna(subset=[target], axis=0)
    
    columns_with_nulls = data.isnull().sum()
    columns_with_nulls = columns_with_nulls[columns_with_nulls > 0]
    null_df = pd.DataFrame({
        'Column': columns_with_nulls.index,
        'Null Count': columns_with_nulls.values,
        'Dtype': data[columns_with_nulls.index].dtypes.values
    })
    
    # Set threshold for iterative imputation
    threshold = data.shape[0] * 0.1
    
    # Separate columns for mean imputation and iterative imputation
    columns_mean_impute = null_df.loc[null_df['Null Count'] <= threshold, 'Column'].tolist()
    columns_iterative_impute = null_df.loc[null_df['Null Count'] > threshold, 'Column'].tolist()
    
    # Numeric columns for imputation
    data_filled = data.copy()
    
    # Impute missing values with mean for columns with low missing counts
    mean_imputer = SimpleImputer(strategy='mean')
    data_filled.loc[:, columns_mean_impute] = mean_imputer.fit_transform(data_filled.loc[:, columns_mean_impute])
    
    # Impute missing values iteratively for columns with high missing counts
    iterative_imputer = IterativeImputer(random_state=42, max_iter=10)
    data_filled.loc[:, columns_iterative_impute] = iterative_imputer.fit_transform(data_filled.loc[:, columns_iterative_impute])
    
    return data_filled


### Outlier treatment

In [8]:
def outlier_treat(data):
    numeric_data = data.select_dtypes(exclude='object')

    q1 = numeric_data.quantile(0.25)
    q3 = numeric_data.quantile(0.75)
    IQR = q3 - q1

    lower = q1 - 1.5 * IQR
    upper = q3 + 1.5 * IQR

    for i in data.columns:
        if data[i].dtypes != object:
            data[i] = np.where(data[i]<lower[i], lower[i], data[i])
            data[i] = np.where(data[i]>upper[i], upper[i], data[i])
    return data

In [9]:
def show_outliers(data):
    for i in data.columns:
        if data[i].dtypes != object:
            sns.boxplot(x=data[i], data=data)
            plt.title(f'Boxplot of {i}')
            plt.show()

### Convert Country to Numeric
- there I found 2 methods but the second one worked better

In [10]:
# !pip install geonamescache

In [11]:
# !pip install pycountry_convert

In [12]:
def convert_country(df):
    country_to_continent = {}

    for country_name in df['Country'].unique():
        try:
            # Try to get ISO alpha-2 code for the country name
            country_alpha2 = pc.country_name_to_country_alpha2(country_name)
            # Get continent code based on the ISO alpha-2 code
            continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
            country_to_continent[country_name] = continent_code
            
        except (KeyError, ValueError) as e:
            for x in country_name.split():
                try:
                    country_alpha2 = pc.country_name_to_country_alpha2(x)
                    # Get continent code based on the ISO alpha-2 code
                    continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
                    country_to_continent[country_name] = continent_code
                except:
                    # print(f"Error mapping country '{country_name}' to continent: {e}")
                    country_to_continent[country_name] = 'unknown'
            
    # Map the Continent based on the updated dictionary
    df['Continent'] = df['Country'].map(country_to_continent)
    df = df.drop('Country', axis=1)
    return df

### Convert Categoric to Numeric

In [13]:
def cat_to_num(data):
    data = data.fillna('missing')
    data = pd.get_dummies(data, drop_first=True).astype(int)
    return data

### Creating new features

In [14]:
def feature_engineering(data):
    stat_methods = {
        'mean': np.mean,
        'median': np.median,
        'min': np.min,
        'max': np.max,
        'std': np.std,
        'var': np.var
    }
    
    # Select calculation columns (excluding grouping columns)
    calculation_cols = [x for x in data.columns if x not in ['Continent', 'Year', 'Status']]
    result_data = data.copy()
    column_groups = {
        ('Continent',): calculation_cols,
        ('Year',): calculation_cols,
        ('Status',): calculation_cols,
        ('Continent', 'Year'): calculation_cols,
        ('Continent', 'Status'): calculation_cols,
        ('Status', 'Year'): calculation_cols
    }
    
    for stat_name, stat_func in stat_methods.items():
        for group_cols, calc_cols in column_groups.items():
            print(f"Processing: {stat_name}, Grouping: {group_cols}")
            
            grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
            
            agg_col_names = [f'{col}_{stat_name}_by_' + '_'.join(group_cols) for col in calc_cols]
            grouped_data.columns = list(group_cols) + agg_col_names
            
            # Merge aggregated data back into the result_data dataframe on group_cols
            result_data = pd.merge(result_data, grouped_data, on=list(group_cols), how='left')
    
    return result_data


### Getting columns which is correlated with target

In [15]:
def target_correlation(data, target, threshold_normality=0.05, threshold_correlation=0.45):
    normal_dist = []
    not_normal_dist = []

    # Identify columns based on normality
    for col in data.columns:
        if data[col].dtype in ['int64', 'float64']:
            kstest_statistic, kstest_p_value = kstest(data[col], 'norm')
            if kstest_p_value > threshold_normality:
                normal_dist.append(col)
            else:
                not_normal_dist.append(col)

    # Calculate Spearman correlation with target for non-normally distributed columns
    corr_matrix = data.corr(method='spearman', numeric_only=True)
    variables_explaining_target = []

    for col in not_normal_dist:
        if col != target:
            spearman_corr = corr_matrix.loc[col, target]
            if abs(spearman_corr) > threshold_correlation:
                variables_explaining_target.append(col)

    # Include target, numeric columns explaining target, and non-numeric columns in the output
    result_cols = variables_explaining_target + [target] + list(data.select_dtypes(exclude='number').columns)
    
    return result_cols

# Intercorrelation check

In [16]:
def intercorrelation(data, target, threshold=0.5):
    normal_dist = []
    not_normal_dist = []

    for col in data.columns:
        if data[col].dtype in ['int64', 'float64']:
            kstest_statistic, kstest_p_value = kstest(data[col], 'norm')
            if kstest_p_value > threshold:
                normal_dist.append(col)
            else:
                not_normal_dist.append(col)
                
    for col in not_normal_dist:
        if col != target:
            corr_matrix = data.drop(columns=[target]).corr(method='spearman', numeric_only=True)
            
            low_correlated_variables = (np.abs(corr_matrix) < threshold) & (corr_matrix != 1.0)
            
            independent_variable_pairs = np.where(low_correlated_variables)
                
            not_intercorrelated_independent = pd.DataFrame({
                'Variable': corr_matrix.index[independent_variable_pairs[0]]
            })
    return not_intercorrelated_independent['Variable'].unique().tolist() + list(data.select_dtypes(exclude='number').columns)

### Scaling

In [17]:
def scale_data(data_train, data_test=None):
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(data_train)
    scaled_train_df = pd.DataFrame(scaled_train, columns=data_train.columns, index=data_train.index)

    if data_test is not None:
        scaled_test = scaler.transform(data_test)
        scaled_test_df = pd.DataFrame(scaled_test, columns=data_test.columns, index=data_test.index)
        return scaled_train_df, scaled_test_df
    else:
        return scaled_train_df


In [18]:
def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

def vif_selection(data, threshold=5.0):
    non_numeric_cols = data.select_dtypes(exclude='number').columns.tolist()
    X = data.select_dtypes(include='number').copy()
    features_to_keep = list(X.columns)
    vif_data = calculate_vif(X)
    
    while vif_data['VIF'].max() > threshold:
        max_vif_feature = vif_data.loc[vif_data['VIF'].idxmax(), 'Feature']
        
        X = X.drop(max_vif_feature, axis=1)
        features_to_keep.remove(max_vif_feature)
        vif_data = calculate_vif(X)
        
        print(f'Removed feature: {max_vif_feature} with VIF: {vif_data["VIF"].max()}')
        print(f'Remaining features: {len(X.columns)}')
    
    final_vif_data = pd.DataFrame({
        'Feature': X.columns,
        'VIF': vif_data['VIF']
    })
    
    final_features = final_vif_data['Feature'].tolist() + non_numeric_cols
    
    return final_features


# Train and Test split

In [20]:
# linear regression data:
lr_data = fill_num_columns(data, 'Life expectancy')
lr_data = outlier_treat(lr_data)
lr_data = convert_country(lr_data)
lr_data = feature_engineering(lr_data)
lr_y = lr_data['Life expectancy'] # for all target
lr_data = lr_data[target_correlation(lr_data, 'Life expectancy')]
lr_data = lr_data[intercorrelation(lr_data, 'Life expectancy')]
svr_data = lr_data.copy() # the same steps
# lr_data = lr_data[vif_selection(lr_data, 7)] # it take so much time that's why I didn't run
lr_data = cat_to_num(lr_data)
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(lr_data, lr_y, random_state=42, test_size = 0.2)
X_train_lr, X_test_lr = scale_data(X_train_lr, X_test_lr)

Processing: mean, Grouping: ('Continent',)
Processing: mean, Grouping: ('Year',)
Processing: mean, Grouping: ('Status',)
Processing: mean, Grouping: ('Continent', 'Year')
Processing: mean, Grouping: ('Continent', 'Status')
Processing: mean, Grouping: ('Status', 'Year')
Processing: median, Grouping: ('Continent',)
Processing: median, Grouping: ('Year',)
Processing: median, Grouping: ('Status',)
Processing: median, Grouping: ('Continent', 'Year')
Processing: median, Grouping: ('Continent', 'Status')
Processing: median, Grouping: ('Status', 'Year')
Processing: min, Grouping: ('Continent',)
Processing: min, Grouping: ('Year',)
Processing: min, Grouping: ('Status',)
Processing: min, Grouping: ('Continent', 'Year')
Processing: min, Grouping: ('Continent', 'Status')
Processing: min, Grouping: ('Status', 'Year')
Processing: max, Grouping: ('Continent',)
Processing: max, Grouping: ('Year',)


  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(li

Processing: max, Grouping: ('Status',)
Processing: max, Grouping: ('Continent', 'Year')
Processing: max, Grouping: ('Continent', 'Status')
Processing: max, Grouping: ('Status', 'Year')
Processing: std, Grouping: ('Continent',)
Processing: std, Grouping: ('Year',)
Processing: std, Grouping: ('Status',)
Processing: std, Grouping: ('Continent', 'Year')
Processing: std, Grouping: ('Continent', 'Status')
Processing: std, Grouping: ('Status', 'Year')
Processing: var, Grouping: ('Continent',)
Processing: var, Grouping: ('Year',)


  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(li

Processing: var, Grouping: ('Status',)
Processing: var, Grouping: ('Continent', 'Year')
Processing: var, Grouping: ('Continent', 'Status')
Processing: var, Grouping: ('Status', 'Year')


  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()


In [21]:
X_train_lr.head()

Unnamed: 0,Adult Mortality,Infant deaths,BMI,Under-five deaths,Polio,Diphtheria,HIV/AIDS,GDP,Thinness 1-19 years,Thinness 5-9 years,Income composition of resources,Schooling,Adult Mortality_mean_by_Continent,Infant deaths_mean_by_Continent,Percentage expenditure_mean_by_Continent,Hepatitis B_mean_by_Continent,Measles_mean_by_Continent,BMI_mean_by_Continent,Under-five deaths_mean_by_Continent,Polio_mean_by_Continent,Diphtheria_mean_by_Continent,HIV/AIDS_mean_by_Continent,GDP_mean_by_Continent,Thinness 1-19 years_mean_by_Continent,Thinness 5-9 years_mean_by_Continent,Income composition of resources_mean_by_Continent,Schooling_mean_by_Continent,Life expectancy_mean_by_Continent,Adult Mortality_mean_by_Status,Infant deaths_mean_by_Status,Alcohol_mean_by_Status,Percentage expenditure_mean_by_Status,Hepatitis B_mean_by_Status,Measles_mean_by_Status,BMI_mean_by_Status,Under-five deaths_mean_by_Status,Polio_mean_by_Status,Total expenditure_mean_by_Status,Diphtheria_mean_by_Status,HIV/AIDS_mean_by_Status,GDP_mean_by_Status,Population_mean_by_Status,Thinness 1-19 years_mean_by_Status,Thinness 5-9 years_mean_by_Status,Income composition of resources_mean_by_Status,Schooling_mean_by_Status,Life expectancy_mean_by_Status,Adult Mortality_mean_by_Continent_Year,Infant deaths_mean_by_Continent_Year,Percentage expenditure_mean_by_Continent_Year,Hepatitis B_mean_by_Continent_Year,BMI_mean_by_Continent_Year,Under-five deaths_mean_by_Continent_Year,Polio_mean_by_Continent_Year,Diphtheria_mean_by_Continent_Year,HIV/AIDS_mean_by_Continent_Year,GDP_mean_by_Continent_Year,Thinness 1-19 years_mean_by_Continent_Year,Thinness 5-9 years_mean_by_Continent_Year,Income composition of resources_mean_by_Continent_Year,Schooling_mean_by_Continent_Year,Life expectancy_mean_by_Continent_Year,Adult Mortality_mean_by_Continent_Status,Infant deaths_mean_by_Continent_Status,Alcohol_mean_by_Continent_Status,Percentage expenditure_mean_by_Continent_Status,Hepatitis B_mean_by_Continent_Status,BMI_mean_by_Continent_Status,Under-five deaths_mean_by_Continent_Status,Polio_mean_by_Continent_Status,Total expenditure_mean_by_Continent_Status,Diphtheria_mean_by_Continent_Status,HIV/AIDS_mean_by_Continent_Status,GDP_mean_by_Continent_Status,Thinness 1-19 years_mean_by_Continent_Status,Thinness 5-9 years_mean_by_Continent_Status,Income composition of resources_mean_by_Continent_Status,Schooling_mean_by_Continent_Status,Life expectancy_mean_by_Continent_Status,Adult Mortality_mean_by_Status_Year,Infant deaths_mean_by_Status_Year,BMI_mean_by_Status_Year,Under-five deaths_mean_by_Status_Year,Diphtheria_mean_by_Status_Year,HIV/AIDS_mean_by_Status_Year,GDP_mean_by_Status_Year,Thinness 1-19 years_mean_by_Status_Year,Thinness 5-9 years_mean_by_Status_Year,Income composition of resources_mean_by_Status_Year,Schooling_mean_by_Status_Year,Life expectancy_mean_by_Status_Year,Adult Mortality_median_by_Continent,Infant deaths_median_by_Continent,Percentage expenditure_median_by_Continent,Hepatitis B_median_by_Continent,Measles_median_by_Continent,BMI_median_by_Continent,Under-five deaths_median_by_Continent,Polio_median_by_Continent,Diphtheria_median_by_Continent,HIV/AIDS_median_by_Continent,GDP_median_by_Continent,Thinness 1-19 years_median_by_Continent,Thinness 5-9 years_median_by_Continent,Income composition of resources_median_by_Continent,Schooling_median_by_Continent,Life expectancy_median_by_Continent,Adult Mortality_median_by_Status,Infant deaths_median_by_Status,Alcohol_median_by_Status,Percentage expenditure_median_by_Status,Hepatitis B_median_by_Status,Measles_median_by_Status,BMI_median_by_Status,Under-five deaths_median_by_Status,Polio_median_by_Status,Total expenditure_median_by_Status,Diphtheria_median_by_Status,GDP_median_by_Status,Population_median_by_Status,Thinness 1-19 years_median_by_Status,Thinness 5-9 years_median_by_Status,Income composition of resources_median_by_Status,Schooling_median_by_Status,Life expectancy_median_by_Status,Adult Mortality_median_by_Continent_Year,Infant deaths_median_by_Continent_Year,Percentage expenditure_median_by_Continent_Year,Hepatitis B_median_by_Continent_Year,Measles_median_by_Continent_Year,BMI_median_by_Continent_Year,Under-five deaths_median_by_Continent_Year,Polio_median_by_Continent_Year,Diphtheria_median_by_Continent_Year,HIV/AIDS_median_by_Continent_Year,GDP_median_by_Continent_Year,Thinness 1-19 years_median_by_Continent_Year,Thinness 5-9 years_median_by_Continent_Year,Income composition of resources_median_by_Continent_Year,Schooling_median_by_Continent_Year,Life expectancy_median_by_Continent_Year,Adult Mortality_median_by_Continent_Status,Infant deaths_median_by_Continent_Status,Alcohol_median_by_Continent_Status,Percentage expenditure_median_by_Continent_Status,Measles_median_by_Continent_Status,BMI_median_by_Continent_Status,Under-five deaths_median_by_Continent_Status,Polio_median_by_Continent_Status,Total expenditure_median_by_Continent_Status,Diphtheria_median_by_Continent_Status,HIV/AIDS_median_by_Continent_Status,GDP_median_by_Continent_Status,Thinness 1-19 years_median_by_Continent_Status,Thinness 5-9 years_median_by_Continent_Status,Income composition of resources_median_by_Continent_Status,Schooling_median_by_Continent_Status,Life expectancy_median_by_Continent_Status,Adult Mortality_median_by_Status_Year,Infant deaths_median_by_Status_Year,Under-five deaths_median_by_Status_Year,Polio_median_by_Status_Year,Total expenditure_median_by_Status_Year,Diphtheria_median_by_Status_Year,GDP_median_by_Status_Year,Income composition of resources_median_by_Status_Year,Schooling_median_by_Status_Year,Life expectancy_median_by_Status_Year,Thinness 1-19 years_min_by_Continent,Thinness 5-9 years_min_by_Continent,Life expectancy_min_by_Continent,BMI_min_by_Status,Total expenditure_min_by_Status,GDP_min_by_Status,Population_min_by_Status,Thinness 1-19 years_min_by_Status,Thinness 5-9 years_min_by_Status,Income composition of resources_min_by_Status,Schooling_min_by_Status,Life expectancy_min_by_Status,BMI_min_by_Continent_Year,Thinness 1-19 years_min_by_Continent_Year,Thinness 5-9 years_min_by_Continent_Year,Schooling_min_by_Continent_Year,Life expectancy_min_by_Continent_Year,BMI_min_by_Continent_Status,Thinness 1-19 years_min_by_Continent_Status,Income composition of resources_min_by_Continent_Status,Schooling_min_by_Continent_Status,Life expectancy_min_by_Continent_Status,Alcohol_min_by_Status_Year,Thinness 1-19 years_min_by_Status_Year,Thinness 5-9 years_min_by_Status_Year,Income composition of resources_min_by_Status_Year,Schooling_min_by_Status_Year,Life expectancy_min_by_Status_Year,Alcohol_max_by_Continent,Under-five deaths_max_by_Continent,HIV/AIDS_max_by_Continent,Thinness 1-19 years_max_by_Continent,Thinness 5-9 years_max_by_Continent,Income composition of resources_max_by_Continent,Schooling_max_by_Continent,Life expectancy_max_by_Continent,Adult Mortality_max_by_Status,Infant deaths_max_by_Status,Alcohol_max_by_Status,BMI_max_by_Status,Under-five deaths_max_by_Status,HIV/AIDS_max_by_Status,Thinness 1-19 years_max_by_Status,Thinness 5-9 years_max_by_Status,Income composition of resources_max_by_Status,Schooling_max_by_Status,Adult Mortality_max_by_Continent_Year,Infant deaths_max_by_Continent_Year,BMI_max_by_Continent_Year,Under-five deaths_max_by_Continent_Year,HIV/AIDS_max_by_Continent_Year,Thinness 5-9 years_max_by_Continent_Year,Income composition of resources_max_by_Continent_Year,Schooling_max_by_Continent_Year,Life expectancy_max_by_Continent_Year,Adult Mortality_max_by_Continent_Status,Infant deaths_max_by_Continent_Status,Under-five deaths_max_by_Continent_Status,HIV/AIDS_max_by_Continent_Status,Thinness 1-19 years_max_by_Continent_Status,Thinness 5-9 years_max_by_Continent_Status,Income composition of resources_max_by_Continent_Status,Schooling_max_by_Continent_Status,Life expectancy_max_by_Continent_Status,Adult Mortality_max_by_Status_Year,Infant deaths_max_by_Status_Year,Under-five deaths_max_by_Status_Year,HIV/AIDS_max_by_Status_Year,Thinness 1-19 years_max_by_Status_Year,Thinness 5-9 years_max_by_Status_Year,Income composition of resources_max_by_Status_Year,Adult Mortality_std_by_Continent,Infant deaths_std_by_Continent,Percentage expenditure_std_by_Continent,Under-five deaths_std_by_Continent,Polio_std_by_Continent,Diphtheria_std_by_Continent,HIV/AIDS_std_by_Continent,GDP_std_by_Continent,Schooling_std_by_Continent,Life expectancy_std_by_Continent,Adult Mortality_std_by_Status,Infant deaths_std_by_Status,Alcohol_std_by_Status,Percentage expenditure_std_by_Status,Hepatitis B_std_by_Status,Measles_std_by_Status,BMI_std_by_Status,Under-five deaths_std_by_Status,Polio_std_by_Status,Total expenditure_std_by_Status,Diphtheria_std_by_Status,HIV/AIDS_std_by_Status,GDP_std_by_Status,Population_std_by_Status,Thinness 1-19 years_std_by_Status,Thinness 5-9 years_std_by_Status,Income composition of resources_std_by_Status,Schooling_std_by_Status,Life expectancy_std_by_Status,Adult Mortality_std_by_Continent_Year,Infant deaths_std_by_Continent_Year,Percentage expenditure_std_by_Continent_Year,Under-five deaths_std_by_Continent_Year,Polio_std_by_Continent_Year,Diphtheria_std_by_Continent_Year,HIV/AIDS_std_by_Continent_Year,GDP_std_by_Continent_Year,Schooling_std_by_Continent_Year,Life expectancy_std_by_Continent_Year,Adult Mortality_std_by_Continent_Status,Infant deaths_std_by_Continent_Status,Percentage expenditure_std_by_Continent_Status,Under-five deaths_std_by_Continent_Status,Polio_std_by_Continent_Status,Diphtheria_std_by_Continent_Status,HIV/AIDS_std_by_Continent_Status,GDP_std_by_Continent_Status,Thinness 1-19 years_std_by_Continent_Status,Thinness 5-9 years_std_by_Continent_Status,Schooling_std_by_Continent_Status,Life expectancy_std_by_Continent_Status,Adult Mortality_std_by_Status_Year,Infant deaths_std_by_Status_Year,Under-five deaths_std_by_Status_Year,HIV/AIDS_std_by_Status_Year,Thinness 1-19 years_std_by_Status_Year,Thinness 5-9 years_std_by_Status_Year,Income composition of resources_std_by_Status_Year,Schooling_std_by_Status_Year,Life expectancy_std_by_Status_Year,Adult Mortality_var_by_Continent,Infant deaths_var_by_Continent,Percentage expenditure_var_by_Continent,Under-five deaths_var_by_Continent,Polio_var_by_Continent,Diphtheria_var_by_Continent,HIV/AIDS_var_by_Continent,GDP_var_by_Continent,Schooling_var_by_Continent,Life expectancy_var_by_Continent,Adult Mortality_var_by_Status,Infant deaths_var_by_Status,Alcohol_var_by_Status,Percentage expenditure_var_by_Status,Hepatitis B_var_by_Status,Measles_var_by_Status,BMI_var_by_Status,Under-five deaths_var_by_Status,Polio_var_by_Status,Total expenditure_var_by_Status,Diphtheria_var_by_Status,HIV/AIDS_var_by_Status,GDP_var_by_Status,Population_var_by_Status,Thinness 1-19 years_var_by_Status,Thinness 5-9 years_var_by_Status,Income composition of resources_var_by_Status,Schooling_var_by_Status,Life expectancy_var_by_Status,Adult Mortality_var_by_Continent_Year,Infant deaths_var_by_Continent_Year,Percentage expenditure_var_by_Continent_Year,Under-five deaths_var_by_Continent_Year,Polio_var_by_Continent_Year,Diphtheria_var_by_Continent_Year,HIV/AIDS_var_by_Continent_Year,GDP_var_by_Continent_Year,Schooling_var_by_Continent_Year,Life expectancy_var_by_Continent_Year,Adult Mortality_var_by_Continent_Status,Infant deaths_var_by_Continent_Status,Percentage expenditure_var_by_Continent_Status,Under-five deaths_var_by_Continent_Status,Polio_var_by_Continent_Status,Diphtheria_var_by_Continent_Status,HIV/AIDS_var_by_Continent_Status,GDP_var_by_Continent_Status,Thinness 1-19 years_var_by_Continent_Status,Thinness 5-9 years_var_by_Continent_Status,Schooling_var_by_Continent_Status,Life expectancy_var_by_Continent_Status,Adult Mortality_var_by_Status_Year,Infant deaths_var_by_Status_Year,Under-five deaths_var_by_Status_Year,HIV/AIDS_var_by_Status_Year,Thinness 1-19 years_var_by_Status_Year,Thinness 5-9 years_var_by_Status_Year,Income composition of resources_var_by_Status_Year,Schooling_var_by_Status_Year,Life expectancy_var_by_Status_Year,Status_Developing,Continent_AS,Continent_EU,Continent_NA,Continent_OC,Continent_SA,Continent_unknown
2259,-0.322716,-0.664027,1.022144,-0.677033,0.4975,0.378571,-0.557952,0.106161,-0.575131,-0.577391,0.0,0.780158,-1.009534,-1.221321,1.599007,0.864524,-0.660611,1.228272,-1.187609,1.316923,1.212033,-0.650264,1.663301,-1.236331,-1.23546,0.0,1.38301,1.127284,0.457259,0.457259,-0.457259,-0.457259,-0.457259,0.457259,-0.457259,0.457259,-0.457259,-0.457259,-0.457259,0.0,-0.457259,0.457259,0.457259,0.457259,0.0,-0.457259,-0.457259,-1.16483,-1.296108,2.144431,0.950197,1.012069,-1.176976,1.322026,1.315811,-0.650264,2.243988,-1.25391,-1.218587,0.0,1.615582,1.369186,-0.548516,-1.067224,1.114823,0.504621,0.275935,1.198934,-1.076384,1.031597,0.578075,1.030822,-0.650264,0.547482,-0.871007,-0.871007,0.0,1.081281,0.714915,0.063265,0.331052,-0.097636,0.320463,0.258804,0.0,0.114235,0.457259,0.457259,0.0,0.219338,-0.150506,-1.142265,-0.936679,1.804087,0.58985,-0.989987,1.247217,-0.860364,0.923437,0.975287,-0.650264,1.564593,-1.06336,-1.06336,0.0,1.599306,1.024193,0.457259,0.457259,-0.457259,-0.457259,-0.457259,0.457259,-0.457259,0.457259,-0.457259,-0.457259,-0.457259,-0.457259,0.457259,0.457259,0.457259,0.0,-0.457259,-0.457259,-1.16719,-0.933501,2.644484,1.103297,-0.513463,1.288985,-0.860133,0.807997,0.813404,-0.650264,2.752425,-1.078286,-1.059242,0.0,1.62083,1.265909,-0.714971,-0.955548,1.246427,-0.110143,-0.995729,1.166833,-0.822767,1.052743,0.583747,0.942391,-0.650264,0.433622,-0.782591,-0.720856,0.0,1.095145,0.671458,0.091645,0.086084,-0.014711,0.566209,-0.426975,0.643534,0.147055,0.0,0.105197,-0.358859,-0.650264,-1.066237,1.322448,-0.457259,-0.457259,-0.457259,-0.457259,0.0,0.0,0.0,-0.457259,-0.457259,-0.057073,-0.694463,-1.110226,1.577412,1.382806,1.329565,-1.035648,0.0,-0.561188,0.932068,-0.410736,0.0,0.0,0.0,-0.457259,-0.139286,1.5631,-1.506173,0.181888,-1.253273,-1.298624,0.0,1.581238,0.977497,0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,0.0,-0.457259,-0.918342,-1.588365,0.875776,-1.581806,-1.164533,-1.264641,0.0,1.298485,1.456109,-0.248354,-0.942982,-1.001028,0.507728,-1.311791,-1.350671,0.0,1.056056,1.28194,0.454901,0.45595,0.456146,0.457259,0.456669,0.456401,0.0,-0.758784,-1.486211,1.204581,-1.437893,-1.726927,-1.495897,0.0,1.268699,0.100605,-0.909368,0.457259,0.457259,0.457259,-0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,0.0,0.457259,0.0,-0.457259,0.457259,0.457259,0.457259,0.0,0.457259,0.457259,-1.020029,-1.511443,0.831998,-1.517735,-2.084801,-2.098281,0.0,0.677758,-0.942012,-0.907228,-0.123777,-1.135524,0.906328,-1.117843,-0.789168,-0.47937,0.0,0.973712,-1.232552,-1.232552,0.56441,-0.057468,0.1331,0.360927,0.339734,0.0,0.115177,0.115177,0.0,0.081997,-0.188336,-0.743834,-1.311587,1.432997,-1.277734,-1.58346,-1.348834,0.0,1.38474,-0.509891,-0.829723,0.457259,0.457259,0.457259,-0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,-0.457259,0.457259,0.0,-0.457259,0.0,0.457259,0.457259,0.0,0.457259,0.457259,-0.871594,-1.331627,0.879144,-1.291657,-1.689134,-1.75159,0.0,0.599881,-0.927344,-0.966706,-0.28478,-1.182053,0.938621,-1.169337,-0.926053,-0.573116,0.0,0.991431,-0.879825,-0.880882,0.270291,-0.436995,0.015065,0.312431,0.283479,0.0,0.24594,0.205386,0.0,-0.190246,-0.16633,0.457259,-0.554003,1.926726,-0.360227,-0.233409,-0.24635,-0.181888
1677,0.135555,-0.71607,-0.58138,-0.716555,0.18476,0.190928,-0.557952,-0.199082,0.682532,0.675388,0.0,0.140519,1.502986,1.291535,-1.051502,-1.302602,0.913211,-1.273806,1.317881,-1.354505,-1.351663,1.537837,-1.134948,1.086563,0.926787,0.0,-1.283719,-1.470163,0.457259,0.457259,-0.457259,-0.457259,-0.457259,0.457259,-0.457259,0.457259,-0.457259,-0.457259,-0.457259,0.0,-0.457259,0.457259,0.457259,0.457259,0.0,-0.457259,-0.457259,1.558386,1.358147,-1.117368,-1.217597,-1.388493,1.456307,-1.945923,-2.018314,1.537837,-1.282261,1.161435,1.083279,0.0,-1.658318,-1.840925,1.452646,1.247696,-0.621389,-0.888143,-1.153167,-1.260642,1.290454,-1.339765,-0.337308,-1.308104,1.537837,-0.950293,0.951632,0.951632,0.0,-1.151053,-1.416802,0.60047,0.686786,-0.701365,0.715893,-1.154062,0.0,-0.768782,0.457259,0.457259,0.0,-0.975289,-0.772537,1.469762,1.485802,-0.773497,-1.401724,1.097358,-1.209225,1.510764,-1.431436,-1.389687,1.537837,-1.055981,1.227004,1.227004,0.0,-1.170407,-1.493425,0.457259,0.457259,-0.457259,-0.457259,-0.457259,0.457259,-0.457259,0.457259,-0.457259,-0.457259,-0.457259,-0.457259,0.457259,0.457259,0.457259,0.0,-0.457259,-0.457259,1.577633,1.588368,-0.746684,-1.176011,3.504329,-1.331276,1.833126,-1.986347,-2.281368,1.537837,-1.098317,1.294787,1.283662,0.0,-1.732089,-1.847001,1.444327,1.450271,-0.758078,-0.619546,1.019259,-1.208172,1.494341,-1.387734,-0.305158,-1.297588,1.537837,-0.847141,1.237722,1.251393,0.0,-1.136305,-1.448056,0.593447,0.942174,0.674358,-1.481536,-0.426975,-1.654351,-0.722441,0.0,-0.546581,-0.825387,1.537837,0.937878,-1.074559,-0.457259,-0.457259,-0.457259,-0.457259,0.0,0.0,0.0,-0.457259,-0.457259,-0.425806,1.439962,0.900717,-0.977933,-1.582448,-0.611733,0.965579,0.0,-0.561188,-1.016953,-0.410736,0.0,0.0,0.0,-0.457259,-0.640684,-0.693854,0.66688,0.181888,0.902343,0.895588,0.0,-0.712275,-1.488356,0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,0.0,-0.457259,1.422418,0.767161,-1.768569,0.787867,0.858713,1.180088,0.0,-0.565285,-0.836537,1.064413,0.649799,0.676517,0.507728,0.919707,0.911906,0.0,-0.512788,-1.248233,0.454901,0.45595,0.456146,0.457259,0.456669,0.456401,0.0,1.510746,0.910883,-1.133275,0.9588,1.034364,1.198789,0.0,-1.226632,0.100605,1.35551,0.457259,0.457259,0.457259,-0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,0.0,0.457259,0.0,-0.457259,0.457259,0.457259,0.457259,0.0,0.457259,0.457259,1.761482,0.906018,-1.395761,1.049197,1.193843,0.99105,0.0,-1.303954,0.523896,1.782301,1.431317,0.936512,-0.975252,0.975164,0.994593,1.026242,0.0,-0.944745,0.12939,0.12939,0.56441,1.330056,0.73474,0.530358,0.588377,0.0,0.842254,0.842254,0.0,0.081997,0.786431,1.523812,1.005134,-1.206554,1.068444,1.191949,1.287729,0.0,-1.179214,0.678727,1.424321,0.457259,0.457259,0.457259,-0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,-0.457259,0.457259,0.0,-0.457259,0.0,0.457259,0.457259,0.0,0.457259,0.457259,1.861898,1.051578,-1.358203,1.151274,1.264733,1.166058,0.0,-1.21132,0.948705,1.854762,1.497009,1.008092,-1.058174,1.067955,1.136654,1.230898,0.0,-0.988155,-0.11823,-0.124364,1.292943,1.478621,0.827309,0.608766,0.650792,0.0,0.752529,0.865211,0.0,1.101219,1.004353,0.457259,-0.554003,-0.519015,-0.360227,-0.233409,-0.24635,-0.181888
2775,1.838937,2.146298,-0.932151,2.049966,0.247308,0.065833,1.79227,0.435257,0.682532,0.675388,0.0,-0.179301,1.502986,1.291535,-1.051502,-1.302602,0.913211,-1.273806,1.317881,-1.354505,-1.351663,1.537837,-1.134948,1.086563,0.926787,0.0,-1.283719,-1.470163,0.457259,0.457259,-0.457259,-0.457259,-0.457259,0.457259,-0.457259,0.457259,-0.457259,-0.457259,-0.457259,0.0,-0.457259,0.457259,0.457259,0.457259,0.0,-0.457259,-0.457259,1.789167,1.259841,-0.781812,-0.735865,-1.140159,1.313967,-0.769461,-0.607723,1.537837,-0.955336,1.161435,1.083279,0.0,-1.190618,-1.305906,1.452646,1.247696,-0.621389,-0.888143,-1.153167,-1.260642,1.290454,-1.339765,-0.337308,-1.308104,1.537837,-0.950293,0.951632,0.951632,0.0,-1.151053,-1.416802,0.779539,0.508919,-0.550432,0.452273,0.056966,0.0,-0.370754,0.457259,0.457259,0.0,-0.377976,-0.357849,1.469762,1.485802,-0.773497,-1.401724,1.097358,-1.209225,1.510764,-1.431436,-1.389687,1.537837,-1.055981,1.227004,1.227004,0.0,-1.170407,-1.493425,0.457259,0.457259,-0.457259,-0.457259,-0.457259,0.457259,-0.457259,0.457259,-0.457259,-0.457259,-0.457259,-0.457259,0.457259,0.457259,0.457259,0.0,-0.457259,-0.457259,1.551738,1.487493,-0.595875,-1.013203,-0.42378,-1.134757,1.457322,-0.868609,-0.680624,1.537837,-0.921855,1.294787,1.283662,0.0,-1.253101,-1.385829,1.444327,1.450271,-0.758078,-0.619546,1.019259,-1.208172,1.494341,-1.387734,-0.305158,-1.297588,1.537837,-0.847141,1.237722,1.251393,0.0,-1.136305,-1.448056,0.56557,0.514129,0.329823,0.273674,-0.426975,0.132893,-0.530921,0.0,-0.546581,-0.592123,1.537837,0.937878,-1.074559,-0.457259,-0.457259,-0.457259,-0.457259,0.0,0.0,0.0,-0.457259,-0.457259,-0.425806,1.439962,0.900717,-0.977933,-1.345228,-0.611733,0.965579,0.0,-0.561188,-1.016953,-0.410736,0.0,0.0,0.0,-0.457259,-0.440125,-0.693854,0.66688,0.181888,0.902343,0.895588,0.0,-0.712275,-1.488356,0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,0.0,-0.457259,1.422418,0.767161,-1.013042,0.787867,0.858713,0.365178,0.0,-1.186542,-1.461805,1.064413,0.649799,0.676517,0.507728,0.919707,0.911906,0.0,-0.512788,-1.248233,0.454901,0.45595,0.456146,0.457259,0.456669,0.456401,0.0,1.510746,0.910883,-1.133275,0.9588,1.034364,1.198789,0.0,-1.226632,0.100605,1.35551,0.457259,0.457259,0.457259,-0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,0.0,0.457259,0.0,-0.457259,0.457259,0.457259,0.457259,0.0,0.457259,0.457259,1.029506,0.906018,-0.632158,0.954125,0.491277,0.278127,0.0,-0.584419,0.523896,1.109919,1.431317,0.936512,-0.975252,0.975164,0.994593,1.026242,0.0,-0.944745,0.12939,0.12939,0.56441,1.330056,0.239272,0.530358,0.464056,0.0,0.115177,0.115177,0.0,0.081997,0.299048,1.523812,1.005134,-1.206554,1.068444,1.191949,1.287729,0.0,-1.179214,0.678727,1.424321,0.457259,0.457259,0.457259,-0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,-0.457259,0.457259,0.0,-0.457259,0.0,0.457259,0.457259,0.0,0.457259,0.457259,0.905349,1.006135,-0.890581,1.054958,0.49248,0.285819,0.0,-0.707097,0.323355,1.048628,1.497009,1.008092,-1.058174,1.067955,1.136654,1.230898,0.0,-0.988155,-0.11823,-0.124364,1.292943,1.478621,0.130719,0.457153,0.447598,0.0,0.414803,0.370342,0.0,-0.190246,0.345844,0.457259,-0.554003,-0.519015,-0.360227,-0.233409,-0.24635,-0.181888
2503,-0.876099,-0.71607,0.921924,-0.716555,0.81024,0.816405,-0.557952,2.295768,-0.826664,-0.827947,0.0,1.099978,-1.009534,-1.221321,1.599007,0.864524,-0.660611,1.228272,-1.187609,1.316923,1.212033,-0.650264,1.663301,-1.236331,-1.23546,0.0,1.38301,1.127284,-2.186942,-2.186942,2.186942,2.186942,2.186942,-2.186942,2.186942,-2.186942,2.186942,2.186942,2.186942,0.0,2.186942,-2.186942,-2.186942,-2.186942,0.0,2.186942,2.186942,-0.918663,-1.197802,1.606555,1.672794,1.508737,-1.176976,1.322026,0.931104,-0.650264,2.240146,-1.25391,-1.218587,0.0,1.615582,1.101677,-1.205385,-1.260134,1.809308,1.787562,1.133396,1.283747,-1.215609,1.310581,1.493458,1.30599,-0.650264,1.827043,-1.235534,-1.235534,0.0,1.527748,1.247844,-2.162301,-2.159088,2.619146,-2.183923,1.873509,0.0,3.079335,-2.186942,-2.186942,0.0,2.011279,1.922931,-1.142265,-0.936679,1.804087,0.58985,-0.989987,1.247217,-0.860364,0.923437,0.975287,-0.650264,1.564593,-1.06336,-1.06336,0.0,1.599306,1.024193,-2.186942,-2.186942,2.186942,2.186942,2.186942,-2.186942,2.186942,-2.186942,2.186942,2.186942,2.186942,2.186942,-2.186942,-2.186942,-2.186942,0.0,2.186942,2.186942,-1.037717,-0.933501,1.939349,0.940489,-0.535884,1.223479,-0.860133,0.807997,0.813404,-0.650264,2.777288,-1.078286,-1.059242,0.0,1.62083,1.035323,-1.231613,-0.955548,1.819143,1.991144,-1.010655,1.306539,-0.883743,0.924297,1.472651,0.942391,-0.650264,1.320568,-1.186653,-1.115306,0.0,1.541434,1.170168,-2.277979,-2.05414,-2.081919,1.443814,1.593176,1.409496,4.112074,0.0,2.060532,1.973777,-0.650264,-1.066237,1.322448,2.186942,2.186942,2.186942,2.186942,0.0,0.0,0.0,2.186942,2.186942,-0.057073,-0.694463,-1.110226,1.577412,1.145586,1.329565,-1.035648,0.0,1.723419,1.419323,1.71276,0.0,0.0,0.0,2.186942,2.066861,1.5631,-1.506173,0.181888,-1.253273,-1.298624,0.0,1.581238,0.977497,-2.186942,-2.186942,-2.186942,-2.186942,-2.186942,-2.186942,-2.186942,-2.186942,0.0,2.186942,-0.48604,-1.473461,0.498012,-1.49404,-1.164533,-1.264641,0.0,1.298485,1.664532,-1.222984,-1.811771,-1.801674,-1.96956,-1.125833,-1.162123,0.0,1.840478,1.28194,-1.898451,-2.100863,-2.160411,-2.186942,-2.127751,-2.193197,0.0,-0.758784,-1.486211,1.204581,-1.437893,-1.726927,-1.495897,0.0,1.268699,0.100605,-0.909368,-2.186942,-2.186942,-2.186942,2.186942,-2.186942,-2.186942,-2.186942,-2.186942,-2.186942,0.0,-2.186942,0.0,2.186942,-2.186942,-2.186942,-2.186942,0.0,-2.186942,-2.186942,-0.463727,-1.511443,1.238319,-1.422663,-2.084801,-0.434795,0.0,1.307273,-0.942012,-0.907228,-1.067942,-1.501178,1.370279,-1.481844,-1.808459,-1.734047,0.0,1.346579,-1.232552,-1.232552,-1.354092,-0.75123,-1.7426,-2.180538,-2.146695,0.0,-2.066053,-2.066053,0.0,-2.784211,-2.137869,-0.743834,-1.311587,1.432997,-1.277734,-1.58346,-1.348834,0.0,1.38474,-0.509891,-0.829723,-2.186942,-2.186942,-2.186942,2.186942,-2.186942,-2.186942,-2.186942,-2.186942,-2.186942,2.186942,-2.186942,0.0,2.186942,0.0,-2.186942,-2.186942,0.0,-2.186942,-2.186942,-0.537385,-1.321529,1.569311,-1.28582,-1.679481,-0.653763,0.0,1.452501,-0.927344,-0.678802,-0.952161,-1.276842,1.651732,-1.244492,-1.721818,-1.596742,0.0,1.519191,-0.879825,-0.880882,-1.263687,-0.840283,-1.796644,-2.161621,-2.158765,0.0,-2.118142,-2.104005,0.0,-2.127442,-1.958938,-2.186942,-0.554003,1.926726,-0.360227,-0.233409,-0.24635,-0.181888
1088,0.965629,-0.507897,-0.58138,-0.479424,0.122212,0.12838,1.79227,-0.801856,0.682532,0.675388,0.0,-0.818941,1.502986,1.291535,-1.051502,-1.302602,0.913211,-1.273806,1.317881,-1.354505,-1.351663,1.537837,-1.134948,1.086563,0.926787,0.0,-1.283719,-1.470163,0.457259,0.457259,-0.457259,-0.457259,-0.457259,0.457259,-0.457259,0.457259,-0.457259,-0.457259,-0.457259,0.0,-0.457259,0.457259,0.457259,0.457259,0.0,-0.457259,-0.457259,1.081439,1.06323,-1.408512,-0.976731,-0.809047,1.100458,-0.900179,-0.864194,1.537837,-0.956873,0.758878,0.699635,0.0,-0.722918,-0.904643,1.452646,1.247696,-0.621389,-0.888143,-1.153167,-1.260642,1.290454,-1.339765,-0.337308,-1.308104,1.537837,-0.950293,0.951632,0.951632,0.0,-1.151053,-1.416802,0.191171,0.153184,0.204229,0.188654,0.056966,0.0,-0.173464,0.457259,0.457259,0.0,0.219338,0.056838,1.469762,1.485802,-0.773497,-1.401724,1.097358,-1.209225,1.510764,-1.431436,-1.389687,1.537837,-1.055981,1.227004,1.227004,0.0,-1.170407,-1.493425,0.457259,0.457259,-0.457259,-0.457259,-0.457259,0.457259,-0.457259,0.457259,-0.457259,-0.457259,-0.457259,-0.457259,0.457259,0.457259,0.457259,0.0,-0.457259,-0.457259,1.124478,1.184869,-0.824126,-0.199165,-0.248899,-0.938237,1.018885,-0.533288,-0.253759,1.537837,-0.781307,0.899275,0.893178,0.0,-0.774112,-0.809364,1.444327,1.450271,-0.758078,-0.619546,1.019259,-1.208172,1.494341,-1.387734,-0.305158,-1.297588,1.537837,-0.847141,1.237722,1.251393,0.0,-1.136305,-1.448056,0.258912,-0.341961,-0.014711,-0.018861,-0.426975,0.388214,0.01793,0.0,0.105197,0.107668,1.537837,0.937878,-1.074559,-0.457259,-0.457259,-0.457259,-0.457259,0.0,0.0,0.0,-0.457259,-0.457259,-0.425806,1.439962,0.900717,-0.977933,-0.752177,-0.611733,0.965579,0.0,-0.561188,-1.016953,-0.410736,0.0,0.0,0.0,-0.457259,0.061272,-0.693854,0.66688,0.181888,0.902343,0.895588,0.0,-0.712275,-1.488356,0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,0.0,-0.457259,1.422418,0.767161,0.120249,0.787867,0.858713,-0.042276,0.0,-0.565285,-0.419693,1.064413,0.649799,0.676517,0.507728,0.919707,0.911906,0.0,-0.512788,-1.248233,0.454901,0.45595,0.456146,0.457259,0.456669,0.456401,0.0,1.510746,0.910883,-1.133275,0.9588,1.034364,1.198789,0.0,-1.226632,0.100605,1.35551,0.457259,0.457259,0.457259,-0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,0.0,0.457259,0.0,-0.457259,0.457259,0.457259,0.457259,0.0,0.457259,0.457259,0.824552,0.906018,-2.159364,0.859054,1.193843,1.22869,0.0,-1.251705,0.523896,0.437537,1.431317,0.936512,-0.975252,0.975164,0.994593,1.026242,0.0,-0.944745,0.12939,0.12939,0.56441,1.330056,-0.256196,0.360927,0.339734,0.0,0.115177,0.115177,0.0,0.081997,-0.188336,1.523812,1.005134,-1.206554,1.068444,1.191949,1.287729,0.0,-1.179214,0.678727,1.424321,0.457259,0.457259,0.457259,-0.457259,0.457259,0.457259,0.457259,0.457259,0.457259,-0.457259,0.457259,0.0,-0.457259,0.0,0.457259,0.457259,0.0,0.457259,0.457259,0.678426,0.884955,-1.51658,0.935292,1.235774,1.373755,0.0,-1.180675,0.010681,0.415237,1.497009,1.008092,-1.058174,1.067955,1.136654,1.230898,0.0,-0.988155,-0.11823,-0.124364,1.292943,1.478621,-0.488154,0.21595,0.185789,0.0,0.077077,0.040429,0.0,-0.190246,-0.385833,0.457259,-0.554003,-0.519015,-0.360227,-0.233409,-0.24635,-0.181888


In [23]:
lr_data.head()

Unnamed: 0,Adult Mortality,Infant deaths,BMI,Under-five deaths,Polio,Diphtheria,HIV/AIDS,GDP,Thinness 1-19 years,Thinness 5-9 years,Income composition of resources,Schooling,Adult Mortality_mean_by_Continent,Infant deaths_mean_by_Continent,Percentage expenditure_mean_by_Continent,Hepatitis B_mean_by_Continent,Measles_mean_by_Continent,BMI_mean_by_Continent,Under-five deaths_mean_by_Continent,Polio_mean_by_Continent,Diphtheria_mean_by_Continent,HIV/AIDS_mean_by_Continent,GDP_mean_by_Continent,Thinness 1-19 years_mean_by_Continent,Thinness 5-9 years_mean_by_Continent,Income composition of resources_mean_by_Continent,Schooling_mean_by_Continent,Life expectancy_mean_by_Continent,Adult Mortality_mean_by_Status,Infant deaths_mean_by_Status,Alcohol_mean_by_Status,Percentage expenditure_mean_by_Status,Hepatitis B_mean_by_Status,Measles_mean_by_Status,BMI_mean_by_Status,Under-five deaths_mean_by_Status,Polio_mean_by_Status,Total expenditure_mean_by_Status,Diphtheria_mean_by_Status,HIV/AIDS_mean_by_Status,GDP_mean_by_Status,Population_mean_by_Status,Thinness 1-19 years_mean_by_Status,Thinness 5-9 years_mean_by_Status,Income composition of resources_mean_by_Status,Schooling_mean_by_Status,Life expectancy_mean_by_Status,Adult Mortality_mean_by_Continent_Year,Infant deaths_mean_by_Continent_Year,Percentage expenditure_mean_by_Continent_Year,Hepatitis B_mean_by_Continent_Year,BMI_mean_by_Continent_Year,Under-five deaths_mean_by_Continent_Year,Polio_mean_by_Continent_Year,Diphtheria_mean_by_Continent_Year,HIV/AIDS_mean_by_Continent_Year,GDP_mean_by_Continent_Year,Thinness 1-19 years_mean_by_Continent_Year,Thinness 5-9 years_mean_by_Continent_Year,Income composition of resources_mean_by_Continent_Year,Schooling_mean_by_Continent_Year,Life expectancy_mean_by_Continent_Year,Adult Mortality_mean_by_Continent_Status,Infant deaths_mean_by_Continent_Status,Alcohol_mean_by_Continent_Status,Percentage expenditure_mean_by_Continent_Status,Hepatitis B_mean_by_Continent_Status,BMI_mean_by_Continent_Status,Under-five deaths_mean_by_Continent_Status,Polio_mean_by_Continent_Status,Total expenditure_mean_by_Continent_Status,Diphtheria_mean_by_Continent_Status,HIV/AIDS_mean_by_Continent_Status,GDP_mean_by_Continent_Status,Thinness 1-19 years_mean_by_Continent_Status,Thinness 5-9 years_mean_by_Continent_Status,Income composition of resources_mean_by_Continent_Status,Schooling_mean_by_Continent_Status,Life expectancy_mean_by_Continent_Status,Adult Mortality_mean_by_Status_Year,Infant deaths_mean_by_Status_Year,BMI_mean_by_Status_Year,Under-five deaths_mean_by_Status_Year,Diphtheria_mean_by_Status_Year,HIV/AIDS_mean_by_Status_Year,GDP_mean_by_Status_Year,Thinness 1-19 years_mean_by_Status_Year,Thinness 5-9 years_mean_by_Status_Year,Income composition of resources_mean_by_Status_Year,Schooling_mean_by_Status_Year,Life expectancy_mean_by_Status_Year,Adult Mortality_median_by_Continent,Infant deaths_median_by_Continent,Percentage expenditure_median_by_Continent,Hepatitis B_median_by_Continent,Measles_median_by_Continent,BMI_median_by_Continent,Under-five deaths_median_by_Continent,Polio_median_by_Continent,Diphtheria_median_by_Continent,HIV/AIDS_median_by_Continent,GDP_median_by_Continent,Thinness 1-19 years_median_by_Continent,Thinness 5-9 years_median_by_Continent,Income composition of resources_median_by_Continent,Schooling_median_by_Continent,Life expectancy_median_by_Continent,Adult Mortality_median_by_Status,Infant deaths_median_by_Status,Alcohol_median_by_Status,Percentage expenditure_median_by_Status,Hepatitis B_median_by_Status,Measles_median_by_Status,BMI_median_by_Status,Under-five deaths_median_by_Status,Polio_median_by_Status,Total expenditure_median_by_Status,Diphtheria_median_by_Status,GDP_median_by_Status,Population_median_by_Status,Thinness 1-19 years_median_by_Status,Thinness 5-9 years_median_by_Status,Income composition of resources_median_by_Status,Schooling_median_by_Status,Life expectancy_median_by_Status,Adult Mortality_median_by_Continent_Year,Infant deaths_median_by_Continent_Year,Percentage expenditure_median_by_Continent_Year,Hepatitis B_median_by_Continent_Year,Measles_median_by_Continent_Year,BMI_median_by_Continent_Year,Under-five deaths_median_by_Continent_Year,Polio_median_by_Continent_Year,Diphtheria_median_by_Continent_Year,HIV/AIDS_median_by_Continent_Year,GDP_median_by_Continent_Year,Thinness 1-19 years_median_by_Continent_Year,Thinness 5-9 years_median_by_Continent_Year,Income composition of resources_median_by_Continent_Year,Schooling_median_by_Continent_Year,Life expectancy_median_by_Continent_Year,Adult Mortality_median_by_Continent_Status,Infant deaths_median_by_Continent_Status,Alcohol_median_by_Continent_Status,Percentage expenditure_median_by_Continent_Status,Measles_median_by_Continent_Status,BMI_median_by_Continent_Status,Under-five deaths_median_by_Continent_Status,Polio_median_by_Continent_Status,Total expenditure_median_by_Continent_Status,Diphtheria_median_by_Continent_Status,HIV/AIDS_median_by_Continent_Status,GDP_median_by_Continent_Status,Thinness 1-19 years_median_by_Continent_Status,Thinness 5-9 years_median_by_Continent_Status,Income composition of resources_median_by_Continent_Status,Schooling_median_by_Continent_Status,Life expectancy_median_by_Continent_Status,Adult Mortality_median_by_Status_Year,Infant deaths_median_by_Status_Year,Under-five deaths_median_by_Status_Year,Polio_median_by_Status_Year,Total expenditure_median_by_Status_Year,Diphtheria_median_by_Status_Year,GDP_median_by_Status_Year,Income composition of resources_median_by_Status_Year,Schooling_median_by_Status_Year,Life expectancy_median_by_Status_Year,Thinness 1-19 years_min_by_Continent,Thinness 5-9 years_min_by_Continent,Life expectancy_min_by_Continent,BMI_min_by_Status,Total expenditure_min_by_Status,GDP_min_by_Status,Population_min_by_Status,Thinness 1-19 years_min_by_Status,Thinness 5-9 years_min_by_Status,Income composition of resources_min_by_Status,Schooling_min_by_Status,Life expectancy_min_by_Status,BMI_min_by_Continent_Year,Thinness 1-19 years_min_by_Continent_Year,Thinness 5-9 years_min_by_Continent_Year,Schooling_min_by_Continent_Year,Life expectancy_min_by_Continent_Year,BMI_min_by_Continent_Status,Thinness 1-19 years_min_by_Continent_Status,Income composition of resources_min_by_Continent_Status,Schooling_min_by_Continent_Status,Life expectancy_min_by_Continent_Status,Alcohol_min_by_Status_Year,Thinness 1-19 years_min_by_Status_Year,Thinness 5-9 years_min_by_Status_Year,Income composition of resources_min_by_Status_Year,Schooling_min_by_Status_Year,Life expectancy_min_by_Status_Year,Alcohol_max_by_Continent,Under-five deaths_max_by_Continent,HIV/AIDS_max_by_Continent,Thinness 1-19 years_max_by_Continent,Thinness 5-9 years_max_by_Continent,Income composition of resources_max_by_Continent,Schooling_max_by_Continent,Life expectancy_max_by_Continent,Adult Mortality_max_by_Status,Infant deaths_max_by_Status,Alcohol_max_by_Status,BMI_max_by_Status,Under-five deaths_max_by_Status,HIV/AIDS_max_by_Status,Thinness 1-19 years_max_by_Status,Thinness 5-9 years_max_by_Status,Income composition of resources_max_by_Status,Schooling_max_by_Status,Adult Mortality_max_by_Continent_Year,Infant deaths_max_by_Continent_Year,BMI_max_by_Continent_Year,Under-five deaths_max_by_Continent_Year,HIV/AIDS_max_by_Continent_Year,Thinness 5-9 years_max_by_Continent_Year,Income composition of resources_max_by_Continent_Year,Schooling_max_by_Continent_Year,Life expectancy_max_by_Continent_Year,Adult Mortality_max_by_Continent_Status,Infant deaths_max_by_Continent_Status,Under-five deaths_max_by_Continent_Status,HIV/AIDS_max_by_Continent_Status,Thinness 1-19 years_max_by_Continent_Status,Thinness 5-9 years_max_by_Continent_Status,Income composition of resources_max_by_Continent_Status,Schooling_max_by_Continent_Status,Life expectancy_max_by_Continent_Status,Adult Mortality_max_by_Status_Year,Infant deaths_max_by_Status_Year,Under-five deaths_max_by_Status_Year,HIV/AIDS_max_by_Status_Year,Thinness 1-19 years_max_by_Status_Year,Thinness 5-9 years_max_by_Status_Year,Income composition of resources_max_by_Status_Year,Adult Mortality_std_by_Continent,Infant deaths_std_by_Continent,Percentage expenditure_std_by_Continent,Under-five deaths_std_by_Continent,Polio_std_by_Continent,Diphtheria_std_by_Continent,HIV/AIDS_std_by_Continent,GDP_std_by_Continent,Schooling_std_by_Continent,Life expectancy_std_by_Continent,Adult Mortality_std_by_Status,Infant deaths_std_by_Status,Alcohol_std_by_Status,Percentage expenditure_std_by_Status,Hepatitis B_std_by_Status,Measles_std_by_Status,BMI_std_by_Status,Under-five deaths_std_by_Status,Polio_std_by_Status,Total expenditure_std_by_Status,Diphtheria_std_by_Status,HIV/AIDS_std_by_Status,GDP_std_by_Status,Population_std_by_Status,Thinness 1-19 years_std_by_Status,Thinness 5-9 years_std_by_Status,Income composition of resources_std_by_Status,Schooling_std_by_Status,Life expectancy_std_by_Status,Adult Mortality_std_by_Continent_Year,Infant deaths_std_by_Continent_Year,Percentage expenditure_std_by_Continent_Year,Under-five deaths_std_by_Continent_Year,Polio_std_by_Continent_Year,Diphtheria_std_by_Continent_Year,HIV/AIDS_std_by_Continent_Year,GDP_std_by_Continent_Year,Schooling_std_by_Continent_Year,Life expectancy_std_by_Continent_Year,Adult Mortality_std_by_Continent_Status,Infant deaths_std_by_Continent_Status,Percentage expenditure_std_by_Continent_Status,Under-five deaths_std_by_Continent_Status,Polio_std_by_Continent_Status,Diphtheria_std_by_Continent_Status,HIV/AIDS_std_by_Continent_Status,GDP_std_by_Continent_Status,Thinness 1-19 years_std_by_Continent_Status,Thinness 5-9 years_std_by_Continent_Status,Schooling_std_by_Continent_Status,Life expectancy_std_by_Continent_Status,Adult Mortality_std_by_Status_Year,Infant deaths_std_by_Status_Year,Under-five deaths_std_by_Status_Year,HIV/AIDS_std_by_Status_Year,Thinness 1-19 years_std_by_Status_Year,Thinness 5-9 years_std_by_Status_Year,Income composition of resources_std_by_Status_Year,Schooling_std_by_Status_Year,Life expectancy_std_by_Status_Year,Adult Mortality_var_by_Continent,Infant deaths_var_by_Continent,Percentage expenditure_var_by_Continent,Under-five deaths_var_by_Continent,Polio_var_by_Continent,Diphtheria_var_by_Continent,HIV/AIDS_var_by_Continent,GDP_var_by_Continent,Schooling_var_by_Continent,Life expectancy_var_by_Continent,Adult Mortality_var_by_Status,Infant deaths_var_by_Status,Alcohol_var_by_Status,Percentage expenditure_var_by_Status,Hepatitis B_var_by_Status,Measles_var_by_Status,BMI_var_by_Status,Under-five deaths_var_by_Status,Polio_var_by_Status,Total expenditure_var_by_Status,Diphtheria_var_by_Status,HIV/AIDS_var_by_Status,GDP_var_by_Status,Population_var_by_Status,Thinness 1-19 years_var_by_Status,Thinness 5-9 years_var_by_Status,Income composition of resources_var_by_Status,Schooling_var_by_Status,Life expectancy_var_by_Status,Adult Mortality_var_by_Continent_Year,Infant deaths_var_by_Continent_Year,Percentage expenditure_var_by_Continent_Year,Under-five deaths_var_by_Continent_Year,Polio_var_by_Continent_Year,Diphtheria_var_by_Continent_Year,HIV/AIDS_var_by_Continent_Year,GDP_var_by_Continent_Year,Schooling_var_by_Continent_Year,Life expectancy_var_by_Continent_Year,Adult Mortality_var_by_Continent_Status,Infant deaths_var_by_Continent_Status,Percentage expenditure_var_by_Continent_Status,Under-five deaths_var_by_Continent_Status,Polio_var_by_Continent_Status,Diphtheria_var_by_Continent_Status,HIV/AIDS_var_by_Continent_Status,GDP_var_by_Continent_Status,Thinness 1-19 years_var_by_Continent_Status,Thinness 5-9 years_var_by_Continent_Status,Schooling_var_by_Continent_Status,Life expectancy_var_by_Continent_Status,Adult Mortality_var_by_Status_Year,Infant deaths_var_by_Status_Year,Under-five deaths_var_by_Status_Year,HIV/AIDS_var_by_Status_Year,Thinness 1-19 years_var_by_Status_Year,Thinness 5-9 years_var_by_Status_Year,Income composition of resources_var_by_Status_Year,Schooling_var_by_Status_Year,Life expectancy_var_by_Status_Year,Status_Developing,Continent_AS,Continent_EU,Continent_NA,Continent_OC,Continent_SA,Continent_unknown
0,263,55,19,70,49,65,0,584,15,15,0,10,134,16,252,86,363,34,21,87,87,0,5105,6,7,0,11,71,179,16,3,209,83,257,35,21,83,5,83,0,4005,7860115,5,5,0,11,67,138,14,1,90,39,18,89,90,0,5729,6,6,0,12,73,140,17,1,216,85,34,22,87,4,86,0,4552,7,7,0,11,70,169,14,39,19,85,0,4684,5,5,0,12,69,133,7,63,93,119,32,8,95,94,0,2612,5,5,0,11,72,163,6,2,48,87,18,35,7,91,5,89,1860,3707315,4,4,0,11,69,133,5,0,97,68,37,6,97,97,0,3804,5,5,0,12,74,137,8,1,56,134,32,9,94,4,94,0,2224,5,5,0,11,71,157,4,6,91,5,92,3695,0,12,71,0,1,54,1,0,1,34,0,0,0,4,44,4,1,1,8,64,1,1,0,4,54,0,0,0,0,4,51,13,70,1,15,15,0,16,87,459,55,16,77,70,1,15,15,0,18,293,55,71,70,0,15,0,16,85,321,55,70,1,15,15,0,16,84,459,55,70,1,15,15,0,77,20,371,26,14,13,0,5812,2,5,118,20,3,322,13,369,19,26,16,2,16,0,4626,9004451,3,3,0,2,8,63,19,10,25,15,13,0,5871,1,5,77,20,339,26,14,14,0,5320,4,4,2,5,98,19,25,0,3,3,0,2,7,6082,414,137821,677,205,195,0,33784204,4,33,14041,400,10,104175,187,136435,367,696,272,4,269,0,21400762,-2147483648,15,15,0,7,80,3993,396,115,643,248,181,0,34469907,3,27,6040,424,115198,695,213,202,0,28308095,24,24,4,27,9609,367,630,0,13,13,0,6,56,1,1,0,0,0,0,0
1,271,55,18,70,58,62,0,612,15,15,0,10,134,16,252,86,363,34,21,87,87,0,5105,6,7,0,11,71,179,16,3,209,83,257,35,21,83,5,83,0,4005,7860115,5,5,0,11,67,120,15,305,90,38,18,90,90,0,6401,6,7,0,12,72,140,17,1,216,85,34,22,87,4,86,0,4552,7,7,0,11,70,163,14,38,19,85,0,5119,5,5,0,12,69,133,7,63,93,119,32,8,95,94,0,2612,5,5,0,11,72,163,6,2,48,87,18,35,7,91,5,89,1860,3707315,4,4,0,11,69,125,5,125,96,138,36,7,96,95,0,4305,5,5,0,12,74,137,8,1,56,134,32,9,94,4,94,0,2224,5,5,0,11,71,149,5,6,93,5,92,3687,0,12,71,0,1,54,1,0,1,34,0,0,0,4,44,2,0,1,7,59,1,1,0,4,54,0,0,0,0,4,48,13,70,1,15,15,0,16,87,459,55,16,77,70,1,15,15,0,18,294,55,68,70,0,15,0,16,83,321,55,70,1,15,15,0,16,84,459,55,70,1,15,15,0,77,20,371,26,14,13,0,5812,2,5,118,20,3,322,13,369,19,26,16,2,16,0,4626,9004451,3,3,0,2,8,78,19,395,25,13,11,0,5950,1,5,77,20,339,26,14,14,0,5320,4,4,2,5,107,19,25,0,3,3,0,2,7,6082,414,137821,677,205,195,0,33784204,4,33,14041,400,10,104175,187,136435,367,696,272,4,269,0,21400762,-2147483648,15,15,0,7,80,6162,399,156689,648,179,141,0,35409985,3,29,6040,424,115198,695,213,202,0,28308095,24,24,4,27,11615,372,639,0,13,14,0,6,61,1,1,0,0,0,0,0
2,268,55,18,70,62,64,0,631,15,15,0,9,134,16,252,86,363,34,21,87,87,0,5105,6,7,0,11,71,179,16,3,209,83,257,35,21,83,5,83,0,4005,7860115,5,5,0,11,67,115,15,277,89,35,19,90,90,0,5254,6,6,0,12,72,140,17,1,216,85,34,22,87,4,86,0,4552,7,7,0,11,70,165,15,39,19,86,0,4868,5,5,0,12,69,133,7,63,93,119,32,8,95,94,0,2612,5,5,0,11,72,163,6,2,48,87,18,35,7,91,5,89,1860,3707315,4,4,0,11,69,122,5,70,96,151,32,7,97,96,0,4133,5,5,0,12,74,137,8,1,56,134,32,9,94,4,94,0,2224,5,5,0,11,71,157,5,6,93,5,93,3452,0,12,71,0,1,54,1,0,1,34,0,0,0,4,44,2,0,1,7,59,1,1,0,4,54,0,0,0,0,4,49,13,70,1,15,15,0,16,87,459,55,16,77,70,1,15,15,0,18,268,55,67,70,0,15,0,15,83,321,55,70,1,15,15,0,16,84,459,55,70,1,15,15,0,77,20,371,26,14,13,0,5812,2,5,118,20,3,322,13,369,19,26,16,2,16,0,4626,9004451,3,3,0,2,8,71,20,382,25,13,12,0,5529,1,5,77,20,339,26,14,14,0,5320,4,4,2,5,106,19,25,0,3,3,0,2,7,6082,414,137821,677,205,195,0,33784204,4,33,14041,400,10,104175,187,136435,367,696,272,4,269,0,21400762,-2147483648,15,15,0,7,80,5136,401,146233,653,185,150,0,30580092,3,28,6040,424,115198,695,213,202,0,28308095,24,24,4,27,11416,377,646,0,14,13,0,6,59,1,1,0,0,0,0,0
3,272,55,17,70,67,67,0,669,15,15,0,9,134,16,252,86,363,34,21,87,87,0,5105,6,7,0,11,71,179,16,3,209,83,257,35,21,83,5,83,0,4005,7860115,5,5,0,11,67,106,15,360,88,37,19,89,89,0,6951,6,6,0,12,72,140,17,1,216,85,34,22,87,4,86,0,4552,7,7,0,11,70,164,15,37,20,86,0,5435,5,5,0,12,68,133,7,63,93,119,32,8,95,94,0,2612,5,5,0,11,72,163,6,2,48,87,18,35,7,91,5,89,1860,3707315,4,4,0,11,69,102,6,154,96,37,38,7,96,96,0,6098,5,5,0,12,74,137,8,1,56,134,32,9,94,4,94,0,2224,5,5,0,11,71,151,5,6,93,5,93,4142,0,12,69,0,1,54,1,0,1,34,0,0,0,4,44,3,0,1,7,59,1,1,0,4,54,0,0,0,0,4,49,13,70,1,15,15,0,16,87,459,55,16,77,70,1,15,15,0,18,272,55,69,70,0,15,0,15,83,321,55,70,1,15,15,0,16,84,459,55,70,1,15,15,0,77,20,371,26,14,13,0,5812,2,5,118,20,3,322,13,369,19,26,16,2,16,0,4626,9004451,3,3,0,2,8,75,20,431,25,14,12,0,6281,1,5,77,20,339,26,14,14,0,5320,4,4,2,5,109,19,25,0,3,3,0,2,7,6082,414,137821,677,205,195,0,33784204,4,33,14041,400,10,104175,187,136435,367,696,272,4,269,0,21400762,-2147483648,15,15,0,7,80,5675,404,186157,658,198,164,0,39460477,3,30,6040,424,115198,695,213,202,0,28308095,24,24,4,27,12063,381,655,0,14,14,0,6,62,1,1,0,0,0,0,0
4,275,55,17,70,68,68,0,63,15,15,0,9,134,16,252,86,363,34,21,87,87,0,5105,6,7,0,11,71,179,16,3,209,83,257,35,21,83,5,83,0,4005,7860115,5,5,0,11,67,127,15,329,89,36,19,89,91,0,6049,6,6,0,12,72,140,17,1,216,85,34,22,87,4,86,0,4552,7,7,0,11,70,175,15,36,20,86,0,4525,5,5,0,11,68,133,7,63,93,119,32,8,95,94,0,2612,5,5,0,11,72,163,6,2,48,87,18,35,7,91,5,89,1860,3707315,4,4,0,11,69,127,7,185,95,106,37,7,96,95,0,4589,5,5,0,12,73,137,8,1,56,134,32,9,94,4,94,0,2224,5,5,0,11,71,161,5,7,93,5,93,3526,0,12,71,0,1,54,1,0,1,34,0,0,0,4,44,2,0,1,7,59,1,1,0,4,54,0,0,0,0,4,48,13,70,1,15,15,0,16,87,459,55,16,77,70,1,15,15,0,18,275,55,69,70,0,15,0,15,82,321,55,70,1,15,15,0,16,84,459,55,70,1,15,15,0,77,20,371,26,14,13,0,5812,2,5,118,20,3,322,13,369,19,26,16,2,16,0,4626,9004451,3,3,0,2,8,71,20,387,25,13,8,0,5589,1,5,77,20,339,26,14,14,0,5320,4,4,2,5,110,19,25,0,3,3,0,2,8,6082,414,137821,677,205,195,0,33784204,4,33,14041,400,10,104175,187,136435,367,696,272,4,269,0,21400762,-2147483648,15,15,0,7,80,5064,406,150160,662,191,80,0,31244349,3,27,6040,424,115198,695,213,202,0,28308095,24,24,4,27,12206,386,665,0,13,14,0,6,66,1,1,0,0,0,0,0


In [24]:
# decision tree data and random forest:
dt_data = fill_num_columns(data, 'Life expectancy')
dt_data = convert_country(dt_data)
dt_data = feature_engineering(dt_data)
dt_data = cat_to_num(dt_data)
dt_data = dt_data.drop('Life expectancy', axis=1)
X_train_tr, X_test_tr, y_train_tr, y_test_tr = train_test_split(dt_data, lr_y, random_state=42, test_size = 0.2)

Processing: mean, Grouping: ('Continent',)
Processing: mean, Grouping: ('Year',)
Processing: mean, Grouping: ('Status',)
Processing: mean, Grouping: ('Continent', 'Year')
Processing: mean, Grouping: ('Continent', 'Status')
Processing: mean, Grouping: ('Status', 'Year')
Processing: median, Grouping: ('Continent',)
Processing: median, Grouping: ('Year',)
Processing: median, Grouping: ('Status',)
Processing: median, Grouping: ('Continent', 'Year')
Processing: median, Grouping: ('Continent', 'Status')
Processing: median, Grouping: ('Status', 'Year')
Processing: min, Grouping: ('Continent',)
Processing: min, Grouping: ('Year',)
Processing: min, Grouping: ('Status',)
Processing: min, Grouping: ('Continent', 'Year')
Processing: min, Grouping: ('Continent', 'Status')
Processing: min, Grouping: ('Status', 'Year')
Processing: max, Grouping: ('Continent',)


  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(li

Processing: max, Grouping: ('Year',)
Processing: max, Grouping: ('Status',)
Processing: max, Grouping: ('Continent', 'Year')
Processing: max, Grouping: ('Continent', 'Status')
Processing: max, Grouping: ('Status', 'Year')
Processing: std, Grouping: ('Continent',)
Processing: std, Grouping: ('Year',)
Processing: std, Grouping: ('Status',)
Processing: std, Grouping: ('Continent', 'Year')
Processing: std, Grouping: ('Continent', 'Status')
Processing: std, Grouping: ('Status', 'Year')
Processing: var, Grouping: ('Continent',)
Processing: var, Grouping: ('Year',)


  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(li

Processing: var, Grouping: ('Status',)
Processing: var, Grouping: ('Continent', 'Year')
Processing: var, Grouping: ('Continent', 'Status')
Processing: var, Grouping: ('Status', 'Year')


  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()
  grouped_data = data.groupby(list(group_cols))[calc_cols].agg(stat_func).reset_index()


In [25]:
# svr data:
svr_data = cat_to_num(svr_data)
X_train_svr, X_test_svr, y_train_svr, y_test_svr = train_test_split(svr_data, lr_y, random_state=42, test_size = 0.2)
X_train_svr, X_test_svr = scale_data(X_train_svr, X_test_svr)

In [26]:
# xgb lgbm catboost:
boost_data = convert_country(data)
boost_data = boost_data.dropna(subset = ['Life expectancy'], axis=0)
boost_data_custom = boost_data.copy() # same step for custom data
boost_data = boost_data.select_dtypes(include='number')
boost_data = boost_data.drop('Life expectancy', axis=1)
X_train_boost, X_test_boost, y_train_boost, y_test_boost = train_test_split(boost_data, lr_y, random_state=42, test_size = 0.2)

In [27]:
# custom catboost:
columns_to_fill = boost_data_custom.select_dtypes(exclude='number').columns.tolist()
boost_data_custom[columns_to_fill] = boost_data_custom[columns_to_fill].fillna('Missing Value')
boost_data_custom = boost_data_custom.drop('Life expectancy', axis=1)
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(boost_data_custom, lr_y, random_state=42, test_size = 0.2)

# Importing the Models

In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Train and Evaluate

In [29]:
default_models = []

lin_reg_def = LinearRegression()
dec_tree_def = DecisionTreeRegressor()
rfr_def = RandomForestRegressor()
svr_def = SVR()
xgboost_def = XGBRegressor()
lgbm_def = LGBMRegressor()
catboost_def = CatBoostRegressor()
catboost_custom_def = CatBoostRegressor(cat_features=[x for x in X_train_cat.columns if (X_train_cat[x].dtype == object)])

default_models.extend([
    ('LinearRegression', lin_reg_def),
    ('DecisionTreeRegressor', dec_tree_def),
    ('RFR', rfr_def),
    ('SVR', svr_def),
    ('XGB', xgboost_def), 
    ('LightGBM', lgbm_def),
    ('CatBoost', catboost_def),
    ('Custom_CatBoost_Cat', catboost_custom_def)
])

In [30]:
def train_test_and_evaluate_model(model_name, model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    
    return r2_train, r2_test

In [31]:
def evaluate_models(models):
    r2_df = pd.DataFrame(columns=['Model', 'R2_train', 'R2_test'])
    
    for model_name, model in models:
        if model_name.startswith('LinearRegression'):
            r2_train, r2_test = train_test_and_evaluate_model(model_name, model, X_train_lr, y_train_lr, X_test_lr, y_test_lr)
        
        elif model_name.startswith('DecisionTreeRegressor') or model_name.startswith('RFR'):
            r2_train, r2_test = train_test_and_evaluate_model(model_name, model, X_train_tr, y_train_tr, X_test_tr, y_test_tr)
        
        elif model_name.startswith('Custom_CatBoost_Cat'):
            r2_train, r2_test = train_test_and_evaluate_model(model_name, model, X_train_cat, y_train_cat, X_test_cat, y_test_cat)
        
        elif model_name.startswith('SVR'):
            r2_train, r2_test = train_test_and_evaluate_model(model_name, model, X_train_svr, y_train_svr, X_test_svr, y_test_svr)
        
        elif model_name.startswith('XGB') or model_name.startswith('LightGBM') or model_name.startswith('CatBoost'):
            r2_train, r2_test = train_test_and_evaluate_model(model_name, model, X_train_boost, y_train_boost, X_test_boost, y_test_boost)
        
        if r2_train is not None:
            r2_df = pd.concat([r2_df, pd.DataFrame({'Model': [model_name], 'R2_train': [r2_train], 'R2_test': [r2_test]})], ignore_index=True)
    
    r2_df_sorted = r2_df.sort_values(by='R2_train', ascending=False)
    return r2_df_sorted

In [32]:
default_model_result = evaluate_models(default_models)

  r2_df = pd.concat([r2_df, pd.DataFrame({'Model': [model_name], 'R2_train': [r2_train], 'R2_test': [r2_test]})], ignore_index=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000096 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3360
[LightGBM] [Info] Number of data points in the train set: 2342, number of used features: 19
[LightGBM] [Info] Start training from score 69.158369
Learning rate set to 0.046835
0:	learn: 9.2080057	total: 147ms	remaining: 2m 27s
1:	learn: 8.8833878	total: 150ms	remaining: 1m 14s
2:	learn: 8.5703469	total: 152ms	remaining: 50.4s
3:	learn: 8.2764824	total: 154ms	remaining: 38.3s
4:	learn: 7.9980043	total: 156ms	remaining: 31s
5:	learn: 7.7281368	total: 158ms	remaining: 26.2s
6:	learn: 7.4786121	total: 161ms	remaining: 22.8s
7:	learn: 7.2256969	total: 163ms	remaining: 20.2s
8:	learn: 6.9992185	total: 165ms	remaining: 18.2s
9:	learn: 6.7745048	total: 168ms	remaining: 16.6s
10:	learn: 6.5687402	total: 170ms	remaining: 15.3s
11:	learn: 6.

In [33]:
default_model_result

Unnamed: 0,Model,R2_train,R2_test
1,DecisionTreeRegressor,1.0,0.913922
4,XGB,0.999538,0.96751
2,RFR,0.994133,0.95891
6,CatBoost,0.993469,0.968466
7,Custom_CatBoost_Cat,0.992841,0.968598
5,LightGBM,0.991372,0.96606
0,LinearRegression,0.856166,0.851753
3,SVR,0.822095,0.823842


- using custom CatBoost doesn't change score because there wasn't any missing value in categorical columns

# Optimization
- #### With Optuna

In [34]:
def best_params_for_model(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 10, 100)
    }
    lgb_reg = LGBMRegressor(**param)
    
    r2 = cross_val_score(lgb_reg, X_train_boost, y_train_boost, cv=3, scoring='r2', n_jobs=-1).mean()
    return r2

study = optuna.create_study(direction='maximize')
study.optimize(best_params_for_model, n_trials=3)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)

best_lgb_model = LGBMRegressor(**best_params)


[I 2024-05-19 21:53:09,084] A new study created in memory with name: no-name-7a120936-f501-42ec-8074-0268da06f488
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
[I 2024-05-19 21:53:11,211] Trial 0 finished with value: 0.9571454761016334 and parameters: {'n_estimators': 483, 'learning_rate': 0.0965786280281194, 'max_depth': 5, 'num_leaves': 42}. Best is trial 0 with value: 0.9571454761016334.
[I 2024-05-19 21:53:13,633] Trial 1 finished with value: 0.9586920800437649 and parameters: {'n_estimators': 925, 'learning_rate': 0.04016648230665793, 'max_depth': 8, 'num_leaves': 44}. Best is trial 1 with value: 0.9586920800437649.
[I 2024-05-19 21:53:14,938] Trial 2 finished with value: 0.9485183470308307 and parameters: {'n_estimators': 782, 'learning_rate': 0.027410283726266655, 'max_depth': 3, 'num_leaves': 51}. Best is trial 1 with value: 0.9586920800437649.


Best trial:
  Value: 0.959
  Params:  {'n_estimators': 925, 'learning_rate': 0.04016648230665793, 'max_depth': 8, 'num_leaves': 44}


In [35]:
def best_params_for_model(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_uniform('subsample', 0.5,1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5,1),
        'gamma': trial.suggest_int('gamma', 0,5)
        # Increasing the gamma value can help to prevent overfitting
        # Should be used only when you are using high depth
    }
    
    xgb_reg = XGBRegressor(**param)
    r2 = cross_val_score(xgb_reg, X_train_boost, y_train_boost, cv=3, scoring='r2', n_jobs=-1).mean()
    return r2

study = optuna.create_study(direction='maximize')
study.optimize(best_params_for_model, n_trials=3)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)

best_xgb_model = XGBRegressor(**best_params)

[I 2024-05-19 21:53:14,957] A new study created in memory with name: no-name-7b79d159-c5bc-4f0b-86a0-7beaa00eb5c5
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
  'subsample': trial.suggest_uniform('subsample', 0.5,1),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5,1),
[I 2024-05-19 21:53:15,795] Trial 0 finished with value: 0.9589309784928659 and parameters: {'n_estimators': 911, 'learning_rate': 0.043969998299955695, 'max_depth': 6, 'subsample': 0.5813636005820599, 'colsample_bytree': 0.7261432509435952, 'gamma': 5}. Best is trial 0 with value: 0.9589309784928659.
[I 2024-05-19 21:53:16,608] Trial 1 finished with value: 0.9581739806339766 and parameters: {'n_estimators': 707, 'learning_rate': 0.11863871859427277, 'max_depth': 4, 'subsample': 0.8732948714195585, 'colsample_bytree': 0.5604242040206906, 'gamma': 0}. Best is trial 0 with value: 0.9589309784928659.
[I 2024-05-19 21:53:16,928] Trial 2 finished with value: 0.9452358459931681 an

Best trial:
  Value: 0.959
  Params:  {'n_estimators': 911, 'learning_rate': 0.043969998299955695, 'max_depth': 6, 'subsample': 0.5813636005820599, 'colsample_bytree': 0.7261432509435952, 'gamma': 5}


In [36]:
def best_params_for_model(trial):
    param = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10),
        'loss_function': trial.suggest_categorical('loss_function', ['RMSE']) # For regression tasks, use ‘RMSE’
    }
    cb_reg = CatBoostRegressor(**param, verbose=0)
    
    r2 = cross_val_score(cb_reg, X_train_boost, y_train_boost, cv=3, scoring='r2', n_jobs=-1).mean()
    return r2

study = optuna.create_study(direction='maximize')
study.optimize(best_params_for_model, n_trials=3)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)

best_cb_model = CatBoostRegressor(**best_params)

[I 2024-05-19 21:53:16,940] A new study created in memory with name: no-name-3fcbd718-fe27-48dc-987c-b5c8b228e547
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.1, 10),
[I 2024-05-19 21:53:25,403] Trial 0 finished with value: 0.951026934711844 and parameters: {'iterations': 274, 'learning_rate': 0.0407099380733281, 'depth': 9, 'l2_leaf_reg': 0.617487484310912, 'loss_function': 'RMSE'}. Best is trial 0 with value: 0.951026934711844.
[I 2024-05-19 21:53:32,085] Trial 1 finished with value: 0.9607663828199661 and parameters: {'iterations': 855, 'learning_rate': 0.07415749522830317, 'depth': 7, 'l2_leaf_reg': 0.8557630258855302, 'loss_function': 'RMSE'}. Best is trial 1 with value: 0.9607663828199661.
[I 2024-05-19 21:53:35,339] Trial 2 finished with value: 0.9243572778554999 and parameters: {'iterations': 655, 'learning_rate': 0.9287732116987893, 'depth': 6, 'l2_leaf_reg': 4.940945975985907, 'loss_functio

Best trial:
  Value: 0.961
  Params:  {'iterations': 855, 'learning_rate': 0.07415749522830317, 'depth': 7, 'l2_leaf_reg': 0.8557630258855302, 'loss_function': 'RMSE'}


In [37]:
def objective(trial):    
    params = {
        "n_estimators": trial.suggest_int(name="n_estimators", low=100, high=500, step=100),
        "max_features": trial.suggest_categorical(name="max_features", choices=['log2', 'sqrt']),
        "max_depth": trial.suggest_int(name="max_depth", low=10, high=110, step=20),
        "min_samples_split": trial.suggest_int(name="min_samples_split", low=2, high=10, step=2),
        "min_samples_leaf": trial.suggest_int(name="min_samples_leaf", low=1, high=4, step=1)
    }
    model = RandomForestRegressor(**params)
    
    r2 = cross_val_score(model, X_train_tr, y_train_tr, scoring='r2', n_jobs=-1, cv=5).mean()
    return np.mean(r2)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

best_rfr_model = RandomForestRegressor(**study.best_params)

[I 2024-05-19 21:53:35,350] A new study created in memory with name: no-name-9cd3175c-a8fd-472f-9e35-eba78d4c8772
[I 2024-05-19 21:53:40,678] Trial 0 finished with value: 0.8068652682629638 and parameters: {'n_estimators': 500, 'max_features': 'log2', 'max_depth': 90, 'min_samples_split': 6, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.8068652682629638.
[I 2024-05-19 21:53:41,810] Trial 1 finished with value: 0.7935527045882261 and parameters: {'n_estimators': 100, 'max_features': 'log2', 'max_depth': 70, 'min_samples_split': 8, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.8068652682629638.
[I 2024-05-19 21:53:44,911] Trial 2 finished with value: 0.7841972336022871 and parameters: {'n_estimators': 400, 'max_features': 'log2', 'max_depth': 70, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.8068652682629638.
[I 2024-05-19 21:53:46,563] Trial 3 finished with value: 0.8552016672591194 and parameters: {'n_estimators': 100, 'max_features': 'sq

In [38]:
def best_params_for_model(trial):
    param = {
        'C': trial.suggest_loguniform('C', 0.1, 10),  # Regularization parameter
        'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),  # Kernel type
        'degree': trial.suggest_int('degree', 2, 5),  # Degree for polynomial kernel (only for 'poly' kernel)
        'gamma': trial.suggest_categorical('gamma', ['auto', 'scale']),  # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
    }
    
    svr = SVR(**param)
    
    score = cross_val_score(svr, X_train_svr, y_train_svr, cv=3, scoring='r2', n_jobs=-1).mean()
    return score

study = optuna.create_study(direction='maximize')
study.optimize(best_params_for_model, n_trials=2)

print('Best trial:')
best_params = study.best_params
print('  Value: {:.3f}'.format(study.best_value))
print('  Params: ', best_params)
best_svr_model = SVR(**best_params)

[I 2024-05-19 21:54:02,741] A new study created in memory with name: no-name-2e97d6df-fad3-4d92-bb0e-27c0c3ab59ec
  'C': trial.suggest_loguniform('C', 0.1, 10),  # Regularization parameter
[I 2024-05-19 21:54:04,038] Trial 0 finished with value: 0.7926558565508248 and parameters: {'C': 3.7681584929043495, 'kernel': 'poly', 'degree': 4, 'gamma': 'scale'}. Best is trial 0 with value: 0.7926558565508248.
[I 2024-05-19 21:54:05,409] Trial 1 finished with value: 0.6660007003926531 and parameters: {'C': 0.6214608921373636, 'kernel': 'sigmoid', 'degree': 3, 'gamma': 'scale'}. Best is trial 0 with value: 0.7926558565508248.


Best trial:
  Value: 0.793
  Params:  {'C': 3.7681584929043495, 'kernel': 'poly', 'degree': 4, 'gamma': 'scale'}


### Optimized Models

In [39]:
models_optimized = []

models_optimized.extend([
    ('SVR Optuna', best_svr_model),
    ('XGBoost Optuna', best_xgb_model),
    ('LightGBM Optuna', best_lgb_model),
    ('CatBoost Optuna', best_cb_model),
    ('RFR Optuna', best_rfr_model)
])

In [40]:
optimized_model_result = evaluate_models(models_optimized)

  r2_df = pd.concat([r2_df, pd.DataFrame({'Model': [model_name], 'R2_train': [r2_train], 'R2_test': [r2_test]})], ignore_index=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000234 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3360
[LightGBM] [Info] Number of data points in the train set: 2342, number of used features: 19
[LightGBM] [Info] Start training from score 69.158369
0:	learn: 8.9896584	total: 4.14ms	remaining: 3.54s
1:	learn: 8.4554617	total: 7.92ms	remaining: 3.38s
2:	learn: 7.9685587	total: 12.5ms	remaining: 3.55s
3:	learn: 7.5167151	total: 16.6ms	remaining: 3.54s
4:	learn: 7.0931890	total: 20.6ms	remaining: 3.5s
5:	learn: 6.7100882	total: 25.2ms	remaining: 3.57s
6:	learn: 6.3720614	total: 29.4ms	remaining: 3.56s
7:	learn: 6.0379550	total: 33.4ms	remaining: 3.53s
8:	learn: 5.7339307	total: 37.4ms	remaining: 3.51s
9:	learn: 5.4502993	total: 42.5ms	remaining: 3.59s
10:	learn: 5.1959337	total: 46.6ms	remaining: 3.58s
11:	learn: 4.9582829	total: 50.3ms	remaining: 3.53s
12:	learn: 4.7461460	total: 53.9ms	remaining:

In [41]:
optimized_model_result

Unnamed: 0,Model,R2_train,R2_test
3,CatBoost Optuna,0.999012,0.969112
2,LightGBM Optuna,0.998689,0.968906
1,XGBoost Optuna,0.991498,0.96956
4,RFR Optuna,0.983608,0.884114
0,SVR Optuna,0.819527,0.813701


In [42]:
final_model_results = pd.concat([default_model_result, optimized_model_result], axis=0)
final_model_results = final_model_results.sort_values(by='R2_train', ascending=False)
final_model_results.reset_index(drop=True, inplace=True)
final_model_results

Unnamed: 0,Model,R2_train,R2_test
0,DecisionTreeRegressor,1.0,0.913922
1,XGB,0.999538,0.96751
2,CatBoost Optuna,0.999012,0.969112
3,LightGBM Optuna,0.998689,0.968906
4,RFR,0.994133,0.95891
5,CatBoost,0.993469,0.968466
6,Custom_CatBoost_Cat,0.992841,0.968598
7,XGBoost Optuna,0.991498,0.96956
8,LightGBM,0.991372,0.96606
9,RFR Optuna,0.983608,0.884114


# Univariate analysis with default XGBoost

- no matter we have created new 300 features, at the end Boosting works better

In [43]:
def univariate_analysis(model, X_train, y_train, X_test, y_test):
    variables = []
    train_r2_scores = []
    test_r2_scores = []


    for i in X_train.columns:
        X_train_single_var = X_train[[i]]
        X_test_single_var = X_test[[i]]
        
        model.fit(X_train_single_var, y_train)
        y_pred_train_single_var = model.predict(X_train_single_var)
        train_r2 = r2_score(y_train, y_pred_train_single_var)
        
        y_pred_test_single_var = model.predict(X_test_single_var)
        test_r2 = r2_score(y_test, y_pred_test_single_var)
        
        variables.append(i)
        train_r2_scores.append(train_r2)
        test_r2_scores.append(test_r2)
        
    results_df = pd.DataFrame({
        'Variable': variables,
        'Train R2': train_r2_scores,
        'Test R2': test_r2_scores
    })
    results_df_sorted = results_df.sort_values(by='Train R2', ascending=False)
    pd.options.display.float_format = '{:.4f}'.format
    return results_df_sorted

In [44]:
best_xgb_model_analysis = univariate_analysis(best_xgb_model, X_train_boost, y_train_boost, X_test_boost, y_test_boost)

In [45]:
best_xgb_model_analysis

Unnamed: 0,Variable,Train R2,Test R2
1,Adult Mortality,0.817,0.8374
17,Income composition of resources,0.7247,0.6498
12,HIV/AIDS,0.6867,0.6588
18,Schooling,0.6058,0.5733
7,BMI,0.6032,0.458
16,Thinness 5-9 years,0.5307,0.3905
8,Under-five deaths,0.5264,0.3563
15,Thinness 1-19 years,0.524,0.3893
2,Infant deaths,0.4616,0.3873
11,Diphtheria,0.4342,0.3958


In [46]:
final_xgb_model_input = best_xgb_model_analysis['Variable'].tolist()[:4]
final_xgb_model_train = X_train_boost[final_xgb_model_input]
final_xgb_model_test = X_test_boost[final_xgb_model_input]

In [47]:
best_xgb_model_final = best_xgb_model.fit(final_xgb_model_train, y_train_boost)

In [48]:
final_r2_train, final_r2_test = train_test_and_evaluate_model('Final XGB', best_xgb_model_final, final_xgb_model_train, y_train_boost, final_xgb_model_test, y_test_boost)

In [49]:
print(f"Final model: Optimized XGBOOST with these columns: {final_xgb_model_input}; R2 for Train: {final_r2_train}, R2 for Test: {final_r2_test}")

Final model: Optimized XGBOOST with these columns: ['Adult Mortality', 'Income composition of resources', 'HIV/AIDS', 'Schooling']; R2 for Train: 0.9785060275440891, R2 for Test: 0.9511510355136336


# Stacking

In [50]:
reg1 = best_cb_model
reg2 = best_lgb_model

meta_reg = xgboost_def

In [51]:
# Define the stacking classifier
stacking_regressor = StackingCVRegressor(
    regressors=[reg1, reg2],
    meta_regressor=meta_reg,
    cv=5,
    use_features_in_secondary=True,
    verbose=1,
    random_state=42
    )

In [52]:
final_xgb_model_train.fillna(0, inplace=True)
final_xgb_model_test.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_xgb_model_train.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_xgb_model_test.fillna(0, inplace=True)


In [53]:
stacking_r2_train, stacking_r2_test = train_test_and_evaluate_model('Stacking Regressor with XGB, Optimized Catboost and Optimized LGBM', stacking_regressor, final_xgb_model_train, y_train_boost, final_xgb_model_test, y_test_boost)

0:	learn: 8.9984360	total: 2.29ms	remaining: 1.95s
1:	learn: 8.4690924	total: 4.25ms	remaining: 1.81s
2:	learn: 7.9643957	total: 6.84ms	remaining: 1.94s
3:	learn: 7.5031136	total: 8.75ms	remaining: 1.86s
4:	learn: 7.0891909	total: 10.6ms	remaining: 1.81s
5:	learn: 6.7060543	total: 12.5ms	remaining: 1.76s
6:	learn: 6.3474962	total: 14.5ms	remaining: 1.76s
7:	learn: 6.0251512	total: 18.6ms	remaining: 1.97s
8:	learn: 5.7201293	total: 20.6ms	remaining: 1.94s
9:	learn: 5.4299908	total: 22.6ms	remaining: 1.91s
10:	learn: 5.1792053	total: 24.5ms	remaining: 1.88s
11:	learn: 4.9392291	total: 26.3ms	remaining: 1.85s
12:	learn: 4.7137522	total: 28.1ms	remaining: 1.82s
13:	learn: 4.4974411	total: 31.5ms	remaining: 1.89s
14:	learn: 4.3012986	total: 33.5ms	remaining: 1.87s
15:	learn: 4.1272736	total: 35.2ms	remaining: 1.85s
16:	learn: 3.9628217	total: 37ms	remaining: 1.82s
17:	learn: 3.8192631	total: 38.7ms	remaining: 1.8s
18:	learn: 3.6793511	total: 40.5ms	remaining: 1.78s
19:	learn: 3.5505165	tota

In [54]:
print(f"Stacking model: Optimized XGBOOST with these columns: {final_xgb_model_input}; R2 for Train: {stacking_r2_train}, R2 for Test: {stacking_r2_test}")

Stacking model: Optimized XGBOOST with these columns: ['Adult Mortality', 'Income composition of resources', 'HIV/AIDS', 'Schooling']; R2 for Train: 0.9816239558582408, R2 for Test: 0.9523744141051013
