# Building Our Linear Regression Model

## Importing libraries and loading the CSV file

In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import seaborn as sns
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import statsmodels.tools

In [9]:
# Load the life-expectancy dataset from the CSV file into a DataFrame.
df = pd.read_csv("Life Expectancy Data.csv")

## Summary Statistics 

In [10]:
# Display summary statistics (count, mean, std, min, quartiles, max) of the dataset.
df.describe()

Unnamed: 0,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status_Developed,Economy_status_Developing,Life_expectancy
count,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0,2864.0
mean,2007.5,30.363792,42.938268,192.251775,4.820882,84.292598,77.344972,25.032926,86.499651,86.271648,0.894288,11540.92493,36.675915,4.865852,4.899825,7.632123,0.206704,0.793296,68.856075
std,4.610577,27.538117,44.569974,114.910281,3.981949,15.995511,18.659693,2.193905,15.080365,15.534225,2.381389,16934.788931,136.485867,4.438234,4.525217,3.171556,0.405012,0.405012,9.405608
min,2000.0,1.8,2.3,49.384,0.0,12.0,10.0,19.8,8.0,16.0,0.01,148.0,0.08,0.1,0.1,1.1,0.0,0.0,39.4
25%,2003.75,8.1,9.675,106.91025,1.2,78.0,64.0,23.2,81.0,81.0,0.08,1415.75,2.0975,1.6,1.6,5.1,0.0,1.0,62.7
50%,2007.5,19.6,23.1,163.8415,4.02,89.0,83.0,25.5,93.0,93.0,0.15,4217.0,7.85,3.3,3.4,7.8,0.0,1.0,71.4
75%,2011.25,47.35,66.0,246.791375,7.7775,96.0,93.0,26.4,97.0,97.0,0.46,12557.0,23.6875,7.2,7.3,10.3,0.0,1.0,75.4
max,2015.0,138.1,224.9,719.3605,17.87,99.0,99.0,32.1,99.0,99.0,21.68,112418.0,1379.86,27.7,28.6,14.1,1.0,1.0,83.8


## Explanation of Removing Certain Columns from the Ethical (Minimal) Model
When constructing a minimal model that avoids using sensitive health data, the following columns were removed due to various ethical, privacy, and practical considerations. They are grouped into categories for clarity:

#### Deaths Data
- Infant_deaths
- Under_five_deaths
- Adult_mortality

Reason:
- Sensitive Nature: Mortality rates are highly sensitive indicators of a country's healthcare system and overall well-being. Sharing such data can have significant social and political implications, potentially leading to criticism and unwanted attention.
- Privacy Concerns: Countries might be reluctant to share mortality data because it directly reflects on their public health infrastructure and its effectiveness.
- Ethical Considerations: It is ethically sensitive to use mortality data in models that could affect a country's image or policy decisions.
#### Vaccination Data
- Hepatitis_B
- Measles
- Polio
- Diphtheria

Reason:
- Public Health Implications: Vaccination rates are critical indicators of public health initiatives and the effectiveness of immunization programs. Sharing this data can expose gaps in public health policies and vaccination coverage.
- Confidentiality: Countries might consider vaccination data confidential due to potential public scrutiny and the risk of revealing weaknesses in their healthcare systems.
- Political Sensitivity: Vaccination data can be politically sensitive, especially in regions with ongoing health crises or vaccination controversies.
#### Thinness Data
- Thinness_ten_nineteen_years
- Thinness_five_nine_years

Reason:
- Nutritional Status: Data on thinness among children and teenagers is a direct indicator of nutrition and health. Sharing such data can highlight malnutrition or health issues in the youth population.
- Sensitivity: This data is often considered highly sensitive as it pertains to the well-being of the youngest members of society. Countries may prefer to keep this information private to avoid criticism and negative attention regarding child health and welfare.
- Ethical Implications: Using data on child malnutrition in models could have ethical implications, as it reflects on the country's ability to provide adequate nutrition and healthcare for its children.
#### HIV Data
- Incidents_HIV

Reason:
- Stigma and Confidentiality: HIV incidence is one of the most sensitive health indicators due to the associated stigma and confidentiality concerns. Countries may be hesitant to share this data to protect the privacy of individuals and avoid negative societal implications.
- National Image: Publicizing HIV data can have severe implications for a country's image and international relations. It can also impact public perception and lead to stigma against affected individuals.
#### BMI Data
- BMI (Body Mass Index)

Reason:
- Personal Health Metric: BMI is a personal health metric that can reveal insights into the population's nutrition and lifestyle. Sharing this data can be seen as intrusive, and countries might prefer to keep it private to avoid revealing issues related to obesity or undernutrition.
- Health Implications: This data can have significant health implications, reflecting on the overall lifestyle and dietary habits of the population, which countries may want to keep confidential.

## Linear Regression Function

In [5]:
def linear_regression(data, minimal=False, custom=None):
    """Fit an OLS model for ``Life_expectancy`` and print train/test RMSE.

    Parameters
    ----------
    data : str, path-like, file-like or pandas.DataFrame
        Source of the dataset. Anything ``pd.read_csv`` accepts, or an
        already-loaded DataFrame (which is copied, never modified).
        The data must contain the ``Life_expectancy``, ``Region`` and
        ``GDP_per_capita`` columns; a ``Country`` column, if present, is
        excluded from the features.
    minimal : bool, default False
        If True, use the reduced feature set that leaves out the health
        columns (GDP, schooling and region dummies only). Ignored when
        ``custom`` is given.
    custom : sequence of str, optional
        Explicit list of engineered column names to use instead of the
        built-in feature sets. Include ``'const'`` to fit an intercept.

    Returns
    -------
    The fitted statsmodels OLS results object (``sm.OLS(...).fit()``).

    Raises
    ------
    KeyError
        If any requested column does not exist after feature engineering.
    """

    #Feature Engineering Function
    def feature_engineering(df):
        df = df.copy()
        df = pd.get_dummies(df, columns=["Region"], drop_first=True, prefix="region", dtype=int)        #One Hot Encoding the region column
        df["GDP_per_capita"] = df["GDP_per_capita"] / 1000                                              #Convert GDP into easier to use units
        #has_constant="add" forces the intercept column even if the data already holds a
        #zero-variance column (the default "skip" would silently omit 'const')
        df = sm.add_constant(df, has_constant="add")
        return df

    #Reading the data (accepts a path/buffer or an already loaded DataFrame)
    df = data.copy() if isinstance(data, pd.DataFrame) else pd.read_csv(data)

    #Setting up variables and target
    fcols = [col for col in df.columns if col not in ("Life_expectancy", "Country")]
    X = df[fcols]
    y = df["Life_expectancy"]

    #Applying Feature Engineering Function BEFORE the split.
    #All transformations are stateless (no statistics are learned from the data), so this
    #causes no leakage, and it guarantees train and test share identical dummy columns and
    #the same dropped baseline region even if a region is absent from one of the splits.
    X_fe = feature_engineering(X)

    #Train-test Split (same rows as splitting the raw frame for a given random_state)
    X_train_fe, X_test_fe, y_train, y_test = train_test_split(X_fe, y, test_size = 0.2, random_state = 1)

    #Specifying columns to use in case minimal version is used
    if custom is not None and len(custom) > 0:
        cols = list(custom) #Allows users to use a custom set of columns with the model
    elif minimal: #Without private medical data
        cols = ['const', 'GDP_per_capita', 'Schooling', 'region_Asia', 'region_Central America and Caribbean', 'region_European Union','region_Middle East', 'region_North America',
                'region_Oceania','region_Rest of Europe', 'region_South America']
    else: #With private medical data
        cols = ['const','Adult_mortality','Under_five_deaths',
                'GDP_per_capita','region_Central America and Caribbean', 'region_European Union']

    #Fail early with a readable message instead of an opaque pandas indexing error
    missing = [col for col in cols if col not in X_fe.columns]
    if missing:
        raise KeyError(f"Columns not available after feature engineering: {missing}")

    #Creating and fitting the model
    lin_reg = sm.OLS(y_train, X_train_fe[cols])
    results = lin_reg.fit()

    #Using model to make predictions on training data and report Root Mean Squared Error
    y_pred = results.predict(X_train_fe[cols])
    train_rmse = statsmodels.tools.eval_measures.rmse(y_train, y_pred)
    print(f"Training data RMSE:\t{train_rmse}")

    #Predicting on testing data and reporting RMSE
    y_test_pred = results.predict(X_test_fe[cols])
    test_rmse = statsmodels.tools.eval_measures.rmse(y_test, y_test_pred)
    print(f"Testing data RMSE:\t{test_rmse}")

    return results


## Results 

In [None]:
# Fit the model with the full (non-minimal) feature set and display the fit summary.
results = linear_regression("Life Expectancy Data.csv", minimal=False)
results.summary()