<a href="https://colab.research.google.com/github/prof-rossetti/data-analytics-in-python/blob/main/units/4-predictive-analytics/Life_Expectancy_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
from urllib.request import urlretrieve

from pandas import read_csv

csv_url = "https://raw.githubusercontent.com/prof-rossetti/data-analytics-in-python/main/units/4-predictive-analytics/data/life_expectancies.csv"
csv_filename = "life_expectancies.csv"

# Download only when no local copy exists, so re-running the cell is cheap.
# urlretrieve writes to csv_filename explicitly, which guarantees the saved
# file is the one read_csv opens below, and it raises on a failed download
# instead of failing silently. No external wget binary is required.
if not os.path.isfile(csv_filename):
    print("DOWNLOADING CSV...")
    urlretrieve(csv_url, csv_filename)

df = read_csv(csv_filename)
print("ROWS:", len(df))
print("COLS:", df.columns.tolist())
print(df.head())

# looks like:
#  "Country" is the row id
#  "LifeExpectancy" is the target variable
#  "DoctorsPer100K" is the feature


DOWNLOADING CSV...
ROWS: 175
COLS: ['Country', 'LifeExpectancy', 'DoctorsPer100K']
         Country  LifeExpectancy  DoctorsPer100K
0   Afghanistan             60.5             2.7
1       Albania             77.8            11.5
2       Algeria             75.6            12.1
3        Angola             52.4             1.7
4     Argentina             76.3            38.6


In [4]:

# Profile each column: banner, sorted distinct values, then summary statistics.
for column_name in df.columns:
    column = df[column_name]

    print("")
    print("------------")
    print("COL: ", column_name.upper())
    print("------------")
    print(sorted(set(column.tolist())))
    print(column.describe())

# In terms of describing the variables, it looks like:
#   "LifeExpectancy" is a continuous decimal from 50.1 to 83.7, with a mean of 71.3
#   "DoctorsPer100K" is a continuous decimal from 0.1 to 54.0, with a mean of 15.4
#
# There don't appear to be any null values in any of the columns, which is good:
# we don't need to "impute" any missing values.
#
# We might still consider scaling the values between 0 and 1.



------------
COL:  COUNTRY
------------
[' Afghanistan ', ' Albania ', ' Algeria ', ' Angola ', ' Argentina ', ' Armenia ', ' Australia ', ' Austria ', ' Azerbaijan ', ' Bahamas ', ' Bahrain ', ' Bangladesh ', ' Belarus ', ' Belgium ', ' Belize ', ' Benin ', ' Bhutan ', ' Bolivia ', ' Bosnia and Herzegovina ', ' Botswana ', ' Brazil ', ' Brunei ', ' Bulgaria ', ' Burkina Faso ', ' Burundi ', ' Cambodia ', ' Cameroon ', ' Canada ', ' Cape Verde ', ' Central African Republic ', ' Chad ', ' Chile ', ' China ', ' Colombia ', ' Comoros ', ' Costa Rica ', ' Croatia ', ' Cyprus ', ' Czech Republic ', ' DR Congo ', ' Denmark ', ' Djibouti ', ' Dominican Republic ', ' Ecuador ', ' Egypt ', ' El Salvador ', ' Equatorial Guinea ', ' Eritrea ', ' Estonia ', ' Ethiopia ', ' Fiji ', ' Finland ', ' France ', ' Gabon ', ' Gambia ', ' Georgia ', ' Germany ', ' Ghana ', ' Greece ', ' Guatemala ', ' Guinea ', ' Guinea-Bissau ', ' Guyana ', ' Honduras ', ' Hungary ', ' Iceland ', ' India ', ' Indonesia '

In [35]:
import plotly.express as px

# Scatter of doctors per 100K against life expectancy, with an OLS trendline.
# The figure is the cell's last expression, so the notebook renders it.
fig = px.scatter(
    df,
    x="DoctorsPer100K",
    y="LifeExpectancy",
    trendline="ols",
)
fig


In [43]:

from pprint import pprint
from pdb import  set_trace as breakpoint

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import  LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.pipeline import  Pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

#
# TEST/TRAIN SPLIT
#

# Hold out 20% of the rows for testing, using a fixed random_state for the split.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=99)
print("TEST TRAIN SPLIT:", len(df_train), len(df_test))

features = "DoctorsPer100K"
target = "LifeExpectancy"

# The single feature column is reshaped into a 2-D (n_rows, 1) array to avoid a
# fit error (the model wants this shape); the target stays a 1-D Series.
x_train = df_train[features].to_numpy().reshape(-1, 1)
y_train = df_train[target]
print("TRAIN:", x_train.shape, y_train.shape) #> (140, 1) (140,)

x_test = df_test[features].to_numpy().reshape(-1, 1)
y_test = df_test[target]
print("TEST:", x_test.shape, y_test.shape) #> (35, 1) (35,)


TEST TRAIN SPLIT: 140 35
TRAIN: (140, 1) (140,)
TEST: (35, 1) (35,)


In [45]:

#
# MODEL TRAINING
#

model = LinearRegression()
model.fit(x_train, y_train)

#
# MODEL EVALUATION
#

def _print_scores(heading, x, y_true):
    """Print a banner, predict on x with the fitted model, and print the scores.

    Prints R^2, mean absolute error, and mean squared error of the predictions
    against y_true, then returns the predictions so they stay available to
    later cells.
    """
    print("------------------")
    print(heading)
    print("------------------")
    y_pred = model.predict(x)
    print("R^2 SCORE:", r2_score(y_true, y_pred))
    print("MEAN ABS ERR:", mean_absolute_error(y_true, y_pred))
    print("MEAN SQ ERR:", mean_squared_error(y_true, y_pred))
    return y_pred


# Score on the data the model was fit on, then on the held-out rows.
y_train_pred = _print_scores("TRAINING SCORES...", x_train, y_train)
y_test_pred = _print_scores("TESTING SCORES...", x_test, y_test)


------------------
TRAINING SCORES...
------------------
R^2 SCORE: 0.49340903444718687
MEAN ABS ERR: 4.800109230130603
MEAN SQ ERR: 33.851248587233364
------------------
TESTING SCORES...
------------------
R^2 SCORE: 0.49340903444718687
MEAN ABS ERR: 4.800109230130603
MEAN SQ ERR: 33.851248587233364


In [50]:

print("------------------")
print("MODEL PREDICTIONS...")
print("------------------")

# Inputs are 2-D: one row per observation, one column for the single feature.
single_input = [[10.2]]
pred = model.predict(single_input)
print(pred) #> [69.13009545]

# Several observations can be predicted in one call.
batch_inputs = [[doctors] for doctors in (1.8, 10.2, 49.5)]
preds = model.predict(batch_inputs)
print(preds) #> [65.75206424 69.13009545 84.93445578]



------------------
MODEL PREDICTIONS...
------------------
[69.13009545]
[65.75206424 69.13009545 84.93445578]
