# Split and normalize the dataset to fit and test

In [1]:
import pandas as pd
from sklearn import preprocessing, linear_model
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor


In [2]:
df = pd.read_csv('cleaned_csv.csv')

df.columns

Index(['Year', 'Life expectancy', 'Adult Mortality', 'infant deaths',
       'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Measles', 'BMI',
       'under-five deaths',
       ...
       'United Republic of Tanzania', 'United States of America', 'Uruguay',
       'Uzbekistan', 'Vanuatu', 'Venezuela (Bolivarian Republic of)',
       'Viet Nam', 'Yemen', 'Zambia', 'Zimbabwe'],
      dtype='object', length=203)

## Split the dataset
The target column is 'Life expectancy'.

In [3]:
X = df.drop(['Life expectancy'], axis=1)
Y = df['Life expectancy']
X_train , X_test , Y_train , Y_test = train_test_split(X,Y, test_size = 0.2 , random_state = 1)

## Scale the dataset

In [4]:
y_scaler = preprocessing.MinMaxScaler()
x_scaler = preprocessing.MinMaxScaler()
X_train_scaled = x_scaler.fit_transform(X_train.values)
y_train_scaled = y_scaler.fit_transform(Y_train.values.reshape(-1, 1))
X_test_scaled = x_scaler.fit_transform(X_test.values)
y_test_scaled = y_scaler.fit_transform(Y_test.values.reshape(-1, 1))

## Models

### Lasso Regression

In [5]:
lasso_r = Lasso(alpha=0.01)
lasso_r.fit(X_train_scaled, y_train_scaled)
predictions = lasso_r.predict(X_test_scaled)

In [6]:
print('R2-squared:', lasso_r.score(X_test_scaled, y_test_scaled))
print('mean squared error:', mean_squared_error(y_test_scaled, predictions))
print('root squared mean squared error:', math.sqrt(mean_squared_error(y_test_scaled, predictions)))

R2-squared: 0.5700543748521967
mean squared error: 0.015230772567421504
root squared mean squared error: 0.12341301619935194


### Ridge Regression

In [7]:
ridge_r = Ridge(alpha=0.01, random_state=938)
ridge_r.fit(X_train_scaled, y_train_scaled)
predictions = ridge_r.predict(X_test_scaled)

In [8]:
print('R2-squared:', ridge_r.score(X_test_scaled, y_test_scaled))
print('mean squared error:', mean_squared_error(y_test_scaled, predictions))
print('root squared mean squared error:', math.sqrt(mean_squared_error(y_test_scaled, predictions)))

R2-squared: 0.9479431398825882
mean squared error: 0.0018441080700607384
root squared mean squared error: 0.0429430794198639


### Random Forest

In [9]:
rf_model = RandomForestRegressor(n_estimators = 10, random_state = 123)
rf_model = rf_model.fit(X_train_scaled, y_train_scaled)
predictions = rf_model.predict(X_test_scaled)

  


In [10]:
print('R2-squared:', rf_model.score(X_test_scaled, y_test_scaled))
print('mean squared error:', mean_squared_error(y_test_scaled, predictions))
print('root squared mean squared error:', math.sqrt(mean_squared_error(y_test_scaled, predictions)))

R2-squared: 0.9451047492199081
mean squared error: 0.0019446577212542152
root squared mean squared error: 0.044098273449809974
