In [243]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [244]:
dataset = pd.read_csv('Life Expectancy Data.csv')
dataset.head()
print(dataset)

          Country  Year      Status  Life expectancy   Adult Mortality  \
0     Afghanistan  2015  Developing              65.0            263.0   
1     Afghanistan  2014  Developing              59.9            271.0   
2     Afghanistan  2013  Developing              59.9            268.0   
3     Afghanistan  2012  Developing              59.5            272.0   
4     Afghanistan  2011  Developing              59.2            275.0   
...           ...   ...         ...               ...              ...   
2933     Zimbabwe  2004  Developing              44.3            723.0   
2934     Zimbabwe  2003  Developing              44.5            715.0   
2935     Zimbabwe  2002  Developing              44.8             73.0   
2936     Zimbabwe  2001  Developing              45.3            686.0   
2937     Zimbabwe  2000  Developing              46.0            665.0   

      infant deaths  Alcohol  percentage expenditure  Hepatitis B  Measles   \
0                62     0.01    

In [222]:
X = dataset.iloc[:,:].values
y = dataset.iloc[:20,3].values

In [223]:
X[:5]

array([['Afghanistan', 2015, 'Developing', 65.0, 263.0, 62, 0.01,
        71.27962362, 65.0, 1154, 19.1, 83, 6.0, 8.16, 65.0, 0.1,
        584.25921, 33736494.0, 17.2, 17.3, 0.479, 10.1],
       ['Afghanistan', 2014, 'Developing', 59.9, 271.0, 64, 0.01,
        73.52358168, 62.0, 492, 18.6, 86, 58.0, 8.18, 62.0, 0.1,
        612.696514, 327582.0, 17.5, 17.5, 0.476, 10.0],
       ['Afghanistan', 2013, 'Developing', 59.9, 268.0, 66, 0.01,
        73.21924272, 64.0, 430, 18.1, 89, 62.0, 8.13, 64.0, 0.1,
        631.744976, 31731688.0, 17.7, 17.7, 0.47, 9.9],
       ['Afghanistan', 2012, 'Developing', 59.5, 272.0, 69, 0.01,
        78.1842153, 67.0, 2787, 17.6, 93, 67.0, 8.52, 67.0, 0.1, 669.959,
        3696958.0, 17.9, 18.0, 0.463, 9.8],
       ['Afghanistan', 2011, 'Developing', 59.2, 275.0, 71, 0.01,
        7.097108703, 68.0, 3013, 17.2, 97, 68.0, 7.87, 68.0, 0.1,
        63.537231, 2978599.0, 18.2, 18.2, 0.454, 9.5]], dtype=object)

In [224]:
X[:,3]

array([65.0, 59.9, 59.9, ..., 44.8, 45.3, 46.0], dtype=object)

In [225]:
labelEncoder_X = LabelEncoder()
X[:,0] = labelEncoder_X.fit_transform(X[:,0])
X[:,2] = labelEncoder_X.fit_transform(X[:,2])

In [226]:
X = X[:20,:5]

In [227]:
ct = ColumnTransformer([('OneHotEncoder', OneHotEncoder(sparse=False), [3])], remainder='passthrough')

ex =ct.fit_transform(X)
X = np.array(ct.fit_transform(X))

X

array([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        1.0, 0.0, 0.0, 0.0, 0.0, 0, 2015, 1, 263.0],
       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0, 2014, 1, 271.0],
       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0, 2013, 1, 268.0],
       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0, 2012, 1, 272.0],
       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0, 2011, 1, 275.0],
       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0, 2010, 1, 279.0],
       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0, 2009, 1, 281.0],
       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0, 2008, 1, 

In [228]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=22)

In [229]:
print(type(X_test))
print(X_test.shape)
X_test

<class 'numpy.ndarray'>
(4, 22)


array([[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0, 2005, 1, 291.0],
       [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0, 2001, 1, 316.0],
       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 1.0, 0.0, 1, 2014, 1, 8.0],
       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0, 2012, 1, 272.0]], dtype=object)

In [231]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [232]:
y_pred = regressor.predict(X_test)

In [234]:
y_pred

array([57.32768366, 57.05788924, 76.20515669, 59.1895311 ])

In [236]:
df = pd.DataFrame({'Actual': y_test ,'Predicted': y_pred})
df.head()

Unnamed: 0,Actual,Predicted
0,57.3,57.327684
1,55.3,57.057889
2,77.5,76.205157
3,59.5,59.189531


In [246]:
cols = list(dataset.columns)
print(dataset.columns[:5])
print(regressor.coef_)
coeff_df = pd.DataFrame(regressor.coef_, cols, columns = ['Coefficient'])
coeff_df

Index(['Country', 'Year', 'Status', 'Life expectancy ', 'Adult Mortality'], dtype='object')
[-2.03641995e+00 -4.50750548e-14 -9.60592598e+00 -1.10534807e+00
 -9.43766374e-01 -7.54501016e-01 -6.37551998e-01 -3.42072264e-01
 -9.12252098e-02 -2.96435127e-02  1.76570863e-01  0.00000000e+00
  5.16683276e-01  5.31216301e+00  3.02027080e+00  3.18185250e+00
  0.00000000e+00  3.33891393e+00  9.54103723e+00  8.30509818e-02
  0.00000000e+00 -2.76836606e-02]


Unnamed: 0,Coefficient
Country,-2.03642
Year,-4.507505e-14
Status,-9.605926
Life expectancy,-1.105348
Adult Mortality,-0.9437664
infant deaths,-0.754501
Alcohol,-0.637552
percentage expenditure,-0.3420723
Hepatitis B,-0.09122521
Measles,-0.02964351


In [247]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


Mean Absolute Error: 0.8477212769370155
Mean Squared Error: 1.2159877723309027
Root Mean Squared Error: 1.1027183558510771
