## Model Building

### Task

**<a href="#OLS" style="text-decoration:none;">a. Ordinary least regression</a>**<br>
**<a href="#mul" style="text-decoration:none;">b. Multiple linear regression</a>**<br>
**<a href="#svm" style="text-decoration:none;">c. Support vector regressor</a>**<br>
**<a href="#lr" style="text-decoration:none;">d. Lasso regression</a>**<br>
**<a href="#rf" style="text-decoration:none;">e. Random forest regressor</a>**<br>
**<a href="#gs" style="text-decoration:none;">f. GridSearchCV turning</a>**<br>
**<a href="#es" style="text-decoration:none;">g. Ensembling</a>**<br>

####  Performance
R-squared and mean absolute error<br>


In [1]:
#import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm  #for regression statistics
from sklearn.linear_model import LinearRegression #for building regression model
linear_model = LinearRegression()
from sklearn.metrics import mean_squared_error, r2_score #for linear regression evaluations
from sklearn.model_selection import train_test_split #for spliting our datasets
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
regressor = SVR (kernel = "rbf")

In [2]:
#load our dataset
df = pd.read_csv(r"C:\Users\lenovo\Desktop\DSN\December data science project\Regression (Life Expectancy prediction)\eda_df.csv")

In [3]:
df.columns

Index(['Country', 'Status', 'Lifeexpectancy', 'AdultMortality', 'infantdeaths',
       'Alcohol', 'percentageexpenditure', 'HepatitisB', 'Measles', 'BMI',
       'under-fivedeaths', 'Polio', 'Totalexpenditure', 'Diphtheria',
       'HIV/AIDS', 'GDP', 'Population', 'thinness1-19years',
       'thinness5-9years', 'Incomecompositionofresources', 'Schooling',
       'Region', 'IncomeGroup'],
      dtype='object')

In [4]:
#Remove columns that are not related to the target variabl following our EDA
df.drop(["Country", "infantdeaths", "Measles", "Population"], axis = 1, inplace = True)

In [5]:
#Encoding categorical variable
import numpy as np
cat_features = df.select_dtypes(exclude =np.number)
cat_features

Unnamed: 0,Status,Region,IncomeGroup
0,Developing,South Asia,Low income
1,Developing,Europe & Central Asia,Upper middle income
2,Developing,Middle East & North Africa,Lower middle income
3,Developing,Sub-Saharan Africa,Lower middle income
4,Developing,Latin America & Caribbean,High income
...,...,...,...
160,Developing,East Asia & Pacific,Lower middle income
161,Developing,Latin America & Caribbean,Upper middle income
162,Developing,Middle East & North Africa,Low income
163,Developing,Sub-Saharan Africa,Lower middle income


In [6]:
#Create dummies variable for our categorical variables (Status, Region and Income Group)
for col in cat_features:
    print("{} has {} unique categories". format(col, df[col].nunique()))

Status has 2 unique categories
Region has 7 unique categories
IncomeGroup has 4 unique categories


In [7]:
df = pd.get_dummies(columns = ["Status", "Region", "IncomeGroup"], data = df)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165 entries, 0 to 164
Data columns (total 29 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Lifeexpectancy                     165 non-null    float64
 1   AdultMortality                     165 non-null    float64
 2   Alcohol                            165 non-null    float64
 3   percentageexpenditure              165 non-null    float64
 4   HepatitisB                         165 non-null    float64
 5   BMI                                165 non-null    float64
 6   under-fivedeaths                   165 non-null    float64
 7   Polio                              165 non-null    float64
 8   Totalexpenditure                   165 non-null    float64
 9   Diphtheria                         165 non-null    float64
 10  HIV/AIDS                           165 non-null    float64
 11  GDP                                165 non-null    float64

In [9]:
#Seperating target variable from other features
X = df.drop(["Lifeexpectancy"], axis = 1)
y = df["Lifeexpectancy"]

In [10]:
#Spliting of the datasets
from sklearn import model_selection
validation_size = 0.20
seed = 42
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = validation_size, random_state = seed)

<p id='OLS'></p>
<br><br>

### Ordinary least regression

In [11]:
X_sm = X = sm.add_constant(X)
model = sm.OLS(y, X_sm)
model.fit().summary()

0,1,2,3
Dep. Variable:,Lifeexpectancy,R-squared:,0.942
Model:,OLS,Adj. R-squared:,0.932
Method:,Least Squares,F-statistic:,90.95
Date:,"Thu, 14 Jan 2021",Prob (F-statistic):,1.0900000000000001e-73
Time:,19:05:30,Log-Likelihood:,-364.07
No. Observations:,165,AIC:,780.1
Df Residuals:,139,BIC:,860.9
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,33.1568,1.362,24.352,0.000,30.465,35.849
AdultMortality,-0.0419,0.005,-9.106,0.000,-0.051,-0.033
Alcohol,0.0151,0.091,0.165,0.869,-0.166,0.196
percentageexpenditure,0.0007,0.000,1.701,0.091,-0.000,0.001
HepatitisB,-0.0421,0.022,-1.932,0.055,-0.085,0.001
BMI,0.0182,0.030,0.614,0.540,-0.040,0.077
under-fivedeaths,-0.0019,0.002,-1.242,0.216,-0.005,0.001
Polio,0.0305,0.043,0.712,0.478,-0.054,0.115
Totalexpenditure,0.1102,0.141,0.780,0.437,-0.169,0.390

0,1,2,3
Omnibus:,6.859,Durbin-Watson:,2.095
Prob(Omnibus):,0.032,Jarque-Bera (JB):,11.832
Skew:,-0.022,Prob(JB):,0.0027
Kurtosis:,4.311,Cond. No.,1.23e+16


<p id='mul'></p>
<br><br>

### Multiple linear regression

In [12]:
from sklearn.metrics import mean_absolute_error
linear_model.fit(X_train, y_train)
np.mean(cross_val_score(linear_model, X_train, y_train, scoring = "neg_mean_absolute_error", cv = 3))

-2.2306026577866316

<p id='SVM'></p>
<br><br>

### Support vector regressor

In [13]:
regressor.fit(X_train, y_train)
np.mean(cross_val_score(regressor, X_train, y_train, scoring = "neg_mean_absolute_error", cv =3))

-5.562800403467823

<p id='lr'></p>
<br><br>

### Lasso regression

In [15]:
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X_train, y_train)
np.mean(cross_val_score(lasso_reg, X_train, y_train, scoring = "neg_mean_absolute_error", cv = 3))

  positive)


-2.202747391146455

<p id='lr'></p>
<br><br>

### Random Forest Regressor

In [16]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
np.mean(cross_val_score(rf, X_train, y_train, scoring = "neg_mean_absolute_error", cv = 3))

-1.9231926451969708

<p id='gs'></p>
<br><br>

### GridSearchCV

In [17]:
from sklearn.model_selection import GridSearchCV
params = {'n_estimators': range (10, 300, 100), 'criterion': ('mae', 'mse'), 'max_features': ('auto','sqrt','log2')}
rfgs = GridSearchCV(rf, params, scoring = "neg_mean_absolute_error", cv =3)
rfgs.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestRegressor(),
             param_grid={'criterion': ('mae', 'mse'),
                         'max_features': ('auto', 'sqrt', 'log2'),
                         'n_estimators': range(10, 300, 100)},
             scoring='neg_mean_absolute_error')

In [18]:
rfgs.best_score_

-1.8326445707486207

In [19]:
rfgs.best_estimator_

RandomForestRegressor(criterion='mae', n_estimators=110)

In [27]:
alpha = [0.001, 0.01, 0.1,1, 10, 100, 1000]
params = dict(alpha = alpha)
lasso_gs = GridSearchCV(lasso_reg, params, scoring = "neg_mean_absolute_error", cv =3)
lasso_gs.fit(X_train, y_train)

  positive)
  positive)


GridSearchCV(cv=3, estimator=Lasso(alpha=0.001),
             param_grid={'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
             scoring='neg_mean_absolute_error')

In [28]:
lasso_gs.best_score_

-2.0430452780394717

In [29]:
lasso_gs.best_estimator_

Lasso(alpha=0.1)

In [33]:
tpred_lm = linear_model.predict(X_test)
tpred_reg = regressor.predict(X_test)
tpred_gs = rfgs.best_estimator_.predict(X_test)
tpred_lasso_gs = lasso_gs.best_estimator_.predict(X_test)

In [34]:
print("Linear Regression R2:", mean_absolute_error(y_test, tpred_lm))
print("Support vector regressor R2:", mean_absolute_error(y_test, tpred_reg))
print("Random forest regressor R2:", mean_absolute_error(y_test, tpred_gs))
print(mean_absolute_error(y_test, tpred_lasso_gs))

1.8063402594219962
4.682397329150836
2.019403617967469
2.019403617967469


In [None]:
mean_absolute_error(y_test,(tpred_lm+tpred_gs)/2)

In [35]:
print("Linear Regression R2:", r2_score(y_test, tpred_lm))
print("Support vector regressor R2:", r2_score(y_test, tpred_reg))
print("Random forest regressor R2:" r2_score(y_test, tpred_gs))
print("lasso regressor R2:", r2_score(y_test, tpred_lasso_gs))

0.9369401391480359
0.40717764788897004
0.9316760372771723
0.9316760372771723
