**Ensemble Model Approach**

In [160]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import re

import math
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn import metrics

import warnings # supress warnings
warnings.filterwarnings('ignore')

In [78]:
#import the data
meta_data = pd.read_csv('petfinder-pawpularity-score/train.csv')

In [79]:
meta_data.head()

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72


In [80]:
#split the data into train and test split so that we can still test our hyperparameters
meta_train, meta_test = train_test_split(meta_data, train_size=0.8, test_size=0.2, random_state=10)

In [81]:
#split data frames into X_train, y_train, X_test, y_test
#assign training labels
y_train = meta_train.pop('Pawpularity')
#assign training data
X_train = meta_train
#remove Ids from training data
X_train.pop('Id')

#assign testing labels
y_test = meta_test.pop('Pawpularity')
#assign testing data
X_test = meta_test
#remove Ids from testing data
X_test.pop('Id')

9161    ecbb48fc9d345f6e2b03deaf8f1645f0
9695    fa4a3d69e1e0e21b62bb33538bc54e61
9033    e97d059f75a50e9c9805b0dba4d0d84e
4617    76420f02afab76d2a6eab95efc816347
4220    6bb7f0653725b30118199fd763945713
                      ...               
6500    a8028d608d5a1916c5482616e5838b6c
8934    e6f6665772e5e67240d46d899e01ad78
2756    4708f6747261af730735ff0272cfc73e
3508    5a7fa7cfeb8d5e32116574e6be7ecb6a
3923    6411e12dd43ef887ac45984c01ebf850
Name: Id, Length: 1983, dtype: object

In [82]:
#quick view of training data
X_train.head()

Unnamed: 0,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur
6916,0,1,1,1,0,0,0,0,0,0,0,0
5837,0,1,1,1,0,0,0,0,0,0,0,0
2600,0,1,1,1,0,0,0,0,0,0,0,0
2167,0,0,0,0,1,0,1,0,0,1,0,0
7026,0,1,1,1,0,0,0,0,0,0,0,0


In [83]:
#quick view of testing data
y_train.head()

6916     6
5837    65
2600    23
2167    20
7026     2
Name: Pawpularity, dtype: int64

**Ensemble Classifier 1:**
1. Random Forest
2. SVM with RBF Kernel
3. Logistic Regression

In [130]:
#Create Ensemble Model 1
estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)), ('svr', make_pipeline(StandardScaler(), SVC(kernel='rbf',random_state=42)))]
clf_1 = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

In [131]:
#fit the model with the training data and labels
clf_1.fit(X_train, y_train)

StackingClassifier(estimators=[('rf',
                                RandomForestClassifier(n_estimators=10,
                                                       random_state=42)),
                               ('svr',
                                Pipeline(steps=[('standardscaler',
                                                 StandardScaler()),
                                                ('svc',
                                                 SVC(random_state=42))]))],
                   final_estimator=LogisticRegression())

In [132]:
#generate predictions on the test data set
y_pred_1 = clf_1.predict(X_test)

In [133]:
#calculate RMSE
RMSE_1 = math.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred_1))
print('RMSE: ')
print(RMSE_1)

RMSE: 
23.183051088914624


**Ensemble Classifier 2:**
1. Random Forest
2. SVM with Linear Kernel

In [142]:
#Create Ensemble Model 2
estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42))]
clf_2 = StackingClassifier(estimators=estimators, final_estimator=SVC(kernel='linear'))

In [143]:
#fit the model with the training data and labels
clf_2.fit(X_train, y_train)

StackingClassifier(estimators=[('rf',
                                RandomForestClassifier(n_estimators=50,
                                                       random_state=42))],
                   final_estimator=SVC(kernel='linear'))

In [144]:
#generate predictions on the test data set
y_pred_2 = clf_2.predict(X_test)

In [145]:
#calculate RMSE
RMSE_2 = math.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred_2))
print('RMSE: ')
print(RMSE_2)

RMSE: 
21.69115726697019


**Ensemble Classifier 3:**
1. Random Forest
2. Logistic Regression
3. SVM with Linear Kernel

In [154]:
#Create Ensemble Model 3
estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)), ('lr', LogisticRegression())]
clf_3 = StackingClassifier(estimators=estimators, final_estimator=SVC(kernel='linear'))

In [155]:
#fit the model with the training data and labels
clf_3.fit(X_train, y_train)

StackingClassifier(estimators=[('rf',
                                RandomForestClassifier(n_estimators=10,
                                                       random_state=42)),
                               ('lr', LogisticRegression())],
                   final_estimator=SVC(kernel='linear'))

In [158]:
#generate predictions on the test data set
y_pred_3 = clf_3.predict(X_test)

In [159]:
#calculate RMSE
RMSE_3 = math.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred_3))
print('RMSE: ')
print(RMSE_3)

RMSE: 
21.691110769960467


**Ensemble Classifier 4:**
1. KNN
2. Decision Tree
3. SVM 

In [176]:
#Create Ensemble Model 4
estimators = [('knn', KNeighborsClassifier(n_neighbors=1000)), ('dt', DecisionTreeClassifier(max_depth=25))]
clf_4 = StackingClassifier(estimators=estimators, final_estimator=SVC(kernel='linear'))

In [177]:
#fit the model with the training data and labels
clf_4.fit(X_train, y_train)

StackingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=1000)),
                               ('dt', DecisionTreeClassifier(max_depth=25))],
                   final_estimator=SVC(kernel='linear'))

In [178]:
#generate predictions on the test data set
y_pred_4 = clf_4.predict(X_test)

In [179]:
#calculate RMSE
RMSE_4 = math.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred_4))
print('RMSE: ')
print(RMSE_4)

RMSE: 
21.697154545717193


**Ensemble 5: Only Random Forest**

In [182]:
clf_5 = RandomForestClassifier(n_estimators=50, criterion='gini')
clf_5.fit(X_train, y_train)

RandomForestClassifier(n_estimators=50)

In [183]:
y_pred_5 = clf_5.predict(X_test)

In [184]:
#calculate RMSE
RMSE_5 = math.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred_5))
print('RMSE: ')
print(RMSE_5)

RMSE: 
23.87246539708768
