**This approach is only using the meta data with linear regression because the performance with the image data was very poor.**

**Models Attempted in this Notebook:**
1. Linear Regression
2. Ridge Regression
3. Lasso Regression
4. Logistic Regression

In [53]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import re

import math
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

import warnings # supress warnings
warnings.filterwarnings('ignore')

In [54]:
#import the data
meta_data = pd.read_csv('petfinder-pawpularity-score/train.csv')

In [55]:
#show data header
meta_data.head()

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72


In [56]:
#split the data into train and test split so that we can still test our hyperparameters
meta_train, meta_test = train_test_split(meta_data, train_size=0.8, test_size=0.2, random_state=10)

In [57]:
#split data frames into X_train, y_train, X_test, y_test
#assign training labels
y_train = meta_train.pop('Pawpularity')
#assign training data
X_train = meta_train
#remove Id from training data
X_train.pop('Id')

#assign testing labels
y_test = meta_test.pop('Pawpularity')
#assign testing data
X_test = meta_test
#remove Ids from testing data
X_test.pop('Id')

9161    ecbb48fc9d345f6e2b03deaf8f1645f0
9695    fa4a3d69e1e0e21b62bb33538bc54e61
9033    e97d059f75a50e9c9805b0dba4d0d84e
4617    76420f02afab76d2a6eab95efc816347
4220    6bb7f0653725b30118199fd763945713
                      ...               
6500    a8028d608d5a1916c5482616e5838b6c
8934    e6f6665772e5e67240d46d899e01ad78
2756    4708f6747261af730735ff0272cfc73e
3508    5a7fa7cfeb8d5e32116574e6be7ecb6a
3923    6411e12dd43ef887ac45984c01ebf850
Name: Id, Length: 1983, dtype: object

In [58]:
#quick look at training data
X_train.head()

Unnamed: 0,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur
6916,0,1,1,1,0,0,0,0,0,0,0,0
5837,0,1,1,1,0,0,0,0,0,0,0,0
2600,0,1,1,1,0,0,0,0,0,0,0,0
2167,0,0,0,0,1,0,1,0,0,1,0,0
7026,0,1,1,1,0,0,0,0,0,0,0,0


In [59]:
#quick look at training labels
y_train.head()

6916     6
5837    65
2600    23
2167    20
7026     2
Name: Pawpularity, dtype: int64

In [61]:
#Create a linear regression model
lm = LinearRegression()
lm.fit(X_train, y_train)

LinearRegression()

In [62]:
#above cell started at 7:11PM and ended around 7:20
y_pred_lin_reg = lm.predict(X_test)

#calculate R squared value
r2 = sklearn.metrics.r2_score(y_test, y_pred_lin_reg)
print(r2)
#calculate mean absolute error
neg_mean_abs_err = sklearn.metrics.mean_absolute_error(y_test, y_pred_lin_reg)
print(neg_mean_abs_err)
#calculate root mean square error
lin_reg_RMSE = math.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred_lin_reg))
lin_reg_RMSE

0.0020122486417494256
15.20007403789931


20.405777172912124

**Attmepting Cross Validation Scoring**

In [63]:
#get RMSE score using cross validation
lm_scores = cross_val_score(lm, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)
lm_scores  

array([-20.82178305, -19.92767999, -20.75745571, -21.06901215,
       -20.66647169])

**Lets also try L2 regularization using ridge regression, which minimizes the loss function of linear least squares. The following tests multiple alpha values, fitting the model for each value of alpha as it runs over the for loop**

In [50]:
#list of alphas to test
alpha = [0.001, 0.1, 1, 2, 5]

#loop over each value of alpha, fit model, print results
for i in alpha:
    ridge = Ridge(alpha=i)
    ridge.fit(X_train, y_train)
    y_pred_ridge = ridge.predict(X_test)
    print('-'*20)
    print('Alpha: '+str(i))
    r2_ridge = sklearn.metrics.r2_score(y_test, y_pred_ridge)
    print('R^2: '+str(r2_ridge))
    neg_mean_abs_err_ridge = sklearn.metrics.mean_absolute_error(y_test, y_pred_ridge)
    print('Neg. Mean Abs. Error: '+str(neg_mean_abs_err_ridge))
    RMSE_ridge = math.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred_ridge))
    print('RMSE: '+str(RMSE_ridge))

--------------------
Alpha: 0.001
R^2: 0.002012237843247666
Neg. Mean Abs. Error: 15.200074073392154
RMSE: 20.405777283310183
--------------------
Alpha: 0.1
R^2: 0.00201116907894594
Neg. Mean Abs. Error: 15.200077584783395
RMSE: 20.405788209777068
--------------------
Alpha: 1
R^2: 0.0020014791683298228
Neg. Mean Abs. Error: 15.200109289408063
RMSE: 20.405887273903698
--------------------
Alpha: 2
R^2: 0.0019907679298079017
Neg. Mean Abs. Error: 15.200144061614234
RMSE: 20.405996778945447
--------------------
Alpha: 5
R^2: 0.0019589849859891206
Neg. Mean Abs. Error: 15.20024555990199
RMSE: 20.406321704539742


**Lets try fitting the Ridge model with the best alpha from above and running the cross validation score function**

In [64]:
#calculating RMSE on ridge model using cross validation (L2 regularization)
ridge = Ridge(alpha=0.001)
ridge_scores = cross_val_score(ridge, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)
ridge_scores 

array([-20.82178284, -19.92767993, -20.75745556, -21.06901208,
       -20.66647163])

**Lets also try the lasso**

In [52]:
#list of alphas to test
alpha = [0.001, 0.1, 1, 2, 5]

#loop over each value of alpha, fit model, print results
for i in alpha:
    lasso = Lasso(alpha=i)
    lasso.fit(X_train, y_train)
    y_pred_lasso = lasso.predict(X_test)
    print('-'*20)
    print('Alpha: '+str(i))
    r2_lasso = sklearn.metrics.r2_score(y_test, y_pred_lasso)
    print('R^2: '+str(r2_lasso))
    neg_mean_abs_err_lasso = sklearn.metrics.mean_absolute_error(y_test, y_pred_lasso)
    print('Neg. Mean Abs. Error: '+str(neg_mean_abs_err_lasso))
    RMSE_lasso = math.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred_lasso))
    print('RMSE: '+str(RMSE_lasso))

--------------------
Alpha: 0.001
R^2: 0.0020368144345211947
Neg. Mean Abs. Error: 15.199691709677248
RMSE: 20.40552602394885
--------------------
Alpha: 0.1
R^2: -3.27384141818321e-05
Neg. Mean Abs. Error: 15.206452322861185
RMSE: 20.426673318625596
--------------------
Alpha: 1
R^2: -0.0002483024606003692
Neg. Mean Abs. Error: 15.205877083472856
RMSE: 20.428874756100438
--------------------
Alpha: 2
R^2: -0.0002483024606003692
Neg. Mean Abs. Error: 15.205877083472856
RMSE: 20.428874756100438
--------------------
Alpha: 5
R^2: -0.0002483024606003692
Neg. Mean Abs. Error: 15.205877083472856
RMSE: 20.428874756100438


**Compute cross validation scores on lasso model**

In [65]:
#calculating RMSE on lasso model using cross validation (L1 regularization)
lasso = Lasso(alpha=0.001)
lasso_scores = cross_val_score(lasso, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)
lasso_scores 

array([-20.82056275, -19.92707415, -20.75680592, -21.06863568,
       -20.66599196])

**Logistic Regression Attempt (just for fun > I did not think it would perform well due to the large number of labels**

In [30]:
#build and fit model
lr = LogisticRegression(random_state=0)
lr.fit(X_train, y_train)

LogisticRegression(random_state=0)

In [31]:
y_pred = lr.predict(X_test)

In [36]:
f1score = sklearn.metrics.f1_score(y_test, y_pred, average='weighted')
f1score

0.012483017046184233