For linear models, we have a few options - linear regression, logistic regrssion, and linear disc. analysis

Lets try linear regression due to the large number of labels. If we go with logistic regression, we are going to have to run a lot of one vs. all because it is binary. Tuning the hyperparameters could turn into a nightmare on top of one vs. all

In [43]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import re

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

import warnings # supress warnings
warnings.filterwarnings('ignore')

In [44]:
#import the data
grayscale_data = pd.read_csv('petfinder-pawpularity-score/meta_image_gray_data.csv')

#remove the unnamed column
grayscale_data.pop('Unnamed: 0')

0          0
1          1
2          2
3          3
4          4
        ... 
9907    9907
9908    9908
9909    9909
9910    9910
9911    9911
Name: Unnamed: 0, Length: 9912, dtype: int64

In [45]:
grayscale_data.head()

Unnamed: 0,Pawpularity,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,...,p8091,p8092,p8093,p8094,p8095,p8096,p8097,p8098,p8099,p8100
0,63,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,...,187.0,188.0,183.0,185.0,180.0,176.0,175.0,180.0,187.0,196.0
1,42,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,...,128.0,125.0,124.0,123.0,122.0,124.0,126.0,127.0,131.0,132.0
2,28,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,...,100.0,97.0,91.0,90.0,100.0,94.0,88.0,93.0,92.0,83.0
3,15,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,...,101.0,105.0,112.0,121.0,130.0,140.0,147.0,155.0,156.0,137.0
4,72,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,...,222.0,229.0,232.0,234.0,230.0,229.0,232.0,234.0,236.0,228.0


In [46]:
#split the data into train and test split so that we can still test our hyperparameters
gray_df_train, gray_df_test = train_test_split(grayscale_data, train_size=0.8, test_size=0.2, random_state=10)

In [47]:
#rescale features
scaler = MinMaxScaler()

In [48]:
#scale all of the numeric features, which are only image data

#run a for loop to generate a list of column names to scale
gray_num_column_ids = []
for i in range(8100):
    i_str = str(i+1)
    gray_num_column_ids.append('p'+i_str)
    
gray_df_train[gray_num_column_ids] = scaler.fit_transform(gray_df_train[gray_num_column_ids])
gray_df_train.head()

Unnamed: 0,Pawpularity,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,...,p8091,p8092,p8093,p8094,p8095,p8096,p8097,p8098,p8099,p8100
6916,6,b28b784d0d79c33b413f3554af315dec,0,1,1,1,0,0,0,0,...,0.176471,0.184314,0.184314,0.176471,0.176471,0.180392,0.172549,0.168627,0.176471,0.184314
5837,65,972fe89216702f65d554f8701c184f08,0,1,1,1,0,0,0,0,...,0.478431,0.45098,0.466667,0.627451,0.701961,0.737255,0.721569,0.67451,0.639216,0.588235
2600,23,43426c09b9bec6a93fe3bf1ed20f2d33,0,1,1,1,0,0,0,0,...,0.788235,0.784314,0.792157,0.796078,0.8,0.788235,0.8,0.788235,0.780392,0.788235
2167,20,37cd6b655c64771d389cc38c194e12ba,0,0,0,0,1,0,1,0,...,0.470588,0.478431,0.454902,0.447059,0.482353,0.470588,0.466667,0.462745,0.509804,0.521569
7026,2,b517a911a6c7b9ef84b7b94813103327,0,1,1,1,0,0,0,0,...,0.882353,0.886275,0.898039,0.898039,0.898039,0.905882,0.909804,0.917647,0.92549,0.913725


In [49]:
#also scale the test data
gray_df_test[gray_num_column_ids] = scaler.fit_transform(gray_df_test[gray_num_column_ids])
gray_df_test.head()

Unnamed: 0,Pawpularity,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,...,p8091,p8092,p8093,p8094,p8095,p8096,p8097,p8098,p8099,p8100
9161,100,ecbb48fc9d345f6e2b03deaf8f1645f0,0,0,1,1,0,0,0,0,...,0.807843,0.8,0.8,0.796078,0.792157,0.788235,0.784314,0.784314,0.776471,0.776471
9695,24,fa4a3d69e1e0e21b62bb33538bc54e61,0,1,1,1,0,0,0,0,...,0.360784,0.321569,0.282353,0.270588,0.27451,0.266667,0.258824,0.262745,0.262745,0.262745
9033,76,e97d059f75a50e9c9805b0dba4d0d84e,0,0,1,1,0,0,0,0,...,0.439216,0.435294,0.427451,0.458824,0.501961,0.470588,0.443137,0.415686,0.392157,0.360784
4617,29,76420f02afab76d2a6eab95efc816347,1,1,1,1,0,0,0,0,...,0.560784,0.533333,0.415686,0.427451,0.45098,0.454902,0.454902,0.45098,0.443137,0.431373
4220,21,6bb7f0653725b30118199fd763945713,0,0,1,1,0,0,0,0,...,0.082353,0.066667,0.141176,0.133333,0.121569,0.078431,0.117647,0.113725,0.105882,0.101961


In [50]:
#split data frames into X_train, y_train, X_test, y_test
y_train = gray_df_train.pop('Pawpularity')
X_train = gray_df_train
X_train.pop('Id')

y_test = gray_df_test.pop('Pawpularity')
X_test = gray_df_test
X_test.pop('Id')

9161    ecbb48fc9d345f6e2b03deaf8f1645f0
9695    fa4a3d69e1e0e21b62bb33538bc54e61
9033    e97d059f75a50e9c9805b0dba4d0d84e
4617    76420f02afab76d2a6eab95efc816347
4220    6bb7f0653725b30118199fd763945713
                      ...               
6500    a8028d608d5a1916c5482616e5838b6c
8934    e6f6665772e5e67240d46d899e01ad78
2756    4708f6747261af730735ff0272cfc73e
3508    5a7fa7cfeb8d5e32116574e6be7ecb6a
3923    6411e12dd43ef887ac45984c01ebf850
Name: Id, Length: 1983, dtype: object

In [51]:
X_train.head()

Unnamed: 0,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,...,p8091,p8092,p8093,p8094,p8095,p8096,p8097,p8098,p8099,p8100
6916,0,1,1,1,0,0,0,0,0,0,...,0.176471,0.184314,0.184314,0.176471,0.176471,0.180392,0.172549,0.168627,0.176471,0.184314
5837,0,1,1,1,0,0,0,0,0,0,...,0.478431,0.45098,0.466667,0.627451,0.701961,0.737255,0.721569,0.67451,0.639216,0.588235
2600,0,1,1,1,0,0,0,0,0,0,...,0.788235,0.784314,0.792157,0.796078,0.8,0.788235,0.8,0.788235,0.780392,0.788235
2167,0,0,0,0,1,0,1,0,0,1,...,0.470588,0.478431,0.454902,0.447059,0.482353,0.470588,0.466667,0.462745,0.509804,0.521569
7026,0,1,1,1,0,0,0,0,0,0,...,0.882353,0.886275,0.898039,0.898039,0.898039,0.905882,0.909804,0.917647,0.92549,0.913725


In [52]:
y_train.head()

6916     6
5837    65
2600    23
2167    20
7026     2
Name: Pawpularity, dtype: int64

In [53]:
#Create a linear regression model
lm = LinearRegression()
lm.fit(X_train, y_train)

LinearRegression()

In [55]:
#above cell started at 7:11PM and ended around 7:20
y_pred = lm.predict(X_test)

r2 = sklearn.metrics.r2_score(y_test, y_pred)
print(r2)

RMSE = sklearn.metrics.mean_squared_error(y_test, y_pred)
print(RMSE)

-63.813542585618364
27042.499391236186


So obviously the linear regression model does not perform well without cross-validation. Next, try it with cross validation, but I am not hopeful that it will work

In [41]:
R2_scores = cross_val_score(lm, X_train, y_train, scoring='r2', cv=5)
RMSE_scores = cross_val_score(lm, X_train, y_train, scoring='neg_root_mean_squared_error', cv=5)
print('R2 Scores:')
R2_scores
print('RMSE Scores:')
RMSE_scores

array([-8.12420679, -9.13295824, -7.79033803, -8.69667594, -8.11275074])

In [None]:
#started above cell at 7:34, ended at 7:50

In [42]:
y_pred

array([ 160.19190682, -180.81195181,   -7.05199884, ...,  104.45837291,
        -59.18401841,  108.29169971])

**References:**

https://www.kaggle.com/jnikhilsai/cross-validation-with-linear-regression

https://towardsdatascience.com/linear-regression-made-easy-how-does-it-work-and-how-to-use-it-in-python-be0799d2f159