In [None]:
# data analysis and wrangling
import math
import pandas as pd
import numpy as np
import random as rnd
from enum import Enum

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline

# machine learning
import sklearn
import sklearn.linear_model as linear_model
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

random_state = 131

In [None]:
# Read the competition CSVs and carve out a 20% hold-out split of the metadata.
input_path = '../input/petfinder-pawpularity-score/'
train_data = pd.read_csv(f'{input_path}train.csv')
test_data = pd.read_csv(f'{input_path}test.csv')

# Features are every column except the image id and the target.
X = train_data.drop(columns=['Id', 'Pawpularity'])
y = train_data["Pawpularity"]

x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Competition test set: ids are kept separately for building a submission frame.
final_test = test_data.drop(columns=['Id'])
final_Id = test_data["Id"]

# 1. Introduction

*Note: Submit version for class 520. Don't output test data to submit.csv.*

The target variable is the “Pawpularity Score”, an integer from 1 to 100 derived from each pet profile's page-view statistics on the listing pages. We want to use supervised learning to train a model that predicts the Pawpularity score, given any image of a pet and the image’s metadata.

The problem can be treated as either a regression problem or a classification problem, because the score only takes integer values from 1 to 100. For classification, we can group the scores into bins of a fixed width to improve performance.

Root Mean Square Error (RMSE) metric $\textrm{RMSE} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2}$ is used as the measurement, and it’s also the score for this contest.

# 2. Data Exploration

The training data has n = 9912 rows, each with p = 12 (binary metadata attributes) + 1 (image). The prediction target is the Pawpularity score, which is an integer from 1 to 100. The images are coloured JPEGs of varying dimensions.

In [None]:
# Column names, dtypes and non-null counts of the training metadata.
train_data.info()

In [None]:
# Preview the first rows of the training metadata.
train_data.head()

In [None]:
# Display the first four training images, each titled with its Pawpularity score.
for i in range(4):
    img_id = train_data.loc[i, 'Id']
    img = plt.imread(f'{input_path}train/{img_id}.jpg')
    plt.imshow(img)
    plt.title(f"Pawpularity:{train_data.loc[i, 'Pawpularity']}")
    plt.show()

In [None]:
# Distribution of the target (`y` is the Pawpularity column of train_data).
sns.histplot(data=y).set_title('Pawpularity score Histogram')

In [None]:
# Correlation of every metadata flag with the target. The string 'Id' column is
# dropped first because DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
print(train_data.drop(columns=['Id']).corr()['Pawpularity'])

In [None]:
# Pairwise correlation heatmap of the numeric columns. The string 'Id' column is
# dropped first because DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
sns.heatmap(train_data.drop(columns=['Id']).corr(), cmap='coolwarm', square=True).set_title('Correlation Matrix')

In [None]:
# Summary statistics of the Pawpularity target.
y.describe()

# 3. Model
We will first train models on the metadata only and on the picture data only, then combine their predictions to see whether we get a better result.

## 3.1 Metadata

- Median
- Linear Regression
- Ridge Regression
- KNN Classification
- Decision Tree Regression
- Decision Tree Classification
- Random Forest Regression
- Random Forest Classification
- AdaBoost Regression
- AdaBoost Classification

---------

### Prepare data

In [None]:
# Define which model's predictions are used to build the final `submission` frame.
class Method(Enum):
    """Selector for the model whose predictions go into `submission`.

    Each model cell compares `submit_method` against one member and, on a
    match, builds `submission` and records that model's RMSE in `acc_final`.
    `No_submit` matches no model cell.
    """
    No_submit = 0
    Median = 1
    Linear_Reg = 2
    Ridge_Reg = 3
    KNN_Cla = 4
    DT_Reg = 5
    DT_Cla = 6
    RF_Reg = 7
    RF_Cla = 8
    AB_Reg = 9
    AB_Cla = 10
    # NOTE(review): 9 and 10 repeat the values of AB_Reg / AB_Cla, so Enum makes
    # Auto_Reg and Auto_Cla aliases of those members (Method.Auto_Reg is
    # Method.AB_Reg). Confirm this is intended; otherwise give them unique values.
    Auto_Reg = 9
    Auto_Cla = 10
# Default: run every model but do not build a submission.
submit_method = Method.No_submit

# ---------------------------------------------------------------

# Some helper function
def into_class(y):
    """Bucket scores into integer classes.

    Each element is mapped to int((value + 5) / 10), i.e. the nearest tens
    bucket for non-negative scores. `y` must support element-wise `+` and `/`
    (e.g. a pandas Series or NumPy array). Returns a list of ints.
    """
    shifted = (y + 5) / 10
    return list(map(int, shifted))

def recover_class(y):
    """Map class values back onto the score scale by multiplying each one by 10.

    Returns a list with one entry per element of `y`.
    """
    scores = []
    for class_value in y:
        scores.append(class_value * 10)
    return scores
    
def RMSE(y, y_pred):
    """Return the root mean squared error between `y` and `y_pred`."""
    mse = mean_squared_error(y, y_pred)
    return math.sqrt(mse)

def Model_Train_Reg(model, x_train, y_train, x_test, y_test):
    """Fit a regressor on the training split and return its RMSE on the test split.

    `model` is fitted in place, so the caller can reuse it afterwards.
    """
    model.fit(x_train, y_train)
    return RMSE(model.predict(x_test), y_test)

def Model_Get_Submission_Reg(model, final_test):
    """Predict `final_test` with a fitted regressor and return a submission frame.

    The frame has an "Id" column (from the module-level `final_Id`) and a
    "Pawpularity" column holding the model's predictions.
    """
    predictions = model.predict(final_test)
    columns = {"Id": final_Id, "Pawpularity": predictions}
    return pd.DataFrame(columns)

def Model_Train_Cla(model, x_train, y_train, x_test, y_test):
    """Fit a classifier on binned scores and return its RMSE on the test split.

    The training targets are binned with `into_class`. Each test prediction is
    the probability-weighted mean of the class labels, mapped back to the score
    scale with `recover_class`. `model` is fitted in place and must provide
    `predict_proba` and `classes_`.
    """
    y_train_class = into_class(y_train)
    model.fit(x_train, y_train_class)
    proba = np.asarray(model.predict_proba(x_test))
    # predict_proba columns follow model.classes_, which skips any bin that is
    # absent from the training labels; weighting by 0..k-1 would then be wrong.
    class_labels = np.asarray(model.classes_, dtype=float)
    preds = recover_class(proba @ class_labels)
    return RMSE(preds, y_test)

def Model_Get_Submission_Cla(model, final_test):
    """Predict `final_test` with a fitted classifier and return a submission frame.

    Predictions are the probability-weighted mean of the class labels mapped
    back to the score scale, the same rule `Model_Train_Cla` uses for the RMSE
    it reports, so the submitted values match the validated score. The frame has
    an "Id" column (from the module-level `final_Id`) and a "Pawpularity" column.
    """
    proba = np.asarray(model.predict_proba(final_test))
    # Columns of predict_proba are ordered like model.classes_.
    class_labels = np.asarray(model.classes_, dtype=float)
    final_result = recover_class(proba @ class_labels)
    submission = pd.DataFrame({
            "Id": final_Id,
            "Pawpularity": final_result
        })
    return submission

In [None]:
# Reload the metadata and rebuild the hold-out split used by every metadata model.
input_path = '../input/petfinder-pawpularity-score/'
train_data = pd.read_csv(f'{input_path}train.csv')
test_data = pd.read_csv(f'{input_path}test.csv')

# Features are every column except the image id and the target.
X = train_data.drop(columns=['Id', 'Pawpularity'])
y = train_data["Pawpularity"]

x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Competition test set, plus the RMSE of whichever model gets selected for submission.
final_test = test_data.drop(columns=['Id'])
final_Id = test_data["Id"]
acc_final = 0

### Model

#### Median

In [None]:
# Baseline: predict the training median for every hold-out row.
median = np.median(y_train)
acc_median = RMSE(np.full(len(y_test), median), y_test)
print(f"Median RMSE: {acc_median}")
if submit_method == Method.Median:
    acc_final = acc_median
    # the scalar median is broadcast across every test Id
    submission = pd.DataFrame({"Id": final_Id, "Pawpularity": median})

#### Linear Regression

In [None]:
# Linear regression on the metadata flags.
model = linear_model.LinearRegression()
acc_linear_reg = Model_Train_Reg(model, x_train, y_train, x_test, y_test)
print(f'Linear Regression RMSE: {acc_linear_reg}')

if submit_method == Method.Linear_Reg:
    acc_final = acc_linear_reg
    submission = Model_Get_Submission_Reg(model, final_test)

#### Ridge Regression

In [None]:
# Ridge regression on the metadata flags.
model = linear_model.Ridge()
acc_ridge_reg = Model_Train_Reg(model, x_train, y_train, x_test, y_test)
# Report the ridge score (this line used to print the linear-regression score).
print('Ridge Regression RMSE:', acc_ridge_reg)

if submit_method is Method.Ridge_Reg:
    submission = Model_Get_Submission_Reg(model, final_test)
    acc_final = acc_ridge_reg

#### KNN Classification

In [None]:
# K-nearest-neighbours classifier over the binned scores (10 neighbours).
model = KNeighborsClassifier(n_neighbors=10)
acc_KNN_cla = Model_Train_Cla(model, x_train, y_train, x_test, y_test)
print(f'KNN Classification RMSE: {acc_KNN_cla}')

if submit_method == Method.KNN_Cla:
    acc_final = acc_KNN_cla
    submission = Model_Get_Submission_Cla(model, final_test)

#### Decision Tree Regression

In [None]:
# Shallow decision-tree regressor; seeded so tie-breaking between equally good
# splits is repeatable across runs.
model = DecisionTreeRegressor(max_depth = 3, min_samples_split = 10, random_state=random_state)
acc_DT_reg = Model_Train_Reg(model, x_train, y_train, x_test, y_test)
print('Decision Tree Regression RMSE:', acc_DT_reg)

if submit_method is Method.DT_Reg:
    submission = Model_Get_Submission_Reg(model, final_test)
    # record this model's own score (previously the ridge score was stored here)
    acc_final = acc_DT_reg

#### Decision Tree Classification

In [None]:
# Shallow decision-tree classifier over the binned scores; seeded so tie-breaking
# between equally good splits is repeatable across runs.
model = DecisionTreeClassifier(max_depth = 3, min_samples_split = 10, random_state=random_state)
acc_DT_cla = Model_Train_Cla(model, x_train, y_train, x_test, y_test)
print('Decision Tree Classification RMSE:', acc_DT_cla)

if submit_method is Method.DT_Cla:
    submission = Model_Get_Submission_Cla(model, final_test)
    acc_final = acc_DT_cla

#### Random Forest Regression

In [None]:
# Random-forest regressor; seeded so the bootstrap samples (and therefore the
# reported RMSE and the predictions reused later) are repeatable across runs.
model = RandomForestRegressor(n_estimators=100, max_depth = 3, min_samples_split = 10,
                              random_state=random_state)
acc_RF_reg = Model_Train_Reg(model, x_train, y_train, x_test, y_test)
print('Random Forest Regression RMSE:', acc_RF_reg)

if submit_method is Method.RF_Reg:
    submission = Model_Get_Submission_Reg(model, final_test)
    acc_final = acc_RF_reg

In [None]:
# The RandomForestRegressor seems to work best, so keep its hold-out predictions;
# they are averaged with the CNN predictions in the combination step later on.
# NOTE: relies on `model` still being the regressor fitted in the previous cell.

y_test_predict_meta = model.predict(x_test)

#let's see what our predictions look like vs the actual 
def ActualvPredictionsGraph(y_test,y_pred,title):
    """Scatter actual values (blue) and predicted values (red) against their position.

    Parameters
    ----------
    y_test : sequence of actual target values.
    y_pred : sequence of predicted values, plotted against its own positions.
    title : figure title.

    Draws and shows the figure; returns None.
    """
    fig, ax = plt.subplots(figsize=(12,3))
    ax.scatter(range(len(y_test)), y_test, color='blue')
    ax.scatter(range(len(y_pred)), y_pred, color='red')
    ax.set_xlabel('Index ')
    ax.set_ylabel('Pawpularity ')
    ax.set_title(title,fontdict = {'fontsize' : 15})
    # explicit patches keep the legend colours tied to the two scatter series
    ax.legend(handles = [mpatches.Patch(color='red', label='prediction'),mpatches.Patch(color='blue', label='actual')])
    plt.show()

# Predict the hold-out split, then plot the first 200 points and the full set.
y_pred = model.predict(x_test)
ActualvPredictionsGraph(
    y_test[:200],
    y_pred[:200],
    title="First 200 Actual v. Predicted for Random Forest Regression",
)
ActualvPredictionsGraph(y_test, y_pred, title="All Actual v. Predicted")

#### Random Forest Classification

In [None]:
# Random-forest classifier over the binned scores; seeded so the bootstrap
# samples (and therefore the reported RMSE) are repeatable across runs.
model = RandomForestClassifier(n_estimators=100, max_depth = 3, min_samples_split = 10,
                               random_state=random_state)
acc_RF_cla = Model_Train_Cla(model, x_train, y_train, x_test, y_test)
print('Random Forest Classification RMSE:', acc_RF_cla)

if submit_method is Method.RF_Cla:
    submission = Model_Get_Submission_Cla(model, final_test)
    acc_final = acc_RF_cla

#### AdaBoost Regression

In [None]:
# AdaBoost regressor; seeded so the boosting rounds are repeatable across runs.
model = AdaBoostRegressor(learning_rate = 0.01, n_estimators = 10, random_state=random_state)
acc_AB_reg = Model_Train_Reg(model, x_train, y_train, x_test, y_test)
print('AdaBoost Regression RMSE:', acc_AB_reg)

if submit_method is Method.AB_Reg:
    submission = Model_Get_Submission_Reg(model, final_test)
    acc_final = acc_AB_reg

#### AdaBoost Classification

In [None]:
# AdaBoost classifier over the binned scores; seeded so the boosting rounds are
# repeatable across runs.
model = AdaBoostClassifier(learning_rate = 0.01, n_estimators = 10, random_state=random_state)
acc_AB_cla = Model_Train_Cla(model, x_train, y_train, x_test, y_test)
print('AdaBoost Classification RMSE:', acc_AB_cla)

if submit_method is Method.AB_Cla:
    submission = Model_Get_Submission_Cla(model, final_test)
    acc_final = acc_AB_cla

### Analysis

In [None]:
# One row per metadata model, ranked from lowest (best) to highest RMSE.
models = pd.DataFrame(
    [
        ('Median', acc_median),
        ('Linear Regression', acc_linear_reg),
        ('Ridge Regression', acc_ridge_reg),
        ('KNN Classification', acc_KNN_cla),
        ('Decision Tree Regression', acc_DT_reg),
        ('Decision Tree Classification', acc_DT_cla),
        ('Random Forest Regression', acc_RF_reg),
        ('Random Forest Classification', acc_RF_cla),
        ('AdaBoost Regression', acc_AB_reg),
        ('AdaBoost Classification', acc_AB_cla),
    ],
    columns=['Model', 'Score'],
)
models.sort_values(by='Score', ascending=True)

## 3.2 Picture Data

In [None]:
import cv2
# for evaluating the model
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# CNN
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import Linear, ReLU, CrossEntropyLoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm2d, Dropout
from torch.optim import Adam, SGD

from skimage.transform import resize

random_state = 131

In [None]:
# Reload the metadata CSVs; the image ids in them drive the image loading below.
input_path = '../input/petfinder-pawpularity-score/'
train_data = pd.read_csv(f'{input_path}train.csv')
test_data = pd.read_csv(f'{input_path}test.csv')

def getImg(img_id, dire, size=(28, 28)):
    """Load one competition image as a channel-first float32 array scaled to [0, 1].

    Parameters
    ----------
    img_id : image id; the file read is `<input_path><dire>/<img_id>.jpg`.
    dire : sub-directory under `input_path` (the notebook uses 'train' and 'test').
    size : (width, height) passed to `cv2.resize`. The default 28x28 matches the
        `14 * 14 * 32` input size hard-coded in the CNN's first linear layer.

    Raises
    ------
    FileNotFoundError : if OpenCV cannot read the file.
    """
    # reading the image
    path = input_path + dire + '/'+ img_id + '.jpg'
    image = cv2.imread(path)
    if image is None:
        # cv2.imread signals a missing/unreadable file by returning None rather
        # than raising; fail here with the path instead of inside cv2.resize.
        raise FileNotFoundError('Could not read image: ' + path)
    image = cv2.resize(image, size)
    img = image.astype(np.float32) / 255.0 # converting the type of pixel to float 32
    # move the channel axis to the front: (H, W, C) -> (C, H, W)
    img = np.moveaxis(img, -1, 0)
    return img

# Decode every training image, then split with the same seed as the metadata models.
load_img = [getImg(img_id, 'train') for img_id in tqdm(train_data['Id'])]
X = np.array(load_img)
y = train_data["Pawpularity"]

x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Decode the competition test images the same way.
load_img = [getImg(img_id, 'test') for img_id in tqdm(test_data['Id'])]
final_test = np.array(load_img)
final_Id = test_data["Id"]
acc_final = 0

In [None]:
print(np.shape(x_train))
# image arrays -> torch tensors (np.array makes a copy first)
x_train, x_test, final_test = (
    torch.from_numpy(np.array(images)) for images in (x_train, x_test, final_test)
)

# targets -> column tensors with one row per sample
y_train, y_test = (
    torch.from_numpy(np.array(target).reshape(len(target), 1))
    for target in (y_train, y_test)
)

# shapes of the training and hold-out data
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

In [None]:
class CNN(torch.nn.Module):
    """Small convolutional regressor that outputs one value per image.

    Pipeline: 3x3 convolution (3 -> 32 channels, padding 1) -> ReLU -> 2x2 max
    pooling -> flatten -> linear (128 units) -> ReLU -> dropout (p=0.5) ->
    linear (1 unit). The first linear layer expects 14 * 14 * 32 features,
    which corresponds to 3-channel 28x28 inputs.
    """

    def __init__(self):
        super().__init__()
        self.conv_1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2)
        self.linear_1 = nn.Linear(32 * 14 * 14, 128)
        self.linear_2 = nn.Linear(128, 1)
        self.dropout = nn.Dropout(p=0.5)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return a (batch, 1) tensor of predictions for a batch of images."""
        features = self.max_pool2d(self.relu(self.conv_1(x)))
        # flatten everything except the batch dimension
        flat = features.reshape(features.size(0), -1)
        hidden = self.dropout(self.relu(self.linear_1(flat)))
        return self.linear_2(hidden)

In [None]:
epochs = 200
# Seed torch so weight initialisation and dropout masks are repeatable between runs.
torch.manual_seed(random_state)
model = CNN()
criterion = torch.nn.MSELoss()
optimizer = Adam(model.parameters(), lr=1e-3)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

# float32 working copies on the training device (MSELoss needs float targets);
# x_train / y_train / x_test / y_test themselves are left untouched.
x_train_temp = x_train.to(device=device, dtype=torch.float32)
y_train_temp = y_train.to(device=device, dtype=torch.float32)
x_test_temp = x_test.to(device=device, dtype=torch.float32)
y_test_temp = y_test.to(device=device, dtype=torch.float32)

train_loss = []
# training iteration: one full-batch gradient step per epoch
for epoch in range(epochs):
    # zero gradient
    optimizer.zero_grad()
    # forward path
    y_predicted = model(x_train_temp)
    loss = criterion(y_predicted, y_train_temp)
    running_loss = loss.item()
    # backpropagating
    loss.backward()
    # optimizes the weights
    optimizer.step()
    train_loss.append(running_loss)
    if (epoch+1) % 20 == 0:
        # Evaluate with dropout disabled and without building an autograd graph,
        # then switch back to training mode for the next epoch.
        model.eval()
        with torch.no_grad():
            y_test_predict = model(x_test_temp)
            loss_test = criterion(y_test_predict, y_test_temp).item()
        model.train()
        # the square root of the MSE is printed, i.e. these are RMSE values
        print(f'epoch: {epoch+1}, loss: {np.sqrt(running_loss):.4f}, test_loss: {np.sqrt(loss_test):.4f}' )

# Leave the trained network in inference mode so later predictions run without dropout.
model.eval()

In [None]:
# Predict the hold-out images in inference mode: eval() disables dropout so the
# predictions are deterministic, and no_grad() skips building an autograd graph.
model.eval()
with torch.no_grad():
    y_test_predict_cnn = model(x_test_temp).cpu().numpy().reshape(-1)


## 3.3 Combination

In [None]:
# Average the CNN and metadata predictions for the hold-out rows and score the blend.
print(np.asarray(y_test_predict_cnn))
print(np.asarray(y_test_predict_meta))
y_test_average = 0.5 * (y_test_predict_cnn + y_test_predict_meta)
acc_average = RMSE(y_test, y_test_average)
print(f'Average RMSE: {acc_average}')

## 3.4 Conclusion
analyze the performance and output result to submit.csv

# Reference

https://www.kaggle.com/danielkorth/quick-pawpularity-eda/notebook

https://www.kaggle.com/alexteboul/tutorial-part-2-model-building-using-the-metadata#Tutorial-Part-2:-Model-Building-using-the-Metadata