In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, make_scorer, r2_score, mean_squared_error
#from skfeature.function.similarity_based import fisher_score, reliefF, trace_ratio
#from skfeature.function.statistical_based import f_score, chi_square, gini_index
#from skfeature.function.information_theoretical_based import FCBF, JMI
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostRegressor


In [None]:

# Read the tabular metadata that ships with the competition, one frame per
# split (train first, then test).
df_train_metadata, df_test_metadata = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/petfinder-pawpularity-score/train.csv',
        '/kaggle/input/petfinder-pawpularity-score/test.csv',
    )
)

In [None]:
import tensorflow as tf
import cv2
import missingno as msno

# Work on copies so the raw metadata frames are left untouched.
train_image = df_train_metadata.copy()
test_image = df_test_metadata.copy()

# Every picture is stored as <split directory>/<Id>.jpg.
train_image["file_path"] = [
    "/kaggle/input/petfinder-pawpularity-score/train/" + pet_id + ".jpg"
    for pet_id in df_train_metadata["Id"]
]
test_image["file_path"] = [
    "/kaggle/input/petfinder-pawpularity-score/test/" + pet_id + ".jpg"
    for pet_id in df_test_metadata["Id"]
]

# Preview the first row*col training images together with their target.
row, col = 5, 4
preview_fig = plt.figure(figsize=(20, 20))
for i in range(row * col):
    preview_ax = preview_fig.add_subplot(row, col, i + 1)
    # OpenCV decodes to BGR; matplotlib expects RGB.
    image = cv2.cvtColor(cv2.imread(train_image.loc[i, 'file_path']), cv2.COLOR_BGR2RGB)
    target = train_image.loc[i, 'Pawpularity']
    preview_ax.imshow(image)
    preview_ax.set_title(f"No: {i}" f"   Pawpularity: {target}")
plt.show()

# def preprocess(image_url):
#   image_string = tf.io.read_file(image_url)
#   image = tf.image.decode_jpeg(image_string, channels=3)
#   image = tf.cast(image, tf.float32) / 255.0
#   image = tf.image.central_crop(image, 1.0)
#   image = tf.image.resize(image, (128, 128))
#   return image

# x_train_image=[]
# for i in train_image['file_path']:
#     x1=preprocess(i)
#     x_train_image.append(x1)

#x_train_image = pd.DataFrame(x_train_image)

#print(x_train_image)
#x_train_image.to_csv('image_features.csv')

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time

#Here is the business:
def _projection_moments(profile):
    """Return (centroid, std, skewness, kurtosis) of a 1D weight profile.

    The profile is treated as un-normalised weights attached to the integer
    positions 0..len(profile)-1.
    """
    coords = np.arange(profile.size)
    total = np.sum(profile)

    centroid = np.sum(coords * profile) / total
    offset = coords - centroid

    std = np.sqrt(np.sum(offset**2 * profile) / total)
    skewness = np.sum(profile * offset**3) / (total * std**3)
    kurtosis = np.sum(profile * offset**4) / (total * std**4)
    return centroid, std, skewness, kurtosis


def image_statistics(Z):
    """Intensity-weighted moments of a 2D array, computed from its projections.

    The array is summed along each axis and the first four moments of the two
    resulting 1D profiles are evaluated.

    Parameters
    ----------
    Z : array-like with exactly two dimensions
        Intensity values (e.g. a grayscale image).

    Returns
    -------
    tuple
        ``(cx, cy, sx, sy, skx, sky, kx, ky)`` where ``cx, cy`` are the
        centroid coordinates, ``sx, sy`` the standard deviations, ``skx, sky``
        the skewness and ``kx, ky`` the kurtosis along x (columns) and y
        (rows).  The kurtosis is *not* the excess kurtosis: a normal
        distribution gives 3.0, so subtract 3 to obtain the excess value.

    Raises
    ------
    ValueError
        If ``Z`` does not have exactly two dimensions.

    Notes
    -----
    A profile that sums to zero, or whose weight sits in a single row/column
    (zero standard deviation), produces NaN/inf entries through division by
    zero instead of raising.
    """
    # Unpacking enforces the 2D requirement before any arithmetic happens.
    h, w = np.shape(Z)

    # Accumulate in float64 regardless of the input dtype, so narrow types
    # such as float32 do not lose precision while being summed.
    Z = np.asarray(Z, dtype=np.float64)

    # Projections along the x and y axes.
    yp = np.sum(Z, axis=1)
    xp = np.sum(Z, axis=0)

    cx, sx, skx, kx = _projection_moments(xp)
    cy, sy, sky, ky = _projection_moments(yp)

    return cx, cy, sx, sy, skx, sky, kx, ky

#We can check that the result is the same if we use the full 2D data array
def image_statistics_2D(Z):
    """Intensity-weighted moments of a 2D array, evaluated on the full grid.

    Every quantity is a Z-weighted average taken over the whole 2D array.

    Returns ``(cx, cy, sx, sy, skx, sky, kx, ky)``: centroid, standard
    deviation, skewness and (non-excess) kurtosis along x (columns) and
    y (rows).
    """
    h, w = np.shape(Z)
    col_idx = np.arange(w)
    row_idx = np.arange(h)

    def weighted_means(x_values, y_values):
        # Spread the per-column / per-row quantities over the full grid and
        # average them with Z as the weight.
        grid_x, grid_y = np.meshgrid(x_values, y_values)
        return np.sum(Z * grid_x) / np.sum(Z), np.sum(Z * grid_y) / np.sum(Z)

    # Centroid (mean position).
    cx, cy = weighted_means(col_idx, row_idx)

    def central_moment(order):
        # Weighted mean of the `order`-th power of the distance to the centroid.
        return weighted_means((col_idx - cx) ** order, (row_idx - cy) ** order)

    # The standard deviation is the square root of the variance.
    vx, vy = central_moment(2)
    sx, sy = np.sqrt(vx), np.sqrt(vy)

    # Skewness: third central moment divided by the SD cubed.
    m3x, m3y = central_moment(3)
    skx = m3x / sx**3
    sky = m3y / sy**3

    # Kurtosis: fourth central moment divided by the SD to the fourth power.
    m4x, m4y = central_moment(4)
    kx = m4x / sx**4
    ky = m4y / sy**4

    return cx, cy, sx, sy, skx, sky, kx, ky


In [None]:
from skimage.io import imread, imshow

#df_label = pd.DataFrame(columns=['Label'])

# Column order matches the tuple returned by image_statistics().
train_feature_columns = [
    'centroid_pr_x', 'centroid_pr_y', 'stddev_pr_x', 'stddev_pr_y',
    'skewness_pr_x', 'skewness_pr_y', 'kurtosis_pr_x', 'kurtosis_pr_y',
]

# Collect one record per image and build the frame once at the end; this is
# far cheaper than appending to a DataFrame row by row and yields float
# columns directly.
train_feature_rows = []
for image_path in train_image["file_path"]:
    image = cv2.imread(image_path)
    # cv2.imread signals an unreadable/missing file by returning None.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Image statistics via the projection method.
    train_feature_rows.append(image_statistics(gray_image))

image_features_train = pd.DataFrame(train_feature_rows, columns=train_feature_columns)

# Cache the features so later cells can reload them without recomputing.
image_features_train.to_csv('/kaggle/working/image_features_train.csv')

In [None]:
from skimage.io import imread, imshow

#df_label = pd.DataFrame(columns=['Label'])

# Column order matches the tuple returned by image_statistics().
test_feature_columns = [
    'centroid_pr_x', 'centroid_pr_y', 'stddev_pr_x', 'stddev_pr_y',
    'skewness_pr_x', 'skewness_pr_y', 'kurtosis_pr_x', 'kurtosis_pr_y',
]

# Collect one record per image and build the frame once at the end; this is
# far cheaper than appending to a DataFrame row by row and yields float
# columns directly.
test_feature_rows = []
for image_path in test_image["file_path"]:
    image = cv2.imread(image_path)
    # cv2.imread signals an unreadable/missing file by returning None.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Image statistics via the projection method.
    test_feature_rows.append(image_statistics(gray_image))

image_features_test = pd.DataFrame(test_feature_rows, columns=test_feature_columns)

# Cache the features so later cells can reload them without recomputing.
image_features_test.to_csv('/kaggle/working/image_features_test.csv')

In [None]:
# Reload the cached image features written by the extraction cells
# (train first, then test).
image_features_train, image_features_test = (
    pd.read_csv(cache_path)
    for cache_path in (
        '/kaggle/working/image_features_train.csv',
        '/kaggle/working/image_features_test.csv',
    )
)

In [None]:
# Modelling frames: the metadata columns 'Subject Focus'..'Blur' side by side
# with the eight image-moment features. The training frame additionally
# carries the target as its first column.
train_parts = [
    df_train_metadata['Pawpularity'],
    df_train_metadata.loc[:, 'Subject Focus':'Blur'],
    image_features_train.loc[:, 'centroid_pr_x':'kurtosis_pr_y'],
]
test_parts = [
    df_test_metadata.loc[:, 'Subject Focus':'Blur'],
    image_features_test.loc[:, 'centroid_pr_x':'kurtosis_pr_y'],
]

df_train = pd.concat(train_parts, axis=1)
df_test = pd.concat(test_parts, axis=1)

df_test.head(3)

In [None]:

# Target vector and feature matrix (every column after the target).
label = np.asarray(df_train['Pawpularity'])
features = np.asarray(df_train.loc[:, 'Subject Focus':'kurtosis_pr_y'])

# Scale every feature to [0, 10]; the fitted scaler is kept so the same
# transformation can be applied to unseen data.
scaler = preprocessing.MinMaxScaler(feature_range=(0,10)).fit(features)
scaled_feature = scaler.transform(features)

# Feature column indices ordered from most to least relevant.
# NOTE(review): presumably the output of a JMI feature ranking (see the
# printed header below) - confirm provenance.
# The list previously ended with a second 0, which duplicated a column.
ranked_index = [12, 8, 13, 14, 15, 16, 17, 18, 19, 10,  9,  1,  6,  2,  3,  7, 11,  5,  4,  0]

# Guard against typos in the hand-written ranking.
assert len(set(ranked_index)) == len(ranked_index), "ranked_index contains duplicates"
assert max(ranked_index) < scaled_feature.shape[1], "ranked_index refers to a missing column"

# Columns of the scaled matrix re-ordered by rank.
result = scaled_feature[:, ranked_index]

print("\nJMI")
print(ranked_index)
print(result)

In [None]:
def rmse(predict, actual):
    """Root-mean-square error between predictions and ground truth.

    Both arguments are converted to NumPy arrays; the result is the square
    root of the mean squared element-wise difference.
    """
    residual = np.array(predict) - np.array(actual)
    return np.sqrt(np.mean(residual ** 2))

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

# Candidate feature counts: the top-1, top-2, ..., top-N ranked columns.
jumlah_fitur = range(1, scaled_feature.shape[1] + 1)
scores = []

# Start from +inf so the first evaluated model always becomes the incumbent
# and the best_* names are guaranteed to be bound after the loop.
best_score = np.inf
best_model = None
best_parameter = None
best_X_test = None
best_feature_number = 0

# The row partition depends only on the number of samples and random_state,
# so split once and slice columns inside the loop.
X_train_all, X_test_all, y_train, y_test = train_test_split(
    result, label, test_size=0.3, random_state=0
)

# Hyper-parameter grid for the KNN regressor.
# 'wminkowski' was dropped: it needs a weight vector that is never supplied
# and is not available in current scikit-learn, so all of its fits failed.
# NOTE(review): 'seuclidean' and 'mahalanobis' normally need metric_params
# (V / VI); confirm they fit in the installed scikit-learn version.
param_grid = [
    {
        'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
        'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski', 'seuclidean', 'mahalanobis'],
    }
]

# Negated MSE, because GridSearchCV maximises its scoring function.
metric = make_scorer(mean_squared_error, greater_is_better=False)

# Tune the hyper-parameters for every feature count.
for jumlah_fitur_terbaik in jumlah_fitur:
    X_train = X_train_all[:, :jumlah_fitur_terbaik]
    X_test = X_test_all[:, :jumlah_fitur_terbaik]

    regressor = KNeighborsRegressor()
    model = GridSearchCV(regressor, param_grid, scoring=metric, cv=5, refit=True, verbose=1)

    # Fit the grid search; refit=True retrains the best setting on X_train.
    model.fit(X_train, y_train)

    # Best hyper-parameters and the corresponding refitted estimator.
    print(model.best_params_)
    print(model.best_estimator_)

    # model.score returns the negated MSE; abs() turns it back into the MSE.
    score = abs(model.score(X_test, y_test))
    scores.append(score)

    # Keep the feature count / hyper-parameters with the lowest hold-out MSE.
    if score < best_score:
        best_score = score
        best_model = model
        best_feature_number = jumlah_fitur_terbaik
        best_parameter = model.best_params_

        # Hold-out features restricted to the best feature count.
        best_X_test = X_test


plt.figure()
plt.xlabel('jumlah_fitur_terbaik')
plt.ylabel('score')
plt.scatter(jumlah_fitur, scores)
plt.grid()

print(scores)
print('Jumlah fitur terbaik : ',best_feature_number)
print('Score terbaik : ',best_score)
print('Parameter terbaik : ',best_parameter)

final_predictions = best_model.predict(best_X_test)

In [None]:
from sklearn.metrics import r2_score

# Predicted-vs-actual scatter for the hold-out split.
fig, ax = plt.subplots()

r2_value = r2_score(y_test, final_predictions)
rmse_value = rmse(y_test, final_predictions)

# Axes-fraction coordinates keep the annotations in the top-left corner
# whatever the data range is. The second value comes from rmse(), so it is
# labelled RMSE.
ax.text(0.03, 0.97, '$R^2=$'+str(round(r2_value,4)), transform=ax.transAxes,
        fontsize=12, verticalalignment='top', multialignment='center')
ax.text(0.03, 0.90, '$RMSE=$'+str(round(rmse_value,4)), transform=ax.transAxes,
        fontsize=12, verticalalignment='top', multialignment='center')

# Both axes share the same fixed range of 1..100.
lims = [1, 100]
ax.set_xlim(lims)
ax.set_ylim(lims)

ax.set_xlabel('Actual Value', fontsize=14)
ax.set_ylabel('Predicted Value', fontsize=14)
ax.scatter(y_test, final_predictions, s=100, c=y_test, cmap='viridis')

# Identity line: a perfect prediction falls exactly on it.
ax.plot(lims, lims, 'r--', alpha=0.75, zorder=0)
ax.set_aspect('equal')
ax.grid(True, which='both')

# Shaded band of +/- one RMSE around the identity line, spanning the whole
# axis range.
xvalue = np.linspace(lims[0], lims[1], 100)
lsigma = ax.fill_between(xvalue, xvalue+rmse_value, xvalue-rmse_value, color='blue', alpha=0.3)

plt.show()

In [None]:
# Feature matrix for the competition test set, in training column order.
features_test = np.asarray(df_test.loc[:, "Subject Focus":"kurtosis_pr_y"])

# Apply the scaler that was fitted on the training features. Fitting a new
# scaler on the test set (and with a different feature_range) would place the
# test points in a different space from the neighbours stored in the model.
features_test = scaler.transform(features_test)

# Same ranking and feature count that were used to fit the best model.
result_test = features_test[:, ranked_index]
selected_features_kaggle_test = result_test[:, :best_feature_number]

final_predictions_kaggle = best_model.predict(selected_features_kaggle_test)
data_test = np.array(df_test_metadata.Id)

# Submission file: one row per test Id with its predicted Pawpularity.
df_hasil = pd.DataFrame({"Id":data_test,"Pawpularity":final_predictions_kaggle})

df_hasil.to_csv('submission.csv', index=False)
df_hasil.head()