In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's training table into a DataFrame.
df_train = pd.read_csv(
    filepath_or_buffer='../input/petfinder-pawpularity-score/train.csv',
)

In [None]:
# Preview the first five rows of the training table.
df_train.head(n=5)

In [None]:
# Count the missing values in each column.
missing_mask = df_train.isna()
missing_mask.sum()

In [None]:
# Summary statistics for the training table.
summary_stats = df_train.describe()
summary_stats



### We will first explore the target variable's distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the Pawpularity target.
# sns.distplot is deprecated in seaborn; sns.histplot is its replacement for
# plain (kde-free) histograms and draws counts by default.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df_train['Pawpularity'], bins=100, kde=False, ax=ax)
# Limits and labels are applied AFTER plotting: histplot writes its own axis
# labels, which would otherwise overwrite the custom y-label.
ax.set(xlim=(0, 100), xlabel='Pawpularity', ylabel='Frequency')
plt.show()
# Author's observation: the mean lies near 38

### Let us check what each of those fields represents visually

In [None]:
#Randomly choose images where the following features are on
import random

def _random_id_with(feature):
    """Return the Id of one randomly chosen training row whose `feature` column equals 1."""
    matching_ids = df_train.loc[df_train[feature] == 1, 'Id'].tolist()
    return random.choice(matching_ids)


# One sample image Id per metadata flag (drawn in the same order as the lists below).
file_where_face_is_on = _random_id_with('Face')
file_where_eye_is_on = _random_id_with('Eyes')
file_where_near_is_on = _random_id_with('Near')
file_where_action_is_on = _random_id_with('Action')
file_where_accessory_is_on = _random_id_with('Accessory')
file_where_group_is_on = _random_id_with('Group')
file_where_collage_is_on = _random_id_with('Collage')
file_where_human_is_on = _random_id_with('Human')
file_where_occlusion_is_on = _random_id_with('Occlusion')
file_where_info_is_on = _random_id_with('Info')
file_where_blur_is_on = _random_id_with('Blur')

# Sampled Ids, in the same order as their display names.
file_with_features = [
    file_where_face_is_on,
    file_where_eye_is_on,
    file_where_near_is_on,
    file_where_action_is_on,
    file_where_accessory_is_on,
    file_where_group_is_on,
    file_where_collage_is_on,
    file_where_human_is_on,
    file_where_occlusion_is_on,
    file_where_info_is_on,
    file_where_blur_is_on,
]
# Display names used as plot titles for the sampled images.
file_with_features_with_name = [
    'file_where_face_is_on',
    'file_where_eye_is_on',
    'file_where_near_is_on',
    'file_where_action_is_on',
    'file_where_accessory_is_on',
    'file_where_group_is_on',
    'file_where_collage_is_on',
    'file_where_human_is_on',
    'file_where_occlusion_is_on',
    'file_where_info_is_on',
    'file_where_blur_is_on',
]
# Directory that holds the training images.
file_path = '../input/petfinder-pawpularity-score/train'


In [None]:
from matplotlib import pyplot
from matplotlib.image import imread
# Draw every sampled image into ONE shared figure.
# The figure is created once, before the loop: calling plt.figure() inside the
# loop opened a brand-new 30x30 figure per image, so each picture ended up alone
# in its own figure and the 6x6 subplot grid never filled in.
plt.figure(figsize=(30,30))
for k, (title, image_id) in enumerate(zip(file_with_features_with_name, file_with_features), start=1):
    # Image files are stored as <Id>.jpg inside the training directory.
    file = file_path+'/'+image_id+'.jpg'
    plt.subplot(6,6,k)
    plt.title(title)
    plt.imshow(imread(file))

plt.show()


### Now that we know what each of those features means, let us check the distribution of scores with respect to the different features

In [None]:
# Target column, followed by the metadata flag columns to compare it against.
y_col = 'Pawpularity'
x_col = [
    'Face',
    'Eyes',
    'Near',
    'Action',
    'Accessory',
    'Group',
    'Collage',
    'Human',
    'Occlusion',
    'Info',
    'Blur',
]


import seaborn as sns
# The seaborn style must be set BEFORE the axes are created: it only changes
# defaults for axes made afterwards, so setting it after plt.subplots() left
# these axes unstyled on a fresh kernel.
sns.set_style("darkgrid")

# One stacked panel per feature column; the row count follows x_col instead of
# a hard-coded 11 so the grid stays in sync if the feature list changes.
fig, axes = plt.subplots(len(x_col),1,figsize=(20,20))

for ax, feature in zip(axes, x_col):
    # Box plot of the target split by the feature's value.
    sns.boxplot(y=y_col, x=feature, data=df_train, orient='v', hue=feature, ax=ax, palette='Dark2')

# Keep the stacked panels' labels from overlapping.
fig.tight_layout()

### This is to find which patterns occur frequently

In [None]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [None]:
#Just get the features which are one-hot encoded
# Keep only the flag columns by dropping the identifier and the target.
one_hot = df_train.drop(columns=['Id', 'Pawpularity'])

In [None]:
# Building the model
# apriori is given a boolean frame: mlxtend warns that non-bool one-hot input
# is slower and that support for it may be dropped. The cast assumes the flag
# columns only hold 0/1 -- TODO confirm if new columns are added.
frq_items = apriori(one_hot.astype(bool), min_support = 0.05, use_colnames = True)

# Collecting the inferred rules in a dataframe
rules = association_rules(frq_items, metric ="lift", min_threshold = 1)
# Strongest rules first: highest confidence, ties broken by highest lift.
rules = rules.sort_values(['confidence', 'lift'], ascending =[False, False])
print(rules.head(20))

In [None]:
# Save the rules table to a CSV file in the current working directory.
rules.to_csv(path_or_buf='Rules.csv')

### Let us try to train a model which can do regression

In [None]:
import catboost as cb
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.inspection import permutation_importance

In [None]:
# Features: every column except the identifier and the target.
X = df_train.drop(columns=['Id', 'Pawpularity'])
# Target: the Pawpularity score.
y = df_train['Pawpularity']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

In [None]:
# Wrap each split (features + labels) in a CatBoost Pool.
train_dataset = cb.Pool(data=X_train, label=y_train)
test_dataset = cb.Pool(data=X_test, label=y_test)

In [None]:
# CatBoost regressor configured with the RMSE loss function.
model = cb.CatBoostRegressor(
    loss_function='RMSE',
)

In [None]:
# Hyper-parameter values to try in the grid search.
grid = {
    'iterations': [100, 150, 200],
    'learning_rate': [0.03, 0.1],
    'depth': [2, 4, 6, 8],
    'l2_leaf_reg': [0.2, 0.5, 1, 3],
}
# Search the grid on the training pool; the returned summary is the cell output.
model.grid_search(param_grid=grid, X=train_dataset)

In [None]:
# Score the model's predictions on the held-out split.
pred = model.predict(X_test)
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, pred)

# Assemble the report, then print it in one go.
report = [
    "Testing performance",
    'RMSE {:.2f}'.format(rmse),
    'R2: {:.2f}'.format(r2),
]
print('\n'.join(report))

In [None]:
# Indices that order the features from least to most important.
sorted_feature_importance = model.feature_importances_.argsort()
# Bar labels come from the training frame's columns. The previous
# `df_train.feature_names` is not a DataFrame attribute and raised
# AttributeError; df_train also contains the non-feature Id/Pawpularity columns.
plt.barh(X_train.columns[sorted_feature_importance],
        model.feature_importances_[sorted_feature_importance],
        color='turquoise')
plt.xlabel("CatBoost Feature Importance")