# Understanding different regression models with PyCaret!!

![Pycaret](https://miro.medium.com/max/1400/1*Cku5-rqmqSIuhUyFkIAdIA.png)

In [None]:
import os
import random

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Aesthetics
import warnings
import sklearn.exceptions
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import optuna

import matplotlib.pyplot as plt
import seaborn as sns

# Tabular data file paths (both files live in the same input directory)
_INPUT_DIR = '../input/petfinder-pawpularity-score'
TRAIN_DATA_PATH = _INPUT_DIR + '/train.csv'
TEST_DATA_PATH = _INPUT_DIR + '/test.csv'


# Name of the column to predict
TARGET_NAME = 'Pawpularity'
# Seed handed to set_seed() for reproducibility
SEED = 2021
# NOTE(review): VAL_SIZE and EARLY_ROUNDS are not referenced by the visible
# cells -- presumably left over from a manual train/validation workflow.
VAL_SIZE = 0.15
EARLY_ROUNDS = 50

In [None]:
def set_seed(seed=42):
    """Seed the random number sources used in this notebook.

    Seeds NumPy's global generator and Python's ``random`` module, then
    stores the seed in the ``PYTHONHASHSEED`` environment variable.

    :param seed: Random seed
    :return: None
    """
    # NumPy first, then the standard library generator
    for seeder in (np.random.seed, random.seed):
        seeder(seed)
    os.environ.update(PYTHONHASHSEED=str(seed))
    
def set_display():
    """Configure how charts and pd.DataFrames are displayed.

    :return: None
    """
    # Charts: 'fivethirtyeight' style, 12x8 figures, font size 14
    plt.style.use('fivethirtyeight')
    plt.rcParams.update({'figure.figsize': (12, 8), 'font.size': 14})
    # DataFrames: no limit on shown columns/rows, floats with 4 decimals
    for option in ('display.max_columns', 'display.max_rows'):
        pd.set_option(option, None)
    pd.set_option('display.float_format', '{:.4f}'.format)
    
    
def get_features(df: pd.DataFrame) -> list:
    """Return the names of the input feature columns of a DataFrame.

    Every column is treated as a feature except the 'Id' column and the
    target column named by the module-level TARGET_NAME.

    :param df: DataFrame containing features, Ids and possibly target values
    :return: List of input features, in the DataFrame's column order
    """
    non_features = ('Id', TARGET_NAME)
    return [name for name in df.columns if name not in non_features]


def add_features(df: pd.DataFrame, feature_names=None) -> pd.DataFrame:
    """Add averaged combinations of existing features to the DataFrame.

    Three groups of columns are added in place:
    ``features_sum`` (sum of all features divided by their count),
    one ``<a>_<b>`` column per feature pair (pair mean) and
    one ``<a>_<b>_<c>`` column per feature triplet (triplet mean).

    :param df: Original DataFrame; it is modified in place
    :param feature_names: Iterable of column names to combine. When omitted,
        the module-level variable "features" is used, as before.
    :return: The same DataFrame object with the new columns appended
    :raises NameError: If feature_names is omitted and no module-level
        "features" variable has been defined
    """
    from itertools import combinations

    if feature_names is None:
        # Backward-compatible fallback to the variable defined outside
        # the scope of this function.
        feature_names = features
    # Snapshot the names: df gains columns below, so a live view such as
    # df.columns must not be iterated while it grows.
    names = list(feature_names)

    # Normalized sum of all original features
    df['features_sum'] = df[names].sum(axis=1) / len(names)

    # Feature pairs (normalized)
    for first, second in combinations(names, 2):
        df[f'{first}_{second}'] = (df[first] + df[second]) / 2

    # Feature triplets (normalized)
    for first, second, third in combinations(names, 3):
        df[f'{first}_{second}_{third}'] = (
            df[first] + df[second] + df[third]) / 3

    return df
    
    
    
# Seed the random number generators and apply the chart/DataFrame display
# settings before any data is loaded
set_seed(SEED)
set_display()

# Train data set: read the CSV, print its shape and preview the first rows
data_train = pd.read_csv(TRAIN_DATA_PATH)
print(f'Train data shape: {data_train.shape}')
data_train.head()

In [None]:
# Test data set: read the CSV, print its shape and preview the first rows
data_test = pd.read_csv(TEST_DATA_PATH)
print(f'Test data shape: {data_test.shape}')
data_test.head()

In [None]:
# Distribution of the target values: summary statistics first, then a plot
pawpularity = data_train[TARGET_NAME]
pawpularity_mean = pawpularity.mean()
pawpularity_median = pawpularity.median()
print(f'Target values: {pawpularity.min()} - {pawpularity.max()}\n'
      f'Mean value: {pawpularity_mean}\n'
      f'Median value: {pawpularity_median}\n'
      f'Standard deviation: {pawpularity.std()}')

# Histogram with a KDE curve; vertical lines mark the mean and the median
sns.histplot(data=data_train, x=TARGET_NAME, kde=True)
plt.axvline(pawpularity_mean, c='orange', ls='-', lw=3, label='Mean')
plt.axvline(pawpularity_median, c='green', ls='-', lw=3, label='Median')
plt.legend()
plt.title('Pawpularity Score')
plt.tight_layout()
plt.show()

In [None]:
# Preview the first rows of the training data again
data_train.head()

In [None]:
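# Optional feature engineering (currently disabled). Uncomment the lines
# below to collect the original input features and add the derived
# sum / pair / triplet features to both data sets.
# features = get_features(data_train)
# data_train = add_features(data_train)
# data_test = add_features(data_test)
# data_train.head()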

In [None]:
!pip -qq install pycaret

from pycaret.regression import *

In [None]:
# Identifier column that is excluded from the model inputs
ignore_feature = ['Id']
# Initialise the PyCaret regression experiment on the training data.
# session_id=SEED ties PyCaret's internal randomness to the notebook seed;
# set_seed() alone does not pass the seed to PyCaret.
# NOTE(review): `silent` and `create_clusters` appear to be PyCaret 2.x-only
# arguments -- confirm against the installed PyCaret version.
reg = setup(data=data_train,
            target=TARGET_NAME,
            numeric_imputation='mean',
            categorical_features=[],
            ignore_features=ignore_feature,
            normalize=True,
            silent=True,
            create_clusters=True,
            session_id=SEED)

In [None]:
# Compare the regressors available in PyCaret on the experiment set up above
# (presumably shows a score table ranked by cross-validated metrics -- see
# the PyCaret docs)
compare_models()

In [None]:
# Create a model from PyCaret's 'lightgbm' estimator ID (LightGBM regressor).
# Note: the name `cb` is reused for every model created in this notebook.
cb = create_model('lightgbm')

### SHAP values for LGBM

In [None]:
# Interpret the model currently bound to `cb` (the LightGBM model when the
# cells are run in order); per the heading above this shows SHAP values
interpret_model(cb)

In [None]:
# Create a model from PyCaret's 'huber' estimator ID (Huber regressor);
# this rebinds `cb`, replacing the previously created model
cb = create_model('huber')

In [None]:
# Score the test set with the model currently bound to `cb`
predictions = predict_model(cb, data=data_test)

# Assemble the submission in its own frame instead of writing the predicted
# target into data_test, so the test features stay untouched for later
# predict_model calls.
submission = pd.DataFrame({'Id': data_test['Id'],
                           TARGET_NAME: predictions['Label']})
submission.to_csv('submission.csv', index=False)
submission

In [None]:
# Create a model from PyCaret's 'lar' estimator ID (Least Angle Regression);
# this rebinds `cb`, replacing the previously created model
cb = create_model('lar')

In [None]:
# Score the test set with the second model. Drop any target column that an
# earlier cell may have attached to data_test so only the original input
# columns are passed to the model.
test_inputs = data_test.drop(columns=[TARGET_NAME], errors='ignore')
predictions2 = predict_model(cb, data=test_inputs)

# Equal-weight blend of the predictions from the earlier cell (`predictions`)
# and from this cell's model; overwrites the previous submission.csv
blended = predictions['Label'] * 0.5 + predictions2['Label'] * 0.5
submission = pd.DataFrame({'Id': data_test['Id'], TARGET_NAME: blended})
submission.to_csv('submission.csv', index=False)
submission

### Kindly upvote if it seems relevant!