In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

plt.style.use('fivethirtyeight')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it
input_file_paths = (
    os.path.join(folder, fname)
    for folder, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Can the friendliness of a future pet be predicted from its sex, color and breed? Perhaps such a dependence exists. To find out, I will go through the following steps:
1. Drop the columns with too many missing values, while trying to restore some of the missing values in the target variable.
2. Analyze how the target variable depends on the other features.
3. Handle redundant features in case of multicollinearity.
4. Build machine learning models that estimate the probability that a pet is friendly to people.
5. Build a dashboard that will help you find a good friend.

**The purpose of this notebook** is to assign a probability of friendliness to each pet for subsequent visualization in Power BI.

## Loading the data and a first look at the summary statistics

In [None]:
# Read the shelter-dogs CSV from the Kaggle input directory and display the frame
shelter_csv_path = "/kaggle/input/adoptable-dogs/ShelterDogs.csv"
df = pd.read_csv(shelter_csv_path)
df

In [None]:
# Number of missing values per column (isna is the canonical name of isnull)
df.isna().sum()

In [None]:
# Column dtypes and non-null counts of the raw frame
df.info()

In [None]:
# Most of the features are categorical: summarise the object-typed columns, one row per column
categorical_summary = df.describe(include=[object])
categorical_summary.transpose()

In [None]:
# Summary statistics of the numeric columns.
# The quantitative variable (presumably `age` - the original comment was garbled in translation)
# shows no outliers; it is the only quantitative feature, so no outlier handling is done.
df.describe()

In [None]:
# The target variable is full of missing values. Let's assume that most NaNs are 'no'.
# dropna=False makes the NaN count itself visible next to the 'yes'/'no' counts.
df['likes_people'].value_counts(dropna=False)

In [None]:
# Parse the three date columns into datetime values
for date_col in ('date_found', 'adoptable_from', 'posted'):
    df[date_col] = pd.to_datetime(df[date_col])

## Restoring the missing values to the target variable

If a dog likes at least children, or gets along with males or with females, this should be reflected as "yes" in the `likes_people` column, so I replace NaN with 'yes' in those rows. The NaN values that remain afterwards are later set to 'no'.

In [None]:
# Such rows exist: likes_people is missing although the dog is recorded as liking children
missing_but_likes_children = df["likes_people"].isnull() & (df['likes_children'] == 'yes')
df.loc[missing_but_likes_children].head()

In [None]:
# Fill a missing likes_people with 'yes' wherever any of the three related columns says 'yes'
target_missing = df["likes_people"].isnull()
friendly_evidence = (
    (df['get_along_males'] == 'yes')
    | (df['get_along_females'] == 'yes')
    | (df['likes_children'] == 'yes')
)
df.loc[target_missing & friendly_evidence, 'likes_people'] = 'yes'

In [None]:
# A couple of hundred missing target values were recovered; the remaining NaN will become 'no'
df.isna().sum()

In [None]:
# The shelter does not always know where a dog was kept, but 'keep_in' may still be informative,
# so it is kept with an explicit placeholder; the sparse behaviour columns are dropped.
dropped_columns = ['housebroken', 'get_along_cats', 'get_along_females', 'get_along_males', 'likes_children']
df_fill_likes_people = (
    df.assign(
        likes_people=df['likes_people'].fillna('no'),
        keep_in=df['keep_in'].fillna('unknow'),
    )
    .drop(columns=dropped_columns)
)

In [None]:
# Distribution of keep_in after filling the missing values with the 'unknow' placeholder
df_fill_likes_people['keep_in'].value_counts()

In [None]:
# Display the working frame after the target/keep_in fill and the column drop
df_fill_likes_people

In [None]:
#I will divide the dogs by whether there is a name or not. Perhaps having a name played a role in the 'socialization' of the dog
#It will be interesting to check whether neutering has affected the dog's positive attitude to people.
df_fill_likes_people['neutered'] = df_fill_likes_people['neutered'].fillna('unknow')

def name(x):
    """Return 1 when the dog has no recorded name, 0 otherwise.

    A value counts as "no name" when it is missing (NaN/None) or equals the
    'no' placeholder used for missing names.
    """
    if pd.isna(x) or x == 'no':
        return 1
    return 0

# Derive the flag from the raw `df['name']` column rather than from the column being
# overwritten: re-running this cell then yields the same flags instead of resetting
# every row to 0 (the already encoded 0/1 values never equal 'no').
df_fill_likes_people['name'] = df['name'].apply(name)
df_fill_likes_people['name']

In [None]:
# Remaining missing values per column in the working frame
df_fill_likes_people.isnull().sum()

In [None]:
# Class balance of the target after the remaining NaN were filled with 'no'
df_fill_likes_people['likes_people'].value_counts()

The roughly threefold predominance of "yes" is quite logical: dogs have been bred to be peaceful toward people. Now let's check how the target variable depends on the other features.

## Data analysis

In [None]:
#Top-10 bar charts of a column: over all dogs, and over the dogs that like people
def bar_likes(column):
    """Draw two side-by-side bar charts of the ten most frequent values of `column`.

    The left chart counts values over the whole `df_fill_likes_people` frame,
    the right chart only over rows where `likes_people` equals 'yes'.

    Parameters
    ----------
    column : str
        Name of the column of `df_fill_likes_people` to summarise.
    """
    fig, (ax_all, ax_friendly) = plt.subplots(1, 2)
    fig.set_size_inches(20, 6)
    # autofmt_xdate is used here only for its effect of slanting the x tick labels
    fig.autofmt_xdate()

    top_overall = df_fill_likes_people[column].value_counts().head(10)
    ax_all.bar(top_overall.index, top_overall, color='#76A3DE')
    ax_all.set_title('The most popular {}'.format(column))

    likes_people_mask = df_fill_likes_people['likes_people'] == 'yes'
    top_friendly = df_fill_likes_people.loc[likes_people_mask, column].value_counts().head(10)
    ax_friendly.bar(top_friendly.index, top_friendly, color='#FAB464')
    ax_friendly.set_title('The most popular {} among likes people'.format(column))

In [None]:
#The most popular breed of dog and which breed is most common in dogs that love people
#The difference in a couple of entries is mostly accidental. Overall, no dependence is observed,
#because the top entries of the two charts practically coincide
bar_likes('breed')

In [None]:
#The most popular coat colors overall and among dogs that like people
#No significant difference is observed
bar_likes('color')

In [None]:
#Number of dogs per (size, likes_people) pair: which size is most common,
#and does the yes/no split differ between sizes?
df_fill_likes_people.groupby(['size', 'likes_people'])['likes_people'].count()

In [None]:
# Number of dogs per (keep_in, likes_people) pair.
# Dogs whose place of residence is unknown are the least friendly. Probably strays.
df_fill_likes_people.groupby(['keep_in', 'likes_people'])['likes_people'].count()

In [None]:
# Number of dogs per (sex, likes_people) pair.
# Whether friendliness depends on sex is unclear from the counts; it is checked statistically below
df_fill_likes_people.groupby(['sex', 'likes_people'])['likes_people'].count()

In [None]:
#Who is more peaceful, female or male? Is the difference statistically significant?
#Encode the target once as 1/0 and split it by sex (A = males, B = females).
#The groupby that used to open this cell was dropped: its result was neither stored nor displayed.
likes_people_binary = df_fill_likes_people['likes_people'].map({'yes': 1, 'no': 0})
A = likes_people_binary[df_fill_likes_people['sex'] == 'male']
B = likes_people_binary[df_fill_likes_people['sex'] == 'female']

In [None]:
#Who is more peaceful, the female or the male? And is the difference statistically significant?
def A_B_Test(A, B, alpha=0.05):
    """Test whether two samples differ significantly.

    Procedure:
      1. Shapiro-Wilk normality test on each sample.
      2. If neither sample rejects normality, Levene's test decides between the
         equal-variance and the unequal-variance two-sample t-test.
      3. Otherwise the Mann-Whitney U test is used.

    The name of the test that was chosen is printed.

    Parameters
    ----------
    A, B : array-like
        The two samples to compare.
    alpha : float, default 0.05
        Significance level applied to every test in the procedure.

    Returns
    -------
    bool
        True when the p-value of the chosen test is below `alpha`.
    """
    a_not_normal = stats.shapiro(A)[1] < alpha
    b_not_normal = stats.shapiro(B)[1] < alpha

    if not a_not_normal and not b_not_normal:
        variances_differ = stats.levene(A, B)[1] < alpha
        if not variances_differ:
            print('ttest, with equal_var')
        else:
            print('ttest, without equal_var')
        p_value = stats.ttest_ind(A, B, equal_var=not variances_differ)[1]
    else:
        print("mannwhitneyuy: ")
        # Ask for the two-sided alternative explicitly: older SciPy versions default to a
        # one-sided p-value, which would be inconsistent with the two-sided t-tests above.
        p_value = stats.mannwhitneyu(A, B, alternative='two-sided')[1]

    return p_value < alpha
    
# Print the verdict of the significance test for the male (A) and female (B) samples
difference_is_significant = bool(A_B_Test(A, B))
print('Различия статистически значимы' if difference_is_significant
      else 'Различия статистически НЕ значимы')

## Dummy encoding, then PCA

In [None]:
#One-hot encoding makes the frame very wide: the curse of dimensionality is breathing down our neck.
y = df_fill_likes_people['likes_people'].map({'yes': 1, 'no': 0})
# Identifier, date columns and the target itself are not used as features
non_feature_columns = ['ID', 'date_found', 'adoptable_from', 'posted', 'likes_people']
df_to_dummie = df_fill_likes_people.drop(columns=non_feature_columns)
to_dummie = df_to_dummie.select_dtypes('object').columns
df_dummie = pd.get_dummies(df_to_dummie, columns=to_dummie, drop_first=True)
df_dummie

In [None]:
#We solve it: standardise age, then fit a PCA to inspect how the variance is spread over components.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# fit_transform returns an (n, 1) array; flatten it before storing it back as a column
df_dummie['age'] = scaler.fit_transform(np.asarray(df_dummie['age']).reshape(-1, 1)).ravel()
# PCA cannot extract more than min(n_samples, n_features) components, so the original
# cap of 311 is clamped to the actual shape of the encoded frame instead of failing
# when the data has fewer columns.
MAX_PCA_COMPONENTS = 311
decomp = PCA(n_components=min(MAX_PCA_COMPONENTS, *df_dummie.shape))
decomp.fit(df_dummie)

## Feature Engineering

In [None]:
#Cumulative explained variance per number of components: approximately 50 features can be left
cumulative_variance = np.cumsum(decomp.explained_variance_ratio_)
plt.plot(cumulative_variance, '*--');

In [None]:
# Project the encoded features onto the first 50 principal components.
# random_state pins the result when PCA selects a randomized SVD solver,
# so the components (and everything trained on them) are reproducible.
df_pca = PCA(50, random_state=42).fit_transform(df_dummie)

## Model

### LogisticRegression

In [None]:
from sklearn.model_selection import train_test_split
# random_state makes the split reproducible across runs; stratify keeps the
# yes/no ratio of the imbalanced target the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(df_pca, y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)
X_train

In [None]:
from sklearn.linear_model import LogisticRegressionCV
# Logistic regression with built-in 3-fold cross-validation, fitted on the training
# part and used to predict the held-out part
log_model = LogisticRegressionCV(cv=3).fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred); passing them the other way round
# swaps precision with recall and reports the support of the predictions.
print(classification_report(y_test, y_pred_log))

In [None]:
# Score of the logistic regression on the training part
log_model.score(X_train, y_train)

In [None]:
#Score on the held-out part. In my view the result is decent: the model can be used
#to estimate the probability of belonging to a class
log_model.score(X_test, y_test)

### RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid over forest size and tree depth, evaluated with 5-fold cross-validation
n_estimator = [10, 50, 100, 1000]
max_depth = [2, 5, 7, 15, 30]
hyperparameters = dict(n_estimators=n_estimator,
                       max_depth= max_depth)
# A fixed random_state makes the forests, and therefore the selected hyper-parameters, reproducible
forest = RandomForestClassifier(random_state=42)
# n_jobs=-1 evaluates the candidate fits in parallel; it does not change the results
gridsearch = GridSearchCV(forest, hyperparameters, cv=5, verbose=1, n_jobs=-1)
# fit returns the search object itself, which predicts with the refitted best estimator
best_model = gridsearch.fit(X_train, y_train)

In [None]:
# Evaluate the best forest on the held-out part
y_pred = best_model.predict(X_test)
# classification_report expects (y_true, y_pred); the swapped order exchanges precision and recall
print(classification_report(y_test, y_pred))

### CatBoost

In [None]:
from catboost import CatBoostClassifier
# Silent CatBoost model whose built-in grid search tries every combination below with 3-fold CV
cat_model = CatBoostClassifier(verbose=0)
grid = dict(
    learning_rate=[0.03, 0.1],
    depth=[4, 6, 10],
    l2_leaf_reg=[1, 3, 5],
)

grid_search_result = cat_model.grid_search(grid, X=X_train, y=y_train, cv=3, plot=True, verbose=0)

In [None]:
#CatBoost copes better than others. I use it to get probabilities
# Take the hyper-parameters from the grid search result instead of hard-coding them,
# so this cell cannot silently diverge from what the search actually selected.
best_cat_params = grid_search_result['params']
cat_predict = CatBoostClassifier(**best_cat_params)
cat_predict.fit(X_train, y_train, verbose=0)
cat_pred = cat_predict.predict(X_test)
# classification_report expects (y_true, y_pred); the swapped order exchanges precision and recall
print(classification_report(y_test, cat_pred))

## I get the probabilities and save the model

In [None]:
# Class probabilities for every row of df_pca, i.e. for the rows the model was trained on
# as well as the held-out ones. Column 1 is displayed and later stored as 'prob_likes';
# NOTE(review): column 1 is presumably the probability of class 1 ('yes') - confirm against cat_predict.classes_.
# (`pribabylity` is a misspelling of "probability"; the name is kept because the next cell uses it.)
pribabylity = cat_predict.predict_proba(df_pca)
pribabylity[:, 1]

In [None]:
# Attach column 1 of the predicted probabilities to the working frame
df_fill_likes_people['prob_likes'] = pribabylity[:, 1]

In [None]:
#Bucket the probabilities into ten quantile bins and the ages into four named bands to simplify visualization
bin_labels_5 = list(range(1, 11))
decile_edges = [.0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1.]
df_fill_likes_people['labels'] = pd.qcut(
    df_fill_likes_people['prob_likes'], q=decile_edges, labels=bin_labels_5
)

age = ['Puppy', 'Young', 'Middle_age', 'Old']
# Age bands are cut at the 10th, 50th and 70th percentiles of age
age_quantile_edges = [.0, .1, .5, .7, 1]
df_fill_likes_people['age_interval'] = pd.qcut(
    df_fill_likes_people['age'], q=age_quantile_edges, labels=age
)

In [None]:
# Check the resulting age bands
df_fill_likes_people['age_interval']

In [None]:
# Final frame with the probability, its bin label and the age band attached
df_fill_likes_people

In [None]:
# Write the enriched frame to dogs_clean.csv in the working directory; index=False is not
# passed, so the row index is written as the first column.
df_fill_likes_people.to_csv("dogs_clean.csv")
# Save the trained CatBoost model in CatBoost's binary "cbm" format
cat_predict.save_model('dogs_proba', format="cbm")