In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in the order os.walk yields them.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

> **Title 1: Shelter Animals Outcomes Classification**

> Description:
>* Dự đoán tình trạng của động vật sau khi rời khỏi trại cứu hộ.
>* Target: Feature 'OutcomeType' bao gồm: Adoption, Died, Euthanasia, Return_to_owner, Transfer.
>* Tập Train và Test được chia ngẫu nhiên.

> Data:
> https://www.kaggle.com/c/shelter-animal-outcomes/data
> * Data chứa gần 27,000 dòng dữ liệu, sử dụng file 'train.csv'.
> Có 10 features.

> Evaluate Ways of approach:
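> * Vì đây là bài toán supervised classification (dữ liệu train đã có nhãn 'OutcomeType'), nên sẽ khuyến khích sử dụng RandomForests, Logistic Regression; K-Means Clustering là thuật toán unsupervised nên chỉ phù hợp để khám phá dữ liệu, không dùng để dự đoán nhãn.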

> Why I choose this:
> * Xoay quanh supervised classification để có thể điều chỉnh hyperparameters (ví dụ learning rate) phù hợp (không bị overfitting, underfitting...)
> * Đề tài động vật giúp tuyên truyền về việc ngược đãi động vật.
> * Đề tài dễ hiểu, features được mô tả rõ ràng.



In [None]:
# Read train.csv from the shelter-animal-outcomes input folder into a DataFrame.
train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/shelter-animal-outcomes/train.csv',
)

In [None]:
train.info()

In [None]:
train.head()

# **BASELINE**
**Ref**: https://www.kaggle.com/fayomi/data-exploration-predict-adoption

In [None]:
train.head()


**VISUALIZATION**

In [None]:
# Heatmap of train.isnull(): one row per record, one column per feature, so
# columns containing missing values stand out as contrasting streaks.
# An explicit figure and title keep this cell consistent with the count plots
# in this notebook and stop it from drawing onto a previously open figure.
plt.figure(figsize=(10, 6))
plt.title('Heatmap of missing values')
sns.heatmap(train.isnull(), yticklabels=False, cbar=False, cmap='viridis')

In [None]:
# Number of records per animal type.
plt.figure(figsize=(10, 6))
plt.title('Count plot of AnimalType')
# Select the column by keyword (x=..., data=...), like the grouped count plots
# in this notebook. seaborn 0.11 deprecated passing the column positionally and
# newer releases interpret the first positional argument as `data`.
sns.countplot(x='AnimalType', data=train)

In [None]:
# Number of records per sex/sterilisation status at outcome time.
plt.figure(figsize=(10, 6))
plt.title('Count plot of Sex upon Outcome')
# Select the column by keyword (x=..., data=...), like the grouped count plots
# in this notebook. seaborn 0.11 deprecated passing the column positionally and
# newer releases interpret the first positional argument as `data`.
sns.countplot(x='SexuponOutcome', data=train)

In [None]:
# Collapse the free-text Breed column into two groups. A breed counts as
# 'mixed' only when 'Mix' appears as a whole whitespace-separated token;
# every other value is labelled 'pure'.
has_mix_token = train['Breed'].map(lambda breed: 'Mix' in breed.split())

train['New_Breed'] = np.where(has_mix_token, 'mixed', 'pure')

In [None]:
# Number of records in each derived breed group ('mixed' vs 'pure').
plt.figure(figsize=(10, 6))
plt.title('Count plot of Breed consist of "mixed" and "pure" ')
# Select the column by keyword (x=..., data=...), like the grouped count plots
# in this notebook. seaborn 0.11 deprecated passing the column positionally and
# newer releases interpret the first positional argument as `data`.
sns.countplot(x='New_Breed', data=train)

In [None]:
# Number of records per outcome class (the column later used as the target).
plt.figure(figsize=(10, 6))
plt.title('Count plot of Outcome Type')
# Select the column by keyword (x=..., data=...), like the grouped count plots
# in this notebook. seaborn 0.11 deprecated passing the column positionally and
# newer releases interpret the first positional argument as `data`.
sns.countplot(x='OutcomeType', data=train)

In [None]:
# Outcome counts, with one bar per animal type inside each outcome class.
plt.figure(figsize=(10, 6))
plt.title('Count plot of Outcome Type by Animal Type')
sns.countplot(data=train, x='OutcomeType', hue='AnimalType')

In [None]:
# Sex/sterilisation counts, with one bar per outcome class inside each group.
# The figure is wider than the other plots (14 instead of 10).
plt.figure(figsize=(14, 6))
plt.title('Count plot of Sex by Outcome Type')
sns.countplot(data=train, x='SexuponOutcome', hue='OutcomeType')

In [None]:
# Derived breed group counts, with one bar per outcome class inside each group.
# Open an explicit figure like every other count plot in this notebook, so the
# size is consistent and the plot never lands on a previously open figure.
plt.figure(figsize=(10, 6))
plt.title('Count plot of New Breed by Outcome Type')
sns.countplot(x='New_Breed', data=train, hue='OutcomeType')

In [None]:
train.head()

**DROP UNNECESSARY FEATURES**

In [None]:
# Discard the columns that the rest of the notebook does not use.
# 'Breed' can go because 'New_Breed' was derived from it earlier.
unused_columns = [
    'AnimalID',
    'Name',
    'DateTime',
    'OutcomeSubtype',
    'AgeuponOutcome',
    'Breed',
    'Color',
]
train = train.drop(columns=unused_columns)

In [None]:
train.head()

In [None]:
# One-hot encode the categorical columns.
# The target keeps every outcome level; the three predictors drop their first
# level (drop_first=True), which then acts as the reference category.
outcome = pd.get_dummies(train['OutcomeType'])
animal, sex, breed = (
    pd.get_dummies(train[column], drop_first=True)
    for column in ('AnimalType', 'SexuponOutcome', 'New_Breed')
)

In [None]:
# Append the indicator columns to the right of the existing frame.
encoded_frames = [train, outcome, animal, sex, breed]
train = pd.concat(encoded_frames, axis='columns')

In [None]:
train.head()

**SPLIT TRAIN TEST**

In [None]:
# Remove the original categorical columns (now represented by indicators) and
# every outcome indicator except 'Adoption', leaving a binary target.
columns_to_remove = [
    'OutcomeType',
    'AnimalType',
    'SexuponOutcome',
    'New_Breed',
    'Transfer',
    'Euthanasia',
    'Return_to_owner',
    'Died',
]
train = train.drop(columns=columns_to_remove)

In [None]:
train.head()

In [None]:
# Predictors: the indicator columns produced by the one-hot encoding.
feature_columns = [
    'Dog',
    'Intact Male',
    'Neutered Male',
    'Spayed Female',
    'Unknown',
    'pure',
]
X = train[feature_columns]

# Target: the 'Adoption' indicator column.
y = train['Adoption']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

**TRAINING MODEL USING LOGISTIC REGRESSION**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic regression with default settings on the training split.
lm = LogisticRegression()
lm.fit(X=X_train, y=y_train)

In [None]:
# Predicted 'Adoption' labels for the held-out rows.
prediction = lm.predict(X=X_test)

**EVALUATE MODEL AND ADD SUBMISSION**

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Text report comparing the hold-out labels with the model's predictions.
report = classification_report(y_true=y_test, y_pred=prediction)
print(report)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the hold-out labels against the model's predictions.
matrix = confusion_matrix(y_true=y_test, y_pred=prediction)
print(matrix)

In [None]:
# `prediction` is the array returned by `lm.predict`, which has no `to_csv`
# method, so calling it directly raises AttributeError. Wrap the predictions in
# a DataFrame first and write that out instead.
# NOTE(review): these are predictions for the hold-out split of train.csv on
# the binary 'Adoption' target - confirm this is the file format you intend to
# submit.
result = pd.DataFrame({'Adoption': prediction})
result.to_csv('submission.csv', index=False)