In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

# 1. Data Exploration

In [None]:
# Load the training table and preview its first three rows.
train_csv_path = '../input/train/train.csv'
train_df = pd.read_csv(train_csv_path)
train_df.head(n=3)

In [None]:
# Summary statistics of the numeric columns of the training table.
train_df.describe()

##  Missing data review
Only the pet's name (`Name`) has missing data, and we don't need the name for feature extraction.

In [None]:
# Bar chart produced by missingno for every column of the training table,
# used here to review missing data.
import missingno as msno
msno.bar(train_df,figsize=(20,4))

## Label
Categorical speed of adoption. Lower is faster. 

In [None]:
# Switch to the ggplot theme, then show how many pets fall in each AdoptionSpeed class.
plt.style.use('ggplot')
adoption_speed_counts = train_df['AdoptionSpeed'].value_counts()
adoption_speed_counts.plot.bar()

## Type
Type of animal (1 = Dog, 2 = Cat)

In [None]:
# Number of pets per animal type.
type_counts = train_df['Type'].value_counts()
type_counts.plot.bar()

## Breed
* Breed1 - Primary breed of pet (Refer to BreedLabels dictionary)
* Breed2 - Secondary breed of pet, if pet is of mixed breed (Refer to BreedLabels dictionary)

In [None]:
# Load the breed lookup table and preview its first five rows.
breed_labels_path = '../input/breed_labels.csv'
breed_label = pd.read_csv(breed_labels_path)
breed_label.head(5)

In [None]:
# Count pets per primary breed and attach the breed metadata, showing the
# five least frequent breeds. The index and count column are named explicitly
# so the cell does not depend on the column names generated by
# value_counts().reset_index(), which changed in pandas 2.0 (there is no
# longer an 'index' column to join on).
breed1_counts = (
    train_df.Breed1.value_counts()
    .rename_axis('Breed1')
    .reset_index(name='Count')
    .join(breed_label.set_index('BreedID'), on='Breed1')
)
breed1_counts.tail()

In [None]:
# Count pets per secondary breed and attach the breed metadata, showing the
# five least frequent breeds. The index and count column are named explicitly
# so the cell does not depend on the column names generated by
# value_counts().reset_index(), which changed in pandas 2.0 (there is no
# longer an 'index' column to join on).
breed2_counts = (
    train_df.Breed2.value_counts()
    .rename_axis('Breed2')
    .reset_index(name='Count')
    .join(breed_label.set_index('BreedID'), on='Breed2')
)
breed2_counts.tail()

If Breed1 is equal to Breed2, the pet has only one breed, so Breed2 is reset to 0. Whether a pet is mixed-breed may be a significant feature for model training.

In [None]:
# Where the secondary breed merely repeats the primary breed, clear it (0).
# A single .loc call on the frame is used instead of the chained
# `train_df.Breed2.loc[...] = 0`, which writes through an intermediate Series
# and is not guaranteed to modify train_df (SettingWithCopy / Copy-on-Write).
same_breed_mask = train_df.Breed1 == train_df.Breed2
train_df.loc[same_breed_mask, 'Breed2'] = 0

In [None]:
# Show the Breed2 values of the rows where Breed1 and Breed2 are still equal.
still_equal = train_df.Breed1 == train_df.Breed2
train_df.loc[still_equal, 'Breed2']

In [None]:
# Mixed_Breed is 0 only when there is no secondary breed (Breed2 == 0) and the
# primary breed is not the special code 307; every other row gets 1.
# NOTE(review): 307 is presumably the "Mixed Breed" entry of breed_labels.csv —
# confirm against the lookup table.
MIXED_BREED_ID = 307
# Vectorized form of the former row-wise apply(axis=1); same values, int64 dtype.
is_single_breed = (train_df.Breed2 == 0) & (train_df.Breed1 != MIXED_BREED_ID)
train_df['Mixed_Breed'] = (~is_single_breed).astype('int64')

In [None]:
# Preview the first three rows whose Breed2 is not 0.
has_secondary_breed = train_df["Breed2"].ne(0)
train_df.loc[has_secondary_breed].head(3)

In [None]:
# Number of rows for each value of the Mixed_Breed flag.
mixed_breed_counts = train_df['Mixed_Breed'].value_counts()
mixed_breed_counts.plot.bar()

## Color_Label

In [None]:
# Load the colour lookup table and display it.
color_labels_path = '../input/color_labels.csv'
color_label = pd.read_csv(color_labels_path)
color_label

In [None]:
# Count pets per Color1 value and attach the colour metadata. The index and
# count column are named explicitly so the cell does not depend on the column
# names generated by value_counts().reset_index(), which changed in
# pandas 2.0 (there is no longer an 'index' column to join on).
color1_counts = (
    train_df.Color1.value_counts()
    .rename_axis('Color1')
    .reset_index(name='Count')
    .join(color_label.set_index('ColorID'), on='Color1')
)
color1_counts

In [None]:
# Count pets per Color2 value and attach the colour metadata. The index and
# count column are named explicitly so the cell does not depend on the column
# names generated by value_counts().reset_index(), which changed in
# pandas 2.0 (there is no longer an 'index' column to join on).
color2_counts = (
    train_df.Color2.value_counts()
    .rename_axis('Color2')
    .reset_index(name='Count')
    .join(color_label.set_index('ColorID'), on='Color2')
)
color2_counts

In [None]:
# Count pets per Color3 value and attach the colour metadata. The index and
# count column are named explicitly so the cell does not depend on the column
# names generated by value_counts().reset_index(), which changed in
# pandas 2.0 (there is no longer an 'index' column to join on).
color3_counts = (
    train_df.Color3.value_counts()
    .rename_axis('Color3')
    .reset_index(name='Count')
    .join(color_label.set_index('ColorID'), on='Color3')
)
color3_counts

The number of colors may be a significant feature for model training.

In [None]:
# Num_Color = how many of the three colour slots are in use; a slot equal to 0
# is treated as unused. Counting the non-zero slots is the vectorized
# equivalent of the former row-wise `3 - (number of zeros)` apply.
color_columns = ['Color1', 'Color2', 'Color3']
train_df['Num_Color'] = train_df[color_columns].ne(0).sum(axis=1)

In [None]:
# Number of pets for each colour count.
num_color_counts = train_df['Num_Color'].value_counts()
num_color_counts.plot.bar()

## Size

In [None]:
# Number of pets per MaturitySize value.
maturity_size_counts = train_df['MaturitySize'].value_counts()
maturity_size_counts.plot.bar()

## State_label

In [None]:
# Load the state lookup table and display it.
state_labels_path = '../input/state_labels.csv'
state_label = pd.read_csv(state_labels_path)
state_label

In [None]:
# Count pets per state and attach the state metadata. The index and count
# column are named explicitly so the cell does not depend on the column names
# generated by value_counts().reset_index(), which changed in pandas 2.0
# (there is no longer an 'index' column to join on).
state_counts = (
    train_df.State.value_counts()
    .rename_axis('State')
    .reset_index(name='Count')
    .join(state_label.set_index('StateID'), on='State')
)
state_counts

## Description Sentiment
The length of the description may be a significant feature for model training.

In [None]:
# Replace missing descriptions with an empty string so their length is 0.
# The column is reassigned rather than filled with inplace=True: an in-place
# fillna on a column selection works on an intermediate object and is not
# guaranteed to update train_df (deprecated; a no-op under Copy-on-Write).
train_df['Description'] = train_df['Description'].fillna("")
# Character count of every description.
train_df['Description_Length'] = train_df['Description'].str.len()

In [None]:
# Distribution of description length per AdoptionSpeed class, outliers hidden.
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(
    data=train_df,
    x='AdoptionSpeed',
    y='Description_Length',
    showfliers=False,
    ax=ax,
)

 # 2. Feature Extraction

In [None]:
# Target vector: the AdoptionSpeed column of the training table.
y = train_df['AdoptionSpeed']

In [None]:
# Column dtypes and non-null counts, including the features engineered above.
train_df.info()

In [None]:
# Feature matrix: everything except the listed identifier/text columns and the target.
excluded_columns = ["Name", "RescuerID", "Description", "PetID", "AdoptionSpeed"]
x = train_df.drop(columns=excluded_columns)

In [None]:
# Preview the first three rows of the feature matrix.
x.head(3)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a default random forest purely to rank the features by importance.
# The seed is fixed so the ranking is reproducible across kernel restarts.
model = RandomForestClassifier(random_state=0)
model.fit(x, y)

# Horizontal bar chart: one bar per feature, bar length = importance.
fig, ax = plt.subplots(figsize=(6, 6))
y_pos = np.arange(x.shape[1])
ax.barh(y_pos, model.feature_importances_, align='center', alpha=0.4)
ax.set_yticks(y_pos)
ax.set_yticklabels(x.columns)  # explicit column names instead of passing the DataFrame
ax.set_xlabel('importance')    # the horizontal axis carries the importance values
ax.set_ylabel('features')
ax.set_title('feature_importances')
plt.show()

Reduce the dimensionality of the features with PCA.

In [None]:
from sklearn.decomposition import PCA

# Project the (unscaled) feature matrix onto 10 principal components
# and show the shape of the projected data.
pca = PCA(n_components=10)
newdata = pca.fit_transform(x)
newdata.shape

In [None]:
# Fraction of the total variance explained by each principal component.
pca.explained_variance_ratio_

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features, then re-fit the existing PCA object on the scaled data.
ss = StandardScaler()
x_scaled = ss.fit_transform(x)
x_pca = pca.fit_transform(x_scaled)

In [None]:
# Shape of the PCA-transformed, standardised features.
x_pca.shape

In [None]:
# Shape of the target vector, for comparison with the feature matrix.
y.shape

# 3. Model Training

In [None]:
# Target as a plain array (the values underlying the pandas Series).
y2 = y.values

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Settings of the forest that is evaluated with its out-of-bag score.
rf_settings = dict(
    criterion='gini',
    n_estimators=100,
    min_samples_split=12,  # the original cell notes 20 next to this value
    min_samples_leaf=1,
    oob_score=True,
    random_state=1,
    n_jobs=-1,
)
rf = RandomForestClassifier(**rf_settings)

# Fit on the whole feature matrix and report the out-of-bag score.
# NOTE(review): the original comment here read "filter SP data"; its meaning is unclear.
rf.fit(x, y2)
print("Out Of Bag score is %.4f" % rf.oob_score_)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y2,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Train a default random forest (fixed seed) on the training split.
RFC = RandomForestClassifier(random_state=101)
RFC.fit(X_train, y_train)

# Score the held-out split with accuracy (the only metric imported here).
rfc_test_accuracy = accuracy_score(y_test, RFC.predict(X_test))
print ("Accuracy on testing data of RandomForestClassifier: {:.4f}".format(rfc_test_accuracy))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the forest's predictions on the held-out split
# (true labels first, predicted labels second).
rfc_test_predictions = RFC.predict(X_test)
confusion_matrix(y_test, rfc_test_predictions)

# 4. Model Tuning
Because of memory constraints, the grid search below is left commented out and is not run here.

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate hyper-parameter values for the (currently disabled) grid search.
parameters = dict(
    n_estimators=[10, 50, 100, 200, 400],
    min_samples_split=[8, 12, 16, 20],
    min_samples_leaf=[1, 2, 3, 4, 5],
)

In [None]:
#grid = GridSearchCV(rf, parameters)
#grid_fit = grid.fit(x, y2)

In [None]:
# Get the estimator
#best_rf = grid_fit.best_estimator_

In [None]:
# Make predictions using the unoptimized and the optimized models
#predictions_rf = (rf.fit(X_train, y_train)).predict(X_test)
#best_predictions_rf = best_rf.predict(X_test)

#from sklearn.metrics import accuracy_score
#print ("Accuracy on testing data of RandomForestClassifier: {:.4f}".format(accuracy_score(y_test, best_predictions_rf)))

In [None]:
#from sklearn.metrics import confusion_matrix
#confusion_matrix(y_test, best_predictions_rf)