In [None]:
# Libraries: data handling, filesystem access and plotting.
import numpy as np 
import pandas as pd 
import os
import json
import seaborn as sns 
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use the 'ggplot' style sheet for the matplotlib figures drawn below.
plt.style.use('ggplot')
import warnings
# NOTE(review): this hides every warning for the rest of the session,
# including any emitted by pandas or scikit-learn in later cells.
warnings.filterwarnings("ignore")

In [None]:
# Label files for breeds, colors and states.
breeds = pd.read_csv('../input/breed_labels.csv')
colors = pd.read_csv('../input/color_labels.csv')
states = pd.read_csv('../input/state_labels.csv')

# Main train/test tables and the sample submission file.
train = pd.read_csv('../input/train/train.csv')
test = pd.read_csv('../input/test/test.csv')
sub = pd.read_csv('../input/test/sample_submission.csv')

# Tag each row with the dataset it came from so the combined frame can
# still be told apart after concatenation.
train['DatasetType'] = 'train'
test['DatasetType'] = 'test'
# ignore_index=True gives all_data a fresh 0..n-1 index; without it the row
# labels of train and test are both kept, so labels repeat in the result.
all_data = pd.concat([train, test], ignore_index=True)

In [None]:
# Show the entries of the input directory.
print(os.listdir("../input"))

In [None]:
# Preview the first training rows with the Description column left out.
train.drop(columns=['Description']).head()

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train.info()

* We have almost 15 thousand dogs and cats in the dataset;
* Main dataset contains all important information about pets: age, breed, color, some characteristics and other things;
* Descriptions were analyzed using Google's Natural Language API, which provides sentiments and entities. I suppose we could do a similar thing ourselves;
* There are photos of some pets;
* Some meta-information was extracted from images and we can use it;
* There are separate files with labels for breeds, colors and states.

Let's start with the main dataset.

I have also created a full dataset by combining train and test data. This is done purely for more convenient visualization. The column `DatasetType` shows which dataset each row belongs to.

## Data exploration

**Target: Adoption speed**

* 0 - Pet was adopted on the same day as it was listed.
* 1 - Pet was adopted between 1 and 7 days (1st week) after being listed.
* 2 - Pet was adopted between 8 and 30 days (1st month) after being listed.
* 3 - Pet was adopted between 31 and 90 days (2nd & 3rd month) after being listed.
* 4 - No adoption after 100 days of being listed. (There are no pets in this dataset that waited between 90 and 100 days).

In [None]:
# Histogram grid for the training data. The listed columns are left out;
# DataFrame.hist only draws the numeric columns among the rest.
non_txt_features = train.columns.difference(
    ["Name", "State", "RescuerID", "PetID", "DatasetType"]
)
train.loc[:, non_txt_features].hist(figsize=(20, 20))

**In training dataset:**
* Adoption Speed: Just a few lucky pets were adopted immediately (< 500/15000 ~ 3%). The largest group of pets was not adopted at all (< 4200/15000 ~ 28%). About 27% (4000/15000) were adopted within 1 month.
* Type: 56% dogs & 44% cats.
* Age: Most pets are under 2 months of age.
* Color: Black and brown are the most popular.
* FurLength: Most pets have short fur (57%). Only 7% have long fur.
* Gender: 50% are female, 33% are male & others are mixed.
* Health: Most pets are healthy (95%).
* MaturitySize: Medium accounts for 80%.
* Dewormed: Most pets were dewormed (55%).
* Sterilized: Most pets have not been neutered yet (67%). Only 17% were neutered.
* Vaccinated: 47% not vaccinated, 40% vaccinated.
* Fee: Most pets are free.
* Quantity: In 80% of cases there is only 1 pet per advertisement.
* PhotoAmt: Most pets have fewer than 5 photos.
* VideoAmt: Most pets do not have video.

In [None]:
# Same histogram grid for the test data. AdoptionSpeed is additionally
# excluded from the column list here.
non_txt_features = train.columns.difference(
    ["Name", "State", "RescuerID", "PetID", "DatasetType", "AdoptionSpeed"]
)
test.loc[:, non_txt_features].hist(figsize=(20, 20))

In [None]:
# Histogram grid for train and test combined; reuses the column list
# (non_txt_features) defined in the previous cell.
all_data.loc[:, non_txt_features].hist(figsize=(20, 20))

## Logistic Regression

In [None]:
import sklearn
from sklearn.preprocessing import scale 
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import preprocessing

### Preprocessing

In [None]:
from sklearn.preprocessing import OneHotEncoder
# handle_unknown='ignore': a category that was not seen during fit is
# encoded as all zeros at transform time instead of raising an error.
enc = OneHotEncoder(handle_unknown='ignore')
# Logistic regression with scikit-learn's default settings.
# NOTE(review): warnings are silenced at the top of the notebook, so a
# convergence warning from this model would not be shown — confirm the
# default iteration budget is sufficient.
l = LogisticRegression()

In [None]:
# Model inputs: every column except the free-text fields, the identifiers,
# the target and the DatasetType tag.
# DatasetType is 'train' on every training row and 'test' on every test
# row. Left in, the one-hot encoder would turn it into a column that is
# always 1 while fitting and always 0 when predicting on the test set
# (the unseen 'test' category is ignored), so it must not be a feature.
features = train.columns.difference(
    ["Name", "Description", "PetID", "RescuerID", "AdoptionSpeed", "DatasetType"]
)
train_x = train[features]
train_y = train["AdoptionSpeed"]  # target column
test_x = test[features]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# One-hot encode all feature columns. The encoder is fitted on the full
# training frame before the split, so its category vocabulary also covers
# the rows that end up in the validation part.
x_train_1h = enc.fit_transform(train_x)

# Hold out 20% of the training rows for validation. A fixed random_state
# makes the split, and therefore the reported accuracy, repeatable across
# runs.
train_xx, cv_x, train_yy, cv_y = train_test_split(
    x_train_1h, train_y, test_size=0.2, random_state=42
)

# Fit on the 80% part and score on the held-out 20%.
l.fit(train_xx, train_yy)
y_pred = l.predict(cv_x)
accuracy = accuracy_score(cv_y, y_pred)
accuracy

In [None]:
# Refit the encoder and the classifier on the complete training set, then
# predict the test set. y_pred is overwritten with the test predictions.
X_train_one_hot = enc.fit_transform(train_x)
X_test_one_hot = enc.transform(test_x)
y_pred = l.fit(X_train_one_hot, train_y).predict(X_test_one_hot)
print(X_train_one_hot.shape)

In [None]:
# Display the array of predicted labels for the test set.
y_pred

In [None]:
# Look at the first rows of the sample submission before filling it in.
sub.head()

In [None]:
# Write the predictions into the submission frame in one vectorized
# assignment instead of a per-row loop.
# A length mismatch fails loudly here; the row-by-row version would
# silently append rows or leave sample values in place.
# NOTE(review): this assumes the rows of `sub` are in the same order as the
# rows of `test` — confirm against the sample submission file.
assert len(y_pred) == len(sub), "number of predictions must match submission rows"
sub['AdoptionSpeed'] = np.asarray(y_pred).astype(int)
sub.head()

In [None]:
# Save the submission; index=False keeps the row index out of the CSV.
sub.to_csv('submission.csv', index=False)