# IMPORTING THE LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, f_classif
import datetime
import re
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# READING THE CSV FILES

In [None]:
# Load the raw training and test tables from CSV files in the working directory.
df_train = pd.read_csv(filepath_or_buffer='train.csv')
df_test = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
# Summary statistics of the numeric training columns, one row per column.
df_train.describe().transpose()

Unnamed: 0,condition,length(m),height(cm),X1,X2,breed_category,pet_category
count,17357.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0
mean,0.88339,0.502636,27.448832,5.369598,4.577307,0.600563,1.709143
std,0.770434,0.288705,13.019781,6.572366,3.517763,0.629883,0.717919
min,0.0,0.0,5.0,0.0,0.0,0.0,0.0
25%,0.0,0.25,16.1725,0.0,1.0,0.0,1.0
50%,1.0,0.5,27.34,0.0,4.0,1.0,2.0
75%,1.0,0.76,38.89,13.0,9.0,1.0,2.0
max,2.0,1.0,50.0,19.0,9.0,2.0,4.0


In [None]:
# Summary statistics of the numeric test columns, one row per column.
df_test.describe().transpose()

# CONVERTING THE LENGTH COLUMN FROM METRES TO CENTIMETRES

In [None]:
# Multiply every length by 100 (metres -> centimetres) in both tables. The column keeps
# its original 'length(m)' name, so downstream cells are unaffected.
df_train['length(m)'] = df_train['length(m)'].mul(100)
df_test['length(m)'] = df_test['length(m)'].mul(100)

# IMPUTING THE MISSING VALUES

In [None]:
# A length of 0 is treated as a missing measurement and replaced by the training mean.
# The mean is computed once, before any replacement (so it still includes the zero rows,
# exactly as before), and the same value is applied to the test table so that both
# tables go through identical preprocessing.
length_mean = df_train['length(m)'].mean()
df_train['length(m)'] = df_train['length(m)'].replace(0, length_mean)
df_test['length(m)'] = df_test['length(m)'].replace(0, length_mean)

# Missing 'condition' values become 0.0. The result is assigned back to the column
# instead of calling fillna(..., inplace=True) on the attribute: that chained in-place
# form can operate on a temporary object and leave the DataFrame unchanged.
df_train['condition'] = df_train['condition'].fillna(0.0)
df_test['condition'] = df_test['condition'].fillna(0.0)

# GENERATING THE TIME FEATURE

In [None]:
def calcdays(row):
    """Return the number of days from a row's issue date to its listing date.

    Args:
        row: Mapping-like record (for example a DataFrame row) whose
            'issue_date' and 'listing_date' entries are strings containing a
            date written as YYYY-MM-DD; any surrounding text is ignored.

    Returns:
        int: listing date minus issue date in whole days. The value is
        negative when the listing date precedes the issue date.

    Raises:
        AttributeError: if either string contains no YYYY-MM-DD substring.
        ValueError: if a matched substring is not a valid calendar date.
    """
    raw_dates = (row['issue_date'], row['listing_date'])
    # Only the first YYYY-MM-DD occurrence of each string is used.
    hits = [re.search(r'\d{4}-\d{2}-\d{2}', text) for text in raw_dates]
    issued, listed = (
        datetime.datetime.strptime(hit.group(), '%Y-%m-%d').date() for hit in hits
    )
    return (listed - issued).days


# Add a 'time' column to both tables by running calcdays on every row.
df_train['time'] = df_train.apply(calcdays, axis=1)
df_test['time'] = df_test.apply(calcdays, axis=1)

# DROPPING THE COLUMNS

In [None]:
# Remove the identifier and the raw date strings from the training table.
df_train = df_train.drop(columns=['pet_id', 'issue_date', 'listing_date'])
# Keep a copy of the test table taken before its columns are dropped; its 'pet_id'
# column is read again when the submission file is built.
df = df_test.copy()
df_test = df_test.drop(columns=['pet_id', 'issue_date', 'listing_date'])

# LABEL ENCODING THE COLOR_TYPE COLUMN

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the colour -> integer mapping from the training colours only, then apply that
# one mapping to both tables so a given colour gets the same code in train and test.
le = LabelEncoder().fit(df_train['color_type'])
df_train['color_type'] = le.transform(df_train['color_type'])
df_test['color_type'] = le.transform(df_test['color_type'])

# SPLITTING THE DATASET INTO FEATURES AND TARGETS

In [None]:
# Targets: the two label columns, each predicted by its own model.
y_breed = df_train['breed_category']
y_pet = df_train['pet_category']

# Features: every remaining training column. The test table carries no label columns,
# so an independent copy of it is used as-is.
X_train = df_train.drop(columns=['breed_category', 'pet_category'])
X_valid = df_test.copy()

In [None]:
# 'length(m)' was already multiplied by 100 on df_test in the length-conversion cell, and
# X_valid is a copy of df_test, so it must NOT be multiplied again here: a second x100
# would put the test lengths on a scale 100 times larger than the training lengths.
# Show the column's summary instead so its range can be compared with the training data.
X_valid['length(m)'].describe()

# CHECKING CROSS VALIDATION SCORE

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validated accuracy (cv=5) for the pet_category target. The scaler sits inside the
# pipeline, so it is re-fitted on the training part of every fold. The classifier step
# keeps its existing label 'rfc' even though the estimator is an XGBClassifier.
my_pipeline = Pipeline(
    steps=[
        ('StandardScaler', StandardScaler()),
        (
            'rfc',
            XGBClassifier(
                learning_rate=0.1,
                n_estimators=100,
                max_depth=5,
                min_child_weight=3,
            ),
        ),
    ]
)
scores = cross_val_score(my_pipeline, X_train, y_pet, scoring='accuracy', cv=5)
print(scores.mean())

0.8972600895413265


# STANDARD SCALING

In [None]:
# Standardise the features with statistics learned from the training data only.
# Calling fit_transform on the test set would re-fit the scaler and standardise the test
# features with their own mean/std, i.e. on a different scale than the model was trained on.
# StandardScaler is already imported in the imports cell at the top of the notebook.
ss = StandardScaler()

# Both matrices are built from df_train / df_test rather than from the X_train / X_valid
# variables. This keeps the cell re-runnable (X_train and X_valid are overwritten with
# NumPy arrays below) and guarantees that the test columns are in the training column order.
feature_columns = df_train.columns.drop(['breed_category', 'pet_category'])
X_train = ss.fit_transform(df_train[feature_columns])
X_valid = ss.transform(df_test[feature_columns])

# MAKING THE MODEL

In [None]:
# Gradient-boosted classifier for the pet_category target: fit on the training matrix,
# then predict labels for the test matrix.
pet = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    min_child_weight=3,
)
y_pet_pred = pet.fit(X_train, y_pet).predict(X_valid)

In [None]:
# Gradient-boosted classifier for the breed_category target, with the same
# hyper-parameters as the pet_category model.
breed = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    min_child_weight=3,
)
y_breed_pred = breed.fit(X_train, y_breed).predict(X_valid)

# MAKING PREDICTION FILE

In [None]:
# One row per test record: 'pet_id' from the copy of the test table taken before its
# columns were dropped, followed by the two predicted label columns.
submission = df[['pet_id']].assign(
    breed_category=y_breed_pred,
    pet_category=y_pet_pred,
)
submission.to_csv("submission.csv", index=False)

In [None]:
# How many test rows were assigned to each predicted breed category.
submission.loc[:, 'breed_category'].value_counts()