In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier

In [60]:
train = pd.read_csv('train.csv', delimiter=',')
test = pd.read_csv('test.csv', delimiter=',')

In [61]:
train.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,,Brown,0.15,40.9,15,4,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1


In [89]:
train.fillna(train.mean(), inplace=True)
test.fillna(test.mean(),inplace=True)

In [90]:
train.head()

Unnamed: 0,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,2.0,19,0.8,7.78,13,9,0.0,1
1,1.0,54,0.72,14.19,13,9,0.0,2
2,1.0,16,0.15,40.9,15,4,2.0,4
3,1.0,54,0.62,17.82,0,1,0.0,2
4,2.0,3,0.5,11.06,18,4,0.0,1


In [91]:
labels = train['color_type'].astype('category').cat.categories.tolist()
replace = {'color_type' : {k: v for k,v in zip(labels,list(range(1,len(labels)+1)))}}
print(replace)

{'color_type': {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54, 55: 55, 56: 56}}


In [92]:
train.replace(replace, inplace=True)
train.head()

ValueError: Replacement not allowed with overlapping keys and values

In [93]:
labels_test = test['color_type'].astype('category').cat.categories.tolist()
replace_test = {'color_type' : {k: v for k,v in zip(labels,list(range(1,len(labels_test)+1)))}}
print(replace_test)

{'color_type': {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 12: 12, 13: 13, 14: 14, 15: 15, 16: 16, 17: 17, 18: 18, 19: 19, 20: 20, 21: 21, 22: 22, 23: 23, 24: 24, 25: 25, 26: 26, 27: 27, 28: 28, 29: 29, 30: 30, 31: 31, 32: 32, 33: 33, 34: 34, 35: 35, 36: 36, 37: 37, 38: 38, 39: 39, 40: 40, 41: 41, 42: 42, 43: 43, 44: 44, 45: 45, 46: 46, 47: 47, 48: 48, 49: 49, 50: 50, 51: 51, 52: 52, 53: 53, 54: 54}}


In [94]:
test.replace(replace, inplace=True)
test.head()

ValueError: Replacement not allowed with overlapping keys and values

In [95]:
train = train.drop(['pet_id', 'issue_date', 'listing_date'], axis=1)
train.head()

KeyError: "['pet_id' 'issue_date' 'listing_date'] not found in axis"

In [96]:
test = test.drop(['pet_id', 'issue_date', 'listing_date'], axis=1)
test.head()

KeyError: "['pet_id' 'issue_date' 'listing_date'] not found in axis"

In [70]:
train_y_breed = train['breed_category']
train_y_pet = train['pet_category']

In [71]:
train_y_pet.head()

0    1
1    2
2    4
3    2
4    1
Name: pet_category, dtype: int64

In [72]:
train_x = train.drop(['breed_category', 'pet_category'], axis=1)
train_x.head()

Unnamed: 0,condition,color_type,length(m),height(cm),X1,X2
0,2.0,19,0.8,7.78,13,9
1,1.0,54,0.72,14.19,13,9
2,1.0,16,0.15,40.9,15,4
3,1.0,54,0.62,17.82,0,1
4,2.0,3,0.5,11.06,18,4


In [73]:
x_train_breed,x_val_breed,y_train_breed,y_val_breed=train_test_split(train_x,train_y_breed,test_size=0.2)
x_train_pet,x_val_pet,y_train_pet,y_val_pet=train_test_split(train_x,train_y_pet,test_size=0.2)

In [74]:
log_breed = RandomForestClassifier(max_depth=15, random_state=45)
log_pet = RandomForestClassifier(max_depth=15, random_state=45)

In [75]:
log_breed.fit(x_train_breed, y_train_breed)
log_pet.fit(x_train_pet, y_train_pet)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=45, verbose=0,
                       warm_start=False)

In [76]:
pred_breed = log_breed.predict(x_val_breed)
pred_pet = log_pet.predict(x_val_pet)

In [77]:
print(f1_score(y_val_breed, pred_breed, average='weighted'))
print(f1_score(y_val_pet, pred_pet, average='weighted'))

0.853670032158533
0.8376774712927539


In [78]:
clf_breed = RandomForestClassifier(max_depth=15, random_state=45)
clf_pet = RandomForestClassifier(max_depth=15, random_state=45)

In [79]:
clf_breed.fit(train_x, train_y_breed)
clf_pet.fit(train_x, train_y_pet)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=15, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=45, verbose=0,
                       warm_start=False)

In [80]:
predict_breed = clf_breed.predict(test)
predict_pet = clf_pet.predict(test)

In [81]:
ro=np.shape(test)[0]

In [82]:
predict_breed.resize(ro,1)
predict_pet.resize(ro,1)

In [83]:
test_data = pd.read_csv('test.csv', delimiter=',')

In [84]:
pet_id = test_data['pet_id']
pet_id.head()

0    ANSL_75005
1    ANSL_76663
2    ANSL_58259
3    ANSL_67171
4    ANSL_72871
Name: pet_id, dtype: object

In [85]:
solution = pd.DataFrame(pet_id, columns =['pet_id'])
solution.head()

Unnamed: 0,pet_id
0,ANSL_75005
1,ANSL_76663
2,ANSL_58259
3,ANSL_67171
4,ANSL_72871


In [86]:
solution['breed_category'] = predict_breed
solution['pet_category'] = predict_pet
solution.head()

Unnamed: 0,pet_id,breed_category,pet_category
0,ANSL_75005,1.0,2
1,ANSL_76663,0.0,1
2,ANSL_58259,0.0,2
3,ANSL_67171,0.0,1
4,ANSL_72871,0.0,2


In [87]:
solution["breed_category"] = solution["breed_category"].astype(int)
solution.head()

Unnamed: 0,pet_id,breed_category,pet_category
0,ANSL_75005,1,2
1,ANSL_76663,0,1
2,ANSL_58259,0,2
3,ANSL_67171,0,1
4,ANSL_72871,0,2


In [88]:
solution.to_csv('pred.csv',index=False)