In [16]:
import numpy as np
import pandas as pd
from scipy import sparse
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from sklearn import metrics
import string
from datetime import datetime
from catboost import CatBoostClassifier, Pool
# Load the zipped competition JSON files and renumber the rows 0..n-1.
# `reset_index(drop=True)` discards the old index instead of materialising it
# as an `index` column that is dropped straight away.
train_df = pd.read_json(
    "/kaggle/input/two-sigma-connect-rental-listing-inquiries/train.json.zip",
    compression='zip',
).reset_index(drop=True)
test_df = pd.read_json(
    "/kaggle/input/two-sigma-connect-rental-listing-inquiries/test.json.zip",
    compression='zip',
).reset_index(drop=True)

In [17]:
# Ordered (needle, replacement) pairs that merge near-synonymous amenity tags.
# Order matters: e.g. 'outdoor space' must be rewritten before the
# 'common outdoor_space' rule can match.
_FEATURE_SYNONYMS = (
    ('cats allowed', 'animals_allowed'),
    ('dogs allowed', 'animals_allowed'),
    ('hardwood floors', 'hard_floor'),
    ('hardwood', 'hard_floor'),
    ('laundry in building', 'laundry_near'),
    ('laundry in unit', 'laundry_near'),
    ('outdoor space', 'outdoor_space'),
    ('gardenpatio', 'outdoor_space'),
    ('common outdoor_space', 'outdoor_space'),
)

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# (pattern, replacement) pairs expanding address abbreviations.  Lookarounds
# are used so the surrounding whitespace is kept: the abbreviation alone is
# replaced, words are never glued together and no leading space is added.
_ADDRESS_REWRITES = (
    (r'(?<=\s)st(?=\s|$)', 'street'),
    (r'(?<=\s)ave(?=\s|$)', 'avenue'),
    (r'(?<!\S)e(?=\s)', 'east'),
    (r'(?<!\S)w(?=\s)', 'west'),
)


def _normalize_feature(raw):
    """Lower-case one amenity tag, strip punctuation and merge synonyms."""
    token = raw.lower().translate(_PUNCTUATION_TABLE)
    for needle, replacement in _FEATURE_SYNONYMS:
        token = token.replace(needle, replacement)
    return token


def _normalize_address(addresses):
    """Return a lower-cased copy of `addresses` with abbreviations expanded."""
    normalized = addresses.astype(str).str.lower()
    for pattern, replacement in _ADDRESS_REWRITES:
        normalized = normalized.str.replace(pattern, replacement, regex=True)
    return normalized


def get_features(train_df):
    """Build the engineered feature frame for a listings DataFrame.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must provide the columns ``features`` (iterable of strings per row),
        ``created`` (timestamps formatted ``%Y-%m-%d %H:%M:%S``),
        ``display_address``, ``bathrooms``, ``photos`` (sized container per
        row), ``bedrooms`` and ``price``.  The frame is NOT modified.

    Returns
    -------
    (train_new, al)
        ``train_new`` is a DataFrame sharing ``train_df``'s index with the
        columns ``new_features`` (de-duplicated normalised tags, first-seen
        order), ``bathrooms`` (whole bathrooms), ``half_bathrooms``
        (fractional part doubled, so 1.5 baths -> 1.0), ``photos_cnt``,
        ``month_number``, ``month_half`` (0 for days 1-14, 1 otherwise),
        ``bedrooms``, ``display_address`` (normalised) and ``price`` (floored
        to a multiple of 500).
        ``al`` is the flat list of every normalised tag occurrence across all
        rows (one entry per occurrence), intended for frequency counting.
    """
    al = []
    new_ft = []
    # Iterate positionally so the function does not depend on a 0..n-1 index.
    for raw_features in train_df['features'].tolist():
        cleaned = [_normalize_feature(raw) for raw in raw_features]
        al.extend(cleaned)
        # dict.fromkeys de-duplicates while keeping first-seen order.
        new_ft.append(list(dict.fromkeys(cleaned)))

    created = pd.to_datetime(train_df['created'], format='%Y-%m-%d %H:%M:%S')
    bathrooms = train_df['bathrooms']

    # Share the input's index so every column assignment below aligns row by row.
    train_new = pd.DataFrame(index=train_df.index)
    train_new['new_features'] = pd.Series(new_ft, index=train_df.index, dtype=object)
    train_new['bathrooms'] = bathrooms // 1
    train_new['half_bathrooms'] = (bathrooms % 1) * 2
    train_new['photos_cnt'] = train_df['photos'].apply(len)
    train_new['month_number'] = created.dt.month
    train_new['month_half'] = (created.dt.day >= 15).astype(int)
    train_new['bedrooms'] = train_df['bedrooms']
    # Normalise a copy; the caller's column is left untouched so repeated
    # calls on the same frame give the same result.
    train_new['display_address'] = _normalize_address(train_df['display_address'])
    train_new['price'] = train_df['price'] // 500 * 500
    return train_new, al

In [18]:
def common_features(al, top_n=15):
    """Return the `top_n` most frequent tokens in `al`, most frequent first.

    Parameters
    ----------
    al : iterable of hashable
        Token occurrences (one entry per occurrence).
    top_n : int, optional
        Maximum number of tokens to return.  Defaults to 15.

    Returns
    -------
    list
        Distinct tokens ordered by descending count.  Tokens with equal
        counts keep the order in which they first appear in `al` (the sort is
        stable).  Fewer than `top_n` items are returned when `al` has fewer
        distinct tokens; an empty input gives an empty list.
    """
    counts = Counter(al)
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return [token for token, _ in ranked[:top_n]]

In [19]:
def more_features(train_new,common_features):
    """Replace the `new_features` list column with 0/1 indicator columns.

    Parameters
    ----------
    train_new : pandas.DataFrame
        Frame containing a ``new_features`` column whose cells are
        collections of feature names.  It is not modified.
    common_features : iterable of str
        Feature names to turn into indicator columns, in output order.

    Returns
    -------
    pandas.DataFrame
        `train_new` without ``new_features``, plus one integer column per
        entry of `common_features` holding 1 when that name is present in the
        row's ``new_features`` and 0 otherwise.
    """
    # Materialise once so a one-shot iterable can be reused for every row.
    common_features = list(common_features)
    # Build all indicator rows first and construct the frame in one go;
    # growing a DataFrame row by row is quadratic, and DataFrame.append no
    # longer exists in pandas 2.x.
    rows = [
        [int(feature in listing_features) for feature in common_features]
        for listing_features in train_new['new_features']
    ]
    # Reuse the input's index so the join below aligns row for row.
    df = pd.DataFrame(rows, columns=common_features, index=train_new.index)
    train_new2 = train_new.join(df).drop('new_features', axis=1)
    return train_new2

In [20]:
# Engineer the training features once and unpack both results; calling
# get_features(train_df) a second time just to read the token list repeats
# all of the work on the same frame.
train_new, train_tokens = get_features(train_df)
# The indicator vocabulary is learned from the training tokens only and then
# reused for the test set so both frames get identical columns.
cf = common_features(train_tokens)
train = more_features(train_new, cf)
test_new, _ = get_features(test_df)
test = more_features(test_new, cf)

Unnamed: 0,bathrooms,half_bathrooms,photos_cnt,month_number,month_half,bedrooms,display_address,price,animals_allowed,laundry_near,...,dishwasher,no fee,fitness center,prewar,outdoor_space,roof deck,dining room,high speed internet,balcony,swimming pool
0,1.0,0.0,8,6,0,1,suffolk street,2500,0,1,...,1,0,0,0,1,0,0,0,0,0
1,1.0,0.0,3,6,1,2,thompson street,2500,1,0,...,0,0,0,1,0,0,0,0,0,0
2,1.0,0.0,1,6,1,0,sullivan street,2000,1,0,...,0,0,0,1,0,0,0,0,0,0
3,1.0,0.0,4,6,1,2,jones street,2500,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,0.0,6,6,1,1,exchange place,3000,1,1,...,0,0,1,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74654,1.0,0.0,10,4,1,1,150 east 107th street,1500,0,0,...,0,0,0,0,0,0,0,0,0,0
74655,1.0,0.0,4,4,1,2,east 33rd st.,4000,1,1,...,1,1,0,0,0,0,0,0,0,0
74656,1.0,0.0,0,4,1,0,lexington avenue,2000,1,0,...,0,0,0,0,0,0,0,0,0,0
74657,2.0,0.0,8,4,0,2,park avenue,6500,1,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
# display_address is a free-text column and is left out of the model inputs.
train_data = train.drop('display_address',axis=1)
train_labels = train_df['interest_level']
test_data = test.drop('display_address',axis=1)

model = CatBoostClassifier(iterations=100,
                           depth=4,
                           random_seed = 42,
                           learning_rate=0.05,
                           loss_function='MultiClass',
                           verbose=False)

model.fit(train_data,train_labels)

preds_class = model.predict(test_data)
preds_proba_gs = model.predict_proba(test_data)
df333=pd.DataFrame(preds_proba_gs)

# The columns of predict_proba follow model.classes_, which is not
# guaranteed to be high/medium/low.  Look each label's column up by name
# instead of assuming positions 0/1/2, otherwise probabilities can end up
# under the wrong header in the submission.
class_position = {str(label): position for position, label in enumerate(model.classes_)}

submission = pd.DataFrame()#listing_id,high,medium,low
submission['listing_id']=test_df['listing_id']
for level in ('high', 'medium', 'low'):
    submission[level] = df333[class_position[level]]
submission.to_csv('submission.csv', index=False)