In [None]:
#!conda install geopandas
#!conda install folium

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import folium
from folium.plugins import HeatMap
from collections import Counter
%matplotlib inline

In [None]:
df = pd.read_json('../input/two-sigma-connect-rental-listing-inquiries/train.json.zip').reset_index(drop=True)

In [None]:
df.info()

In [None]:
df.head(2)

# EDA

### target

In [None]:
vc = df['interest_level'].value_counts()
plt.bar(vc.index, vc.values,width=0.5)

#### Временные метки

In [None]:
df[["created"]] = df[["created"]].apply(pd.to_datetime)

In [None]:
df.groupby(df['created'].dt.month).size().plot.bar()

In [None]:
df.groupby(df['created'].dt.hour).size().plot.bar()

#### building_id

In [None]:
# One row per building: its id and how many listings reference it.
# rename_axis + reset_index(name=...) produces the same ['id', 'count'] columns
# regardless of how the installed pandas names the value_counts() result.
building_counts = (
    df['building_id']
    .value_counts()
    .rename_axis('id')
    .reset_index(name='count')
)
building_counts.head(2)

In [None]:
plt.hist(building_counts.loc[building_counts['id'] != '0']['count'], log=True)
plt.show()

#### description

In [None]:
df['description'][0]

#### display_address

In [None]:
addresses = pd.DataFrame(df['display_address'].value_counts()).reset_index()
addresses.head(2)

In [None]:
# Histogram of how many listings share each display address.
# The counts are taken straight from value_counts() so the plot does not depend
# on which column of `addresses` happens to hold them.
plt.hist(df['display_address'].value_counts().values)
plt.show()

In [None]:
df['display_address'].value_counts()

In [None]:
len(df.loc[df['display_address'] == ''])

#### features

In [None]:
df['features']

In [None]:
# Tally every feature tag across all listings, one listing at a time.
feature_counts = Counter()
for feat_list in df['features'].values:
    feature_counts.update(feat_list)
feature_counts.most_common(10)

In [None]:
plt.hist(feature_counts.values(), log=True)
plt.show()

#### latitude longitude

In [None]:
# gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude))
# gdf.crs = "epsg:4326"

В основном данные относятся к Нью-Йорку, но есть объявления и из других городов Америки. Также встречаются нулевые координаты. Отфильтруем данные по условным границам США и отобразим на карте.

In [None]:
# Keep only points inside a rough bounding box of the USA (same bounds as the
# preprocessing step below); this also removes the zero-coordinate rows.
in_usa = (
    (df['longitude'] > -120) & (df['longitude'] < -60) &
    (df['latitude'] > 20) & (df['latitude'] < 50)
)
geo_points = df.loc[in_usa, ['latitude', 'longitude']].values.tolist()

folium_map = folium.Map(location=[40.7, -74.1])
HeatMap(geo_points).add_to(folium_map)
folium_map

#### listing_id

In [None]:
len(df['listing_id'].value_counts())

#### manager_id

In [None]:
# One row per manager: its id and how many listings it posted.
# rename_axis + reset_index(name=...) produces the same ['id', 'count'] columns
# regardless of how the installed pandas names the value_counts() result.
manager_counts = (
    df['manager_id']
    .value_counts()
    .rename_axis('id')
    .reset_index(name='count')
)
manager_counts.head(2)

In [None]:
plt.hist(manager_counts['count'], log=True)
plt.show()

In [None]:
manager_counts.loc[manager_counts['id']=='']

#### price

In [None]:
len(df.loc[df['price'] > 70000])

In [None]:
plt.hist(df.loc[df['price'] < 70000]['price'],log=True, bins=20)
plt.show()

Значения больше 70000 можно считать выбросами

#### street_address

In [None]:
df['street_address'].value_counts()

In [None]:
len(df.loc[df['street_address']==''])

### base preprocessing

In [None]:
def filter_by_treshold(data, col, treshold, fill_val):
    """Replace rare values of a column with a single placeholder.

    Values of ``data[col]`` that occur ``treshold`` times or fewer are replaced
    by ``fill_val``; values occurring strictly more often are kept.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame. It is not modified; a new frame is returned.
    col : str
        Name of the column to filter.
    treshold : int
        A value is kept only if its count is strictly greater than this.
    fill_val : object
        Placeholder written in place of rare (and missing) values.

    Returns
    -------
    pd.DataFrame
        Copy of ``data`` with ``col`` filtered.
    """
    # Work on the counts Series directly instead of renaming the columns of
    # value_counts().reset_index(), whose names differ between pandas versions.
    value_counts = data[col].value_counts()
    frequent_values = value_counts.index[value_counts > treshold]

    # Only the target column goes through where(); the rest is left untouched.
    column = data[col]
    filtered = column.where(column.isin(frequent_values), other=fill_val)

    print(f'{col} size reduced from {len(value_counts)} to {len(frequent_values)}')
    return data.assign(**{col: filtered})

def add_from_lists(data, col_name='features', min_count_features=1):
    """Expand a column of lists into 0.0/1.0 indicator columns.

    Every item that appears more than ``min_count_features`` times across all
    lists of ``data[col_name]`` gets its own float column named
    ``f'{col_name}_{item}'`` holding 1.0 where the row's list contains the item
    and 0.0 otherwise.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame. It is not modified; the result has a fresh 0..n-1 index.
    col_name : str
        Name of the column whose cells are lists of hashable items.
    min_count_features : int
        An item is kept only if its total count is strictly greater than this.

    Returns
    -------
    pd.DataFrame
        ``data`` (re-indexed) with the indicator columns appended on the right.
    """
    # reset_index returns a new frame, so the caller's index is left alone.
    data = data.reset_index(drop=True)

    feature_counts = Counter(feat for feat_list in data[col_name] for feat in feat_list)
    allowed_features = [feat for feat, cnt in feature_counts.items() if cnt > min_count_features]

    # Map each kept item to its column position for O(1) lookups.
    position = {feat: idx for idx, feat in enumerate(allowed_features)}
    indicators = np.zeros((len(data), len(allowed_features)))
    for row_idx, feat_list in enumerate(data[col_name]):
        for feat in feat_list:
            col_idx = position.get(feat)
            if col_idx is not None:
                indicators[row_idx, col_idx] = 1.0

    indicator_frame = pd.DataFrame(
        indicators,
        columns=[f'{col_name}_{feat}' for feat in allowed_features],
        index=data.index,
    )
    return pd.concat([data, indicator_frame], axis=1)

In [None]:
df = pd.read_json('../input/two-sigma-connect-rental-listing-inquiries/train.json.zip')

In [None]:
# Keep listings inside a rough USA bounding box and below the price-outlier
# cut-off. The explicit .copy() makes the result an independent frame, so the
# column assignments below do not act on a slice of the raw data.
in_usa = (
    (df['longitude'] > -120) & (df['longitude'] < -60) &
    (df['latitude'] < 50) & (df['latitude'] > 20)
)
df = df.loc[in_usa & (df['price'] < 70000)].copy()

df['created'] = pd.to_datetime(df['created'])
df['hour'] = df['created'].dt.hour

# Collapse rare categorical values into a placeholder, then expand the
# `features` lists into indicator columns.
df = filter_by_treshold(df, 'building_id', 10, '0')
df = filter_by_treshold(df, 'display_address', 10, '')
df = filter_by_treshold(df, 'manager_id', 10, '0')
df = filter_by_treshold(df, 'street_address', 10, '')
df = add_from_lists(df, 'features', 1)

exclude_features = ['created', 'description','listing_id', 'photos', 'features']
df = df.drop(columns=exclude_features)

# Baseline

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, LabelBinarizer, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.impute import SimpleImputer
from sklearn.utils.class_weight import compute_class_weight
from catboost import CatBoostClassifier, Pool

In [None]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that passes through only the configured DataFrame columns."""

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X[self.columns]``.

        Raises ``KeyError`` listing the configured columns absent from ``X``
        when the selection fails.
        """
        assert isinstance(X, pd.DataFrame)
        try:
            selected = X[self.columns]
        except KeyError:
            # Report exactly which of the requested columns are missing.
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError(f"DataFrame не содердит следующие колонки: {cols_error}")
        else:
            return selected
            
class NumberSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that selects one column and keeps it two-dimensional."""

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the single column ``self.key`` of ``X`` via a one-element list selection."""
        wanted = [self.key]
        return X[wanted]

class OHEEncoder(BaseEstimator, TransformerMixin):
    """One-hot encoder built on ``pd.get_dummies`` with a column layout fixed at fit time."""

    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        """Remember the dummy column names produced for the fitting data."""
        self.columns = list(pd.get_dummies(X, prefix=self.key).columns)
        return self

    def transform(self, X):
        """Encode ``X`` and return exactly the columns seen during ``fit``.

        Dummy columns that were seen at fit time but are absent from this
        batch are added and filled with 0; columns not seen at fit time are
        dropped by the final selection.
        """
        encoded = pd.get_dummies(X, prefix=self.key)
        present = list(encoded.columns)
        missing = [name for name in self.columns if name not in present]
        for name in missing:
            encoded[name] = 0
        return encoded[self.columns]

In [None]:
df.head(2)

In [None]:
target = ['interest_level']
features = [feat for feat in df.columns if feat != target[0]]
categorical_columns = ['building_id', 'display_address', 'manager_id', 'street_address', 'hour']
continuous_columns = ['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price']
# Everything that is neither categorical, continuous nor the target is treated
# as a binary indicator column.
binary_features = [col for col in features if col not in categorical_columns + continuous_columns + target]
target_num_map = {'high':0, 'medium':1, 'low':2}
# Use a 1-D Series as the target: passing the one-column DataFrame df[target]
# hands the classifier a column vector instead of the 1-D labels it expects.
y = df[target[0]]
X_train, X_test, y_train, y_test = train_test_split(df[features], y, random_state=0, stratify=y)

In [None]:
feature_prep_pipeline = make_pipeline(
    FeatureSelector(columns=features),
    FeatureUnion(transformer_list=[
        ("numeric_features", make_pipeline(
            FeatureSelector(continuous_columns),
            SimpleImputer(strategy="median"),
            StandardScaler()
        )),
        ("categorical_features", make_pipeline(
            FeatureSelector(categorical_columns),
            SimpleImputer(strategy="most_frequent"),
            OneHotEncoder(handle_unknown='ignore')
        )),
        ("boolean_features", make_pipeline(
            FeatureSelector(binary_features),
        ))
    ])
)

In [None]:
model_pipeline = Pipeline([
    ('prep', feature_prep_pipeline),
    ('model', RandomForestClassifier(random_state = 42)),
])
model_pipeline.fit(X_train, y_train)
pred = np.array(model_pipeline.predict(X_test))
pred2 = np.array(model_pipeline.predict_proba(X_test))
log_loss(y_test,pred2)

## CatBoost

In [None]:
# Balanced per-class weights for the training labels, keyed by class label.
classes = np.unique(y_train)
class_weights = dict(zip(
    classes,
    compute_class_weight(class_weight='balanced', classes=classes, y=np.ravel(y_train.values)),
))
class_weights

In [None]:
class Filter:
    """Per-column preprocessing that is learned on one frame and replayed on others.

    Two independent modes are supported for the column ``column_name``:

    * rare-value collapsing: ``by_treshold`` learns which values are frequent
      and ``transform_treshold`` re-applies that decision to new data;
    * list expansion: ``add_from_lists`` learns which list items are frequent
      and ``transform_lists`` creates the same indicator columns on new data.

    None of the methods modify the frame passed in; each returns a new frame.
    """

    def __init__(self, column_name):
        self.ids = None               # DataFrame ['id', 'count'] of kept values, set by by_treshold
        self.allowed_features = None  # list of kept list items, set by add_from_lists
        self.column_name = column_name
        self.fill_val = None          # placeholder for rare values, set by by_treshold

    def by_treshold(self, data, count_threshold, fill_val):
        """Learn the frequent values of the column and collapse the rest.

        A value is kept when its count in ``data`` is strictly greater than
        ``count_threshold``; every other value is replaced by ``fill_val``.
        Returns a new frame with the column filtered.
        """
        self.fill_val = fill_val
        # rename_axis + reset_index(name=...) gives ['id', 'count'] columns
        # regardless of how the installed pandas names the value_counts() result.
        counts = (
            data[self.column_name]
            .value_counts()
            .rename_axis('id')
            .reset_index(name='count')
        )
        self.ids = counts.loc[counts['count'] > count_threshold]
        print(f'{self.column_name} size reduced from {len(counts)} to {len(self.ids)}')
        return self.transform_treshold(data)

    def add_from_lists(self, data, min_count_features=1):
        """Learn the frequent list items and append their indicator columns.

        An item is kept when it occurs strictly more than
        ``min_count_features`` times across all lists of the column. Returns a
        new frame with a fresh 0..n-1 index and one float column
        ``f'{column_name}_{item}'`` per kept item.
        """
        feature_counts = Counter(
            feat for feat_list in data[self.column_name] for feat in feat_list
        )
        self.allowed_features = [
            feat for feat, cnt in feature_counts.items() if cnt > min_count_features
        ]
        return self._append_indicators(data)

    def transform_treshold(self, data):
        """Replace values not learned by ``by_treshold`` with the stored placeholder.

        Raises ``RuntimeError`` if ``by_treshold`` has not been called yet.
        """
        if self.ids is None:
            raise RuntimeError(
                f"Filter('{self.column_name}'): by_treshold() must be called before transform_treshold()"
            )
        column = data[self.column_name]
        filtered = column.where(column.isin(self.ids['id']), other=self.fill_val)
        return data.assign(**{self.column_name: filtered})

    def transform_lists(self, test_data):
        """Append the indicator columns learned by ``add_from_lists`` to new data.

        Raises ``RuntimeError`` if ``add_from_lists`` has not been called yet.
        """
        if self.allowed_features is None:
            raise RuntimeError(
                f"Filter('{self.column_name}'): add_from_lists() must be called before transform_lists()"
            )
        return self._append_indicators(test_data)

    def _append_indicators(self, data):
        """Return ``data`` re-indexed 0..n-1 with one 0.0/1.0 column per allowed item."""
        # drop=True: the old index must not be turned into an extra data column,
        # otherwise it ends up among the model features.
        data = data.reset_index(drop=True)

        # Map each kept item to its column position for O(1) lookups.
        position = {feat: idx for idx, feat in enumerate(self.allowed_features)}
        indicators = np.zeros((len(data), len(self.allowed_features)))
        for row_idx, feat_list in enumerate(data[self.column_name]):
            for feat in feat_list:
                col_idx = position.get(feat)
                if col_idx is not None:
                    indicators[row_idx, col_idx] = 1.0

        indicator_frame = pd.DataFrame(
            indicators,
            columns=[f'{self.column_name}_{feat}' for feat in self.allowed_features],
            index=data.index,
        )
        return pd.concat([data, indicator_frame], axis=1)

class My_model:
    """CatBoost multi-class model bundled with its preprocessing.

    ``run`` prepares a training frame, fits the scaler and the CatBoost
    classifier and returns the hold-out log loss; ``predict`` replays the same
    preprocessing on new data and returns class probabilities per listing.

    Parameters
    ----------
    count_threshold : int
        Passed to every ``Filter.by_treshold`` call: categorical values must
        occur strictly more often than this to be kept.
    max_price : number
        Training rows with ``price`` at or above this value are dropped.
    """

    # Raw columns removed once the engineered features have been derived;
    # shared by prepare() and predict() so both stay in sync.
    _EXCLUDED_COLUMNS = ['created', 'photos', 'features', 'description']

    def __init__(self, count_threshold, max_price):
        self.model=None
        self.count_threshold=count_threshold
        self.max_price=max_price
        self.scaler=None
        self.target = ['interest_level']
        self.categorical_columns = ['building_id', 'display_address', 'manager_id', 'street_address', 'hour']
        self.continuous_columns = ['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price']
        self.features = None

        self.building_id_filter = Filter('building_id')
        self.display_address_filter = Filter('display_address')
        self.manager_id_filter = Filter('manager_id')
        self.street_address_filter = Filter('street_address')
        self.features_filter = Filter('features')

    def prepare(self, df):
        """Clean a training frame and fit the column filters on it.

        Rows outside the longitude/latitude bounding box or with a price at or
        above ``max_price`` are dropped, an ``hour`` column is derived from
        ``created``, rare categorical values are collapsed, the ``features``
        lists are expanded into indicator columns and the raw columns listed in
        ``_EXCLUDED_COLUMNS`` are removed. The caller's frame is left untouched.
        """
        in_bounds = (
            (df['longitude'] > -120) & (df['longitude'] < -60) &
            (df['latitude'] < 50) & (df['latitude'] > 20)
        )
        # .copy() detaches the result from the caller's frame, so the column
        # assignments below neither modify the input nor write to a slice.
        df = df.loc[in_bounds & (df['price'] < self.max_price)].copy()
        df['created'] = pd.to_datetime(df['created'])
        df['hour'] = df['created'].dt.hour

        df = self.building_id_filter.by_treshold(df, self.count_threshold,'0')
        df = self.display_address_filter.by_treshold(df, self.count_threshold,'')
        df = self.manager_id_filter.by_treshold(df,self.count_threshold, '0')
        df = self.street_address_filter.by_treshold(df,self.count_threshold, '')
        df = self.features_filter.add_from_lists(df, 1)
        return df.drop(columns=self._EXCLUDED_COLUMNS)

    def predict(self, df_test):
        """Return class probabilities for new listings.

        The result has a ``listing_id`` column followed by one column per class,
        labelled 0, 1, 2, ... in the column order returned by the fitted
        model's ``predict_proba``. No rows are filtered out.

        Raises ``RuntimeError`` if ``run`` has not been called yet.
        """
        if self.model is None or self.scaler is None or self.features is None:
            raise RuntimeError('My_model.predict() was called before run(); fit the model first.')

        test_df = df_test.copy(deep=True)

        test_df['created'] = pd.to_datetime(test_df['created'])
        test_df['hour'] = test_df['created'].dt.hour
        test_df = self.building_id_filter.transform_treshold(test_df)
        test_df = self.display_address_filter.transform_treshold(test_df)
        test_df = self.manager_id_filter.transform_treshold(test_df)
        test_df = self.street_address_filter.transform_treshold(test_df)
        test_df = self.features_filter.transform_lists(test_df)
        test_df = test_df.drop(columns=self._EXCLUDED_COLUMNS)
        test_df[self.continuous_columns] = self.scaler.transform(test_df[self.continuous_columns])

        test_pool = Pool(data=test_df[self.features], cat_features=self.categorical_columns)
        proba = self.model.predict_proba(test_pool)
        # Give the probabilities the same index as test_df so the concat pairs
        # each listing_id with its own row whatever index test_df carries.
        proba_frame = pd.DataFrame(proba, index=test_df.index)
        return pd.concat([test_df['listing_id'], proba_frame], axis=1)

    def run(self, df, class_weights=None):
        """Prepare ``df``, fit the scaler and the classifier, return hold-out log loss.

        Parameters
        ----------
        df : pd.DataFrame
            Raw training listings including the ``interest_level`` target.
        class_weights : dict or None
            Optional per-class weights forwarded to CatBoost when truthy.

        Returns
        -------
        float
            Log loss on the stratified hold-out split, which is also the set
            used as ``eval_set`` during fitting.
        """
        df = self.prepare(df)

        # Every remaining column except the target and the listing identifier.
        self.features = [feat for feat in df.columns if feat not in [self.target[0],'listing_id']]

        y = df[self.target]
        X_train, X_test, y_train, y_test = train_test_split(df[self.features], y, random_state=0, stratify=y)
        # Independent copies, so scaling below does not write into split views.
        X_train = X_train.copy()
        X_test = X_test.copy()

        self.scaler = StandardScaler()
        X_train[self.continuous_columns] = self.scaler.fit_transform(X_train[self.continuous_columns])
        X_test[self.continuous_columns] = self.scaler.transform(X_test[self.continuous_columns])

        catboost_default_params = {
            'silent':True,
            'random_state':21,
            'early_stopping_rounds':50,
            'iterations' : 300,
            'loss_function' : 'MultiClass',
            'thread_count' : -1
        }
        if class_weights:
            catboost_default_params['class_weights'] = class_weights
        train_pool = Pool(data=X_train, label=y_train, cat_features=self.categorical_columns)
        test_pool = Pool(data=X_test, label=y_test,cat_features=self.categorical_columns)
        self.model = CatBoostClassifier( **catboost_default_params)
        self.model.fit(train_pool, plot=True, eval_set=test_pool)

        # Score through the same Pool-based path that predict() uses.
        pred = np.array(self.model.predict_proba(test_pool))
        result = log_loss(y_test,pred)

        return result

In [None]:
df = pd.read_json('../input/two-sigma-connect-rental-listing-inquiries/train.json.zip').reset_index()

my_model = My_model(3, 70000)
result = my_model.run(df)
print(result)

In [None]:
test = pd.read_json('../input/two-sigma-connect-rental-listing-inquiries/test.json.zip').reset_index()
test_prediction = my_model.predict(test)

In [None]:
sub_gs = pd.read_csv('../input/two-sigma-connect-rental-listing-inquiries/sample_submission.csv.zip')

# The probability columns of test_prediction are labelled 0, 1, 2 and follow
# the order of my_model.model.classes_, which is not necessarily
# high/medium/low, so name them from the model instead of by position.
class_by_position = {pos: str(label) for pos, label in enumerate(my_model.model.classes_)}
proba_by_listing = test_prediction.rename(columns=class_by_position)

# Join on listing_id rather than relying on both frames sharing one row order.
sub_gs = sub_gs[['listing_id']].merge(
    proba_by_listing[['listing_id', 'high', 'medium', 'low']],
    on='listing_id',
    how='left',
)

sub_gs.to_csv('submission.csv', index = False)
sub_gs.head()

In [None]:
# test_prediction.rename(columns={0:'high', 1:'medium', 2:'low'}, inplace=True)
# sub_gs = pd.read_csv('../input/two-sigma-connect-rental-listing-inquiries/sample_submission.csv.zip')
# pd.merge(sub_gs.drop(columns=['high', 'medium', 'low']), test_prediction, on='listing_id').to_csv('submission_final.csv', index = False)