In [None]:
import re
import string
import warnings

import geopy.distance
import nltk
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore")

In [None]:
# Raw training listings; the test-set counterpart is kept commented out, as in the rest of the notebook.
train_path = '../input/two-sigma-connect-rental-listing-inquiries/train.json.zip'
train_df = pd.read_json(train_path)
# test_df = pd.read_json('../input/two-sigma-connect-rental-listing-inquiries/test.json.zip')

In [None]:
def create_features(train_df):
    '''
    Add simple engineered features to ``train_df`` in place.

    New columns: ``num_photos``, ``num_features``, ``num_description_words``,
    ``created_month``, ``created_day``, ``created_dayofweek``, ``logprice``,
    ``rooms`` and ``price_per_room``. The ``created`` column is converted to
    datetime and left in the frame.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must provide ``photos``, ``features``, ``description``, ``created``,
        ``price``, ``bathrooms`` and ``bedrooms``.

    Returns
    -------
    None
        The frame is modified in place.
    '''
    train_df["num_photos"] = train_df["photos"].apply(len)
    train_df["num_features"] = train_df["features"].apply(len)
    # Split on a single space (not on arbitrary whitespace): consecutive spaces
    # therefore contribute empty "words", exactly as before.
    train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))

    train_df["created"] = pd.to_datetime(train_df["created"], format='%Y-%m-%d %H:%M:%S')
    created = train_df["created"].dt
    train_df["created_month"] = created.month
    train_df["created_day"] = created.day
    train_df['created_dayofweek'] = created.dayofweek

    train_df['logprice'] = np.log(train_df['price'])

    # The previous version called train_df.drop(['created'], axis=1) here and
    # threw the result away, so nothing was ever removed. The dead statement is
    # gone; 'created' intentionally stays so callers can drop it themselves.

    # "+ 1" counts one extra room on top of bedrooms and bathrooms, which also
    # keeps the divisor below strictly positive for studio listings.
    train_df['rooms'] = train_df['bathrooms'] + train_df['bedrooms'] + 1
    train_df['price_per_room'] = train_df['price'] / train_df['rooms']
    
def create_geo_features(df):
    '''
    Add the geodesic distance from every listing to several New York landmarks.

    For each landmark a new column is written to ``df`` in place, holding
    ``geopy.distance.geodesic(...).m`` between the listing's
    (``latitude``, ``longitude``) and the landmark coordinates.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``latitude`` and ``longitude`` columns.

    Returns
    -------
    None
        The frame is modified in place.
    '''
    # Output column -> (latitude, longitude) of the landmark. Column names are
    # kept exactly as before (including the "Buildng" spelling) because they
    # become model feature names.
    landmarks = {
        'distance_to_WallStreet': (40.70664160590731, -74.00889639688603),
        'distance_to_EmpireStateBuildng': (40.748477220505485, -73.98566751076079),
        'distance_to_Statue': (40.68941208336766, -74.04443602895553),
        'distance_to_CentralPark': (40.780814403367714, -73.96735634584424),
        'distance_to_TimesSquare': (40.758096515338465, -73.9855748013695),
    }

    # Pair the coordinates by column name once. The old row-wise apply read
    # them as x[0] / x[1], i.e. positional access on a label-indexed row,
    # which recent pandas versions deprecate.
    points = list(zip(df['latitude'], df['longitude']))

    for column, cords in landmarks.items():
        df[column] = [geopy.distance.geodesic(point, cords).m for point in points]
    
def extra_description_info(df):
    '''
    Derive attention-related signals from the raw ``description`` text.

    The working assumption is that special symbols and capital letters are
    used to make a listing stand out. Columns written to ``df`` in place:

    * ``has_phone``  - 1 if, once punctuation is removed, some whitespace-
      separated token consists of exactly ten digits, else 0;
    * ``has_email``  - 1 if the text contains ``@renthop.com``, else 0;
    * ``num_of_#``, ``num_of_!``, ``num_of_$``, ``num_of_*``, ``num_of_>`` -
      number of occurrences of each symbol;
    * ``num_of_puncs`` - sum of the five symbol counts;
    * ``upper_char_ratio`` - share of upper-case letters among all letters
      (0 when the text has no letters).
    '''
    descriptions = df['description']
    punctuation_class = '['+string.punctuation+']'

    def contains_phone(text):
        tokens = re.sub(punctuation_class, '', text).split()
        ten_digit_tokens = [tok for tok in tokens if tok.isdigit() and len(tok) == 10]
        return 1 if len(ten_digit_tokens) > 0 else 0

    def uppercase_share(text):
        letters = [ch for ch in text if ch.isalpha()]
        if len(letters) == 0:
            return 0
        return sum(ch.isupper() for ch in letters) / len(letters)

    df['has_phone'] = descriptions.apply(contains_phone)
    df['has_email'] = descriptions.apply(lambda text: 1 if '@renthop.com' in text else 0)

    # One counter column per symbol, created in the same order as before.
    for symbol in ('#', '!', '$', '*', '>'):
        df['num_of_' + symbol] = descriptions.apply(lambda text, symbol=symbol: text.count(symbol))

    df['num_of_puncs'] = (
        df['num_of_#'] + df['num_of_!'] + df['num_of_$'] + df['num_of_*'] + df['num_of_>']
    )
    df['upper_char_ratio'] = descriptions.apply(uppercase_share)

def preprocessor(text, signs=True, digits=True, lowercase=True):
    '''
    Clean a piece of text with up to three optional passes, applied in order.

    Parameters
    ----------
    text : str
        Text to clean.
    signs : bool, default True
        Remove ``<...>`` markup tags.
    digits : bool, default True
        Remove every digit character.
    lowercase : bool, default True
        Lower-case the text and replace each run of non-word characters
        (punctuation, whitespace) with a single space. Underscores are word
        characters and are kept.

    Returns
    -------
    str
        The cleaned text. It may start or end with a space when the input
        started or ended with non-word characters.
    '''
    # Raw strings: "\d" and "\W" in ordinary literals are invalid escape
    # sequences and trigger a SyntaxWarning on Python 3.12+.
    if signs:
        text = re.sub(r'<[^>]*>', '', text)
    if digits:
        text = re.sub(r'\d', '', text)
    if lowercase:
        text = re.sub(r'[\W]+', ' ', text.lower())
    return text

_ENGLISH_STOPWORDS = None  # filled on first use and shared by all delStopWords calls


def delStopWords(text):
    '''
    Remove English stop words from ``text``.

    The text is tokenised with ``nltk.tokenize.word_tokenize`` and every token
    found in NLTK's English stop-word list is dropped. The comparison is
    case-sensitive, so callers are expected to lower-case the text first.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    '''
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # This function runs once per row; fetching the word list on every
        # call and testing membership against a list was pure overhead.
        _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))

    tokens = nltk.tokenize.word_tokenize(text)
    return ' '.join(w for w in tokens if w not in _ENGLISH_STOPWORDS)

def denoise_text_for_description(text):
    '''
    Clean a listing description.

    Runs ``preprocessor`` with all three passes enabled (tags, digits,
    lower-casing) and then removes stop words with ``delStopWords``.
    '''
    cleaned = preprocessor(text, signs=True, digits=True, lowercase=True)
    return delStopWords(cleaned)

def denoise_text_for_features(text):
    '''
    Clean the flattened ``features`` text.

    Same pipeline as for descriptions, except that the tag-removal pass of
    ``preprocessor`` is switched off (``signs=False``); stop words are then
    removed with ``delStopWords``.
    '''
    cleaned = preprocessor(text, signs=False, digits=True, lowercase=True)
    return delStopWords(cleaned)

def adress_prep(df, column_name):
    '''
    Normalise address columns of ``df`` in place.

    Steps, applied to every requested column:

    1. lower-case the text;
    2. expand the abbreviations ``st`` -> ``street``, ``ave`` -> ``avenue``,
       ``e`` -> ``east`` and ``w`` -> ``west`` (only when they stand as
       separate tokens, as defined by the patterns below);
    3. remove all spaces, so that spacing differences do not produce distinct
       address values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the address columns (string values).
    column_name : str or list of str
        One column name or a list of column names to normalise.

    Returns
    -------
    None
        The frame is modified in place.
    '''
    # Accept a single name as well as a list; the old code only worked for lists.
    columns = [column_name] if isinstance(column_name, str) else list(column_name)

    cleaned = df[columns].apply(lambda col: col.str.lower())
    # Raw strings avoid the invalid-escape SyntaxWarning that '\s' raises.
    cleaned = cleaned.replace([r'\sst\s', r'\sst$'], ' street', regex=True)
    cleaned = cleaned.replace([r'\save\s', r'\save$'], ' avenue', regex=True)
    cleaned = cleaned.replace([r'\se\s', r'^e\s'], ' east ', regex=True)
    cleaned = cleaned.replace([r'\sw\s', r'^w\s'], ' west ', regex=True)
    # Without regex=True, replace(' ', '') only matches cells that are exactly
    # one space, so the spaces inside addresses were never removed.
    cleaned = cleaned.replace(' ', '', regex=True)

    df[columns] = cleaned
    
def manager_prep(df):
    '''Placeholder for manager-specific preprocessing; currently does nothing.'''
    return None

def building_prep(df):
    '''Placeholder for building-specific preprocessing; currently does nothing.'''
    return None

In [None]:
%%time
# Add the basic engineered columns; create_features writes them into train_df in place.
create_features(train_df)
# create_features(test_df)

In [None]:
%%time
# Must run before the description is cleaned further down: these features
# read punctuation, digits and capital letters from the raw text.
extra_description_info(train_df)
# extra_description_info(test_df)

In [None]:
%%time
# Add the landmark-distance columns to train_df in place (one geodesic computation per row and landmark).
create_geo_features(train_df)
# create_geo_features(test_df)

In [None]:
%%time
# Apply the text clean-up to the description column (overwrites the raw text).
train_df['description']=train_df['description'].apply(denoise_text_for_description)
# test_df['description']=test_df['description'].apply(denoise_text_for_description)

In [None]:
%%time
# Flatten each list of feature phrases into one string: spaces and hyphens
# inside a phrase become underscores (e.g. "Pre-war" -> "Pre_war"), and the
# phrases themselves are separated by single spaces.
train_df['features'] = train_df["features"].apply(
    lambda phrases: " ".join(phrase.replace(" ", "_") for phrase in phrases).replace('-', '_')
)

# test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
# test_df['features'] = test_df['features'].str.replace('-','_')

In [None]:
%%time
# Apply the text clean-up to the features column (overwrites the flattened text).
train_df['features']=train_df['features'].apply(denoise_text_for_features)
# test_df['features']=test_df['features'].apply(denoise_text_for_features)

In [None]:
%%time
# Normalise both address columns of train_df in place.
adress_prep(train_df, ['display_address', 'street_address'])
# adress_prep(test_df, ['display_address', 'street_address'])

In [None]:
# Simple text vectorisation: TF-IDF over word 2- and 3-grams for the
# descriptions, plain token counts for the features.
# The boolean options are passed as real booleans: recent scikit-learn
# versions validate parameter types and reject the integer 1.
tfv_description = TfidfVectorizer(min_df=10,
                      max_features=200,
                      strip_accents='unicode',
                      lowercase=False,  # text was already lower-cased during clean-up
                      analyzer='word',
                      stop_words='english',
                      ngram_range=(2, 3),
                      use_idf=True,
                      smooth_idf=True,
                      sublinear_tf=True
                     )

count_vec = CountVectorizer(max_features=200,
                            strip_accents='unicode',
                            stop_words='english',
                            lowercase=False,
                            analyzer='word')

In [None]:
%%time
# Learn the TF-IDF vocabulary from the training descriptions and keep the result as a dense array.
tfidf_description_train = tfv_description.fit_transform(train_df["description"]).toarray()
# tfidf_description_test = tfv_description.transform(test_df["description"]).toarray()

In [None]:
%%time
# Learn the count vocabulary from the training features text and keep the result as a dense array.
count_vec_features_train = count_vec.fit_transform(train_df["features"]).toarray()
# count_vec_features_test = count_vec.transform(test_df["features"]).toarray()

Закодируем номинальные категориальные переменные с помощью LabelEncoder. Это не лучшее решение, потому что в таком случае значения street_address, building_id, display_address и manager_id можно будет сравнивать между собой как числа. Можно было бы извлечь информацию из адреса и id здания, но для этого требуется более глубокий анализ. Некоторые объявления имеют одинаковый building_id, но при этом у них отличаются адреса и координаты. Тем не менее эксперименты показали, что если убрать эти признаки совсем, результат будет хуже.

In [None]:
# Replace each high-cardinality categorical column by integer codes, with one
# encoder fitted per column on the training values only.
for column_name in ('building_id', 'manager_id', 'display_address', 'street_address'):
    label_encoder = LabelEncoder()
    column_values = train_df[column_name].values  # + test_df[column_name].values when the test set is enabled
    train_df[column_name] = label_encoder.fit_transform(column_values)
    # test_df[column_name] = label_encoder.transform(test_df[column_name].values)

# Manager processing
https://www.kaggle.com/code/den3b81/improve-perfomances-using-manager-features
Нельзя использовать данные из тестового датасета, поэтому извлекаем статистики из тренировочного набора и применяем их к тестовому.
Для менеджеров, которые не встречались в тренировочном наборе, и для менеджеров, у которых мало объявлений (<20), используем
средние значения.

In [None]:
# Training set: separate the target from the remaining columns.
X = train_df.drop(columns=["interest_level"])
y = train_df["interest_level"]

manager_stat_columns = ['high_frac', 'low_frac', 'medium_frac', 'manager_skill']

# Share of each interest level among a manager's listings. The columns are
# renamed by label (high -> high_frac, ...) rather than by position, so the
# result does not depend on the order in which get_dummies emits them.
temp = (
    pd.concat([X.manager_id, pd.get_dummies(y)], axis=1)
    .groupby('manager_id')
    .mean()
    .add_suffix('_frac')
)
# Number of listings per manager. size() counts rows; the previous
# count().iloc[:, 1] counted the non-null values of whichever column happened
# to be second.
temp['count'] = X.groupby('manager_id').size()

# Manager "skill": a high-interest listing weighs twice as much as a medium one.
temp['manager_skill'] = temp['high_frac']*2 + temp['medium_frac']

# Managers with fewer than 20 listings get the average profile of the
# well-observed managers instead of their own noisy statistics.
unranked_managers_ixes = temp['count'] < 20
ranked_managers_ixes = ~unranked_managers_ixes
mean_values = temp.loc[ranked_managers_ixes, manager_stat_columns].mean()
temp.loc[unranked_managers_ixes, manager_stat_columns] = mean_values.values

# NOTE(review): these target-derived statistics are computed on all of
# train_df, including the rows that train_test_split later holds out, so the
# hold-out log loss is optimistic. Computing them on the training split only
# (or out-of-fold) would fix that.
train_df = train_df.merge(temp.reset_index(), how='left', left_on='manager_id', right_on='manager_id')

# Test set
# test_df = test_df.merge(temp.reset_index(),how='left', left_on='manager_id', right_on='manager_id')
# new_manager_ixes = test_df['high_frac'].isnull()
# test_df.loc[new_manager_ixes,['high_frac','low_frac', 'medium_frac','manager_skill']] = mean_values.values

In [None]:
# Integer-encode the target. The order high=0, medium=1, low=2 is the column
# order assumed when predict_proba output is written to the submission file.
interest_level_map = {'high': 0, 'medium': 1, 'low': 2}
train_df['interest_level'] = train_df['interest_level'].map(interest_level_map)

In [None]:
# Drop the raw inputs that have been replaced by engineered features, plus the identifiers.
raw_columns = ['latitude', 'longitude', 'photos', 'created', 'description', 'features', 'listing_id', 'manager_id']
train_df = train_df.drop(columns=raw_columns)
# test_df = test_df.drop(['latitude', 'longitude', 'photos', 'created', 'description', 'features', 'manager_id'], axis = 1)

In [None]:
def _vectorizer_feature_names(vectorizer):
    """Return the fitted vocabulary of ``vectorizer`` as a list of column names."""
    # get_feature_names() was removed in scikit-learn 1.2; prefer
    # get_feature_names_out() and fall back only on old releases.
    getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    return list(getter())


count_vec_columns = _vectorizer_feature_names(count_vec)
tfidf_columns = _vectorizer_feature_names(tfv_description)

# Put tabular features, feature-token counts and description TF-IDF side by
# side. reset_index(drop=True) aligns the rows positionally with the dense
# arrays without creating 'index' / 'level_0' helper columns that then had to
# be dropped again.
X_train = pd.concat(
    [
        train_df.reset_index(drop=True),
        pd.DataFrame(count_vec_features_train, columns=count_vec_columns),
        pd.DataFrame(tfidf_description_train, columns=tfidf_columns),
    ],
    axis=1,
)

# X_test = pd.concat(
#     [
#         test_df.reset_index(drop=True),
#         pd.DataFrame(count_vec_features_test, columns=count_vec_columns),
#         pd.DataFrame(tfidf_description_test, columns=tfidf_columns),
#     ],
#     axis=1,
# )

Y_train = X_train['interest_level']
X_train = X_train.drop(['interest_level'], axis=1)

In [None]:
# %%time
# param_grid_xgb = {'min_child_weight': [1, 5],
#                   'subsample': [0.6, 0.7],
#                   'eval_metric': ['mlogloss'],
#                   'gamma': [0, 1],
#                   'max_depth': [6, 7],
#                   'learning_rate' : [0.5,0.1],
#                   'colsample_bytree': [0.7],
#                    'n_estimators': [1000]
#                  } 

# model =  xgb.XGBClassifier()

# # Run grid search 
# grid = GridSearchCV(model,
#                     cv = 3,
#                     param_grid = param_grid_xgb,
#                     refit = True,
#                     verbose = 1,
#                     n_jobs = -1) 


# # fit the model for grid search 
# grid.fit(X_train, Y_train)

In [None]:
# Hold out 20% of the training data for validation. The imports that used to
# sit here live in the notebook's import cell (train_test_split was already
# imported there).
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train,
                                                    test_size=0.2,
                                                    random_state=0)

In [None]:
# Fit the scaler on the training split only: fitting on the full X_train let
# the hold-out rows influence the scaling statistics.
scaler = StandardScaler()
scaler.fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
%%time
params = {'colsample_bytree': 0.7,
          'eval_metric': 'mlogloss',
          'gamma': 0,
          'learning_rate': 0.5,
          'max_depth': 7,
          'min_child_weight': 5,
          'n_estimators': 1000,
          'subsample': 0.7}

# The hyper-parameters must be unpacked into keyword arguments. Passing the
# dict positionally does not configure the model: depending on the xgboost
# version it either raises or ends up in the first positional parameter.
model = xgb.XGBClassifier(**params)
model.fit(x_train, y_train)

In [None]:
# Multi-class log loss on the hold-out split (log_loss comes from the
# notebook's import cell).
y_val_pred = model.predict_proba(x_test)
# Values noted by the author, presumably hold-out log loss from earlier runs:
# 0.55
# 0.57
# 0.5669
# 0.542566888770270
# Passing the label order explicitly keeps the probability columns aligned
# with the classes even if a class is missing from y_test.
print(log_loss(y_test, y_val_pred, labels=model.classes_))

In [None]:
# The test-set pipeline is commented out throughout this notebook, so X_test
# only exists once those lines are re-enabled. Skip gracefully instead of
# failing with a NameError on "Run All".
if 'X_test' in globals():
    listing_ids = X_test['listing_id']
    X_test = X_test.drop(['listing_id'], axis=1)

    # The model was trained on scaled features, so the test features must go
    # through the same fitted scaler before prediction.
    y_pred_prod = model.predict_proba(scaler.transform(X_test))

    # Probability columns follow the integer target encoding: 0=high, 1=medium, 2=low.
    submit = pd.DataFrame(data={'listing_id': listing_ids,
                                'high': y_pred_prod[:, 0],
                                'medium': y_pred_prod[:, 1],
                                'low': y_pred_prod[:, 2]})
    submit.to_csv('submit.csv', index=False)
else:
    print('X_test is not defined: enable the commented-out test-set steps to build a submission.')