In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import itertools
import matplotlib.pyplot as plt
import string
import re
import collections
from sklearn import  preprocessing
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.metrics import make_scorer, f1_score, accuracy_score, mean_absolute_error, confusion_matrix
import optuna
from lofo import LOFOImportance, Dataset, plot_importance ## to install !pip install lofo-importance
%matplotlib inline
import itertools


In [None]:
# READ DATA
# Train and test listings are zipped JSON files in the same input folder.
train_path = '../input/two-sigma-connect-rental-listing-inquiries/train.json.zip'
test_path = '../input/two-sigma-connect-rental-listing-inquiries/test.json.zip'

train_df = pd.read_json(train_path)
test_df = pd.read_json(test_path)

In [None]:
# Summary of the raw training frame (columns, dtypes, non-null counts).
train_df.info()

**Our target, `interest_level`, is stored as an object (string) column, as the output above shows.**

**Let's convert it to a numeric code so that it is easier to analyze:**

* 0 : low
* 1 : medium
* 2 : high

In [None]:
# Ordinal encoding of the interest level:
#   'low' -> 0, 'medium' -> 1, every other value (i.e. 'high') -> 2
level_codes = {'low': 0, 'medium': 1}
train_df['target'] = train_df['interest_level'].apply(
    lambda level: level_codes.get(level, 2))

### BASIC FEATURES

In [None]:
# Show the first listing's description before the clean-up in the next cell.
train_df['description'].iloc[0]

In [None]:
# REMOVE HTML LEFTOVERS FROM DESCRIPTION
# Only literal tag fragments are removed. A blanket replace("br", "") must not
# be used here: it also deletes "br" inside ordinary words
# (e.g. "brooklyn" -> "ooklyn", "bright" -> "ight").
for fragment in ("<br />", "<br/>", "<br>", "<p><a"):
    train_df['description'] = train_df['description'].apply(
        lambda text: text.replace(fragment, ""))

In [None]:
# Same listing again, after the clean-up cell above has run.
print(train_df['description'].iloc[0])

In [None]:
# BASIC FEATURES (training data)

# total number of rooms: bedrooms + bathrooms
train_df['rooms'] = train_df['bedrooms'] + train_df['bathrooms']

# how many photos and how many amenity tags ("features") a listing has
train_df["num_photos"] = train_df["photos"].apply(len)
train_df["num_features"] = train_df["features"].apply(len)

# number of pieces the description splits into on single spaces
train_df["num_description_words"] = train_df["description"].apply(
    lambda text: len(text.split(" ")))

# 1 when the description contains something shaped like an e-mail address
regex = r'[\w\.-]+@[\w\.-]+'
train_df['has_email'] = train_df['description'].apply(
    lambda text: int(bool(re.findall(regex, text))))

# 1 when, once punctuation is removed, at least one whitespace-separated
# token consists of exactly 10 digits (e.g. "212-555-1234" -> "2125551234")
train_df['has_phone'] = train_df['description'].apply(
    lambda text: int(any(
        token.isdigit() and len(token) == 10
        for token in re.sub('['+string.punctuation+']', '', text).split())))

# lower-case and strip every amenity tag
train_df["features"] = train_df["features"].apply(
    lambda tags: [tag.lower().strip() for tag in tags])

### APPLY SAME OPERATIONS TO THE TEST DATA

In [None]:
# REMOVE HTML LEFTOVERS FROM DESCRIPTION
# Only literal tag fragments are removed. A blanket replace("br", "") must not
# be used here: it also deletes "br" inside ordinary words
# (e.g. "brooklyn" -> "ooklyn", "bright" -> "ight").
for fragment in ("<br />", "<br/>", "<br>", "<p><a"):
    test_df['description'] = test_df['description'].apply(
        lambda text: text.replace(fragment, ""))

# FEATURE ENGINEERING (same columns as built for the training data)

# total number of rooms: bedrooms + bathrooms
test_df['rooms'] = test_df['bedrooms'] + test_df['bathrooms']

# how many photos and how many amenity tags ("features") a listing has
test_df["num_photos"] = test_df["photos"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

# number of pieces the description splits into on single spaces
test_df["num_description_words"] = test_df["description"].apply(
    lambda text: len(text.split(" ")))

# 1 when the description contains something shaped like an e-mail address
regex = r'[\w\.-]+@[\w\.-]+'
test_df['has_email'] = test_df['description'].apply(
    lambda text: int(bool(re.findall(regex, text))))

# 1 when, once punctuation is removed, at least one whitespace-separated
# token consists of exactly 10 digits (e.g. "212-555-1234" -> "2125551234")
test_df['has_phone'] = test_df['description'].apply(
    lambda text: int(any(
        token.isdigit() and len(token) == 10
        for token in re.sub('['+string.punctuation+']', '', text).split())))

# lower-case and strip every amenity tag
test_df["features"] = test_df["features"].apply(
    lambda tags: [tag.lower().strip() for tag in tags])

### MOST FREQUENT FEATURES EXTRACTION

In [None]:
# One list of amenity tags per listing.
feature_value_train = train_df['features'].tolist()
feature_value_test = test_df['features'].tolist()

# Flatten them into one long list of tags per dataset (duplicates kept,
# because the frequencies are counted from these lists later).
feature_lst_train = [tag for tags in feature_value_train for tag in tags]
feature_lst_test = [tag for tags in feature_value_test for tag in tags]

# Distinct tags per dataset.
uniq_feature_train = list(set(feature_lst_train))
uniq_feature_test = list(set(feature_lst_test))

# A cell only renders its last expression, so both counts are returned
# together as (train, test) instead of as separate bare expressions.
len(uniq_feature_train), len(uniq_feature_test)

In [None]:
# see the frequency of each feature
def most_common(lst):
    """Count how often each item of `lst` occurs.

    Parameters
    ----------
    lst : iterable of hashable items (here: amenity tag strings).

    Returns
    -------
    pandas.DataFrame with the columns 'feature_value' and 'frequency',
    sorted by 'frequency' in descending order. An empty input gives an
    empty frame with the same two columns.
    """
    counts = collections.Counter(lst)
    # Materialise the dict views as lists: plain lists are accepted by the
    # DataFrame constructor regardless of the pandas version in use.
    df = pd.DataFrame({
        'feature_value': list(counts.keys()),
        'frequency': list(counts.values()),
    })
    return df.sort_values(by='frequency', ascending=False)

# Frequency table of the amenity tags, one per dataset.
df_features_train, df_features_test = (
    most_common(tags) for tags in (feature_lst_train, feature_lst_test)
)

# Only the last expression of a cell is rendered, so this shows the test
# table; evaluate df_features_train in its own cell to see the train table.
df_features_test

In [None]:
def newColumn(name, df, series):
    """Add a 0/1 indicator column called `name` to `df` and return `df`.

    The column is 1 at every position whose entry in `series` contains
    `name` (`name in entry`: list membership for list entries, substring
    match for string entries) and 0 elsewhere. Entries are matched to rows
    of `df` by position, not by index label. `df` is modified in place.
    """
    hit_positions = [pos for pos, entry in enumerate(series) if name in entry]
    indicator = pd.Series(0, df.index, name=name)
    if hit_positions:
        indicator.iloc[hit_positions] = 1
    df[name] = indicator
    return df

# select features based on frequency
facilities = [
    'elevator',
    'cats allowed',
    'hardwood floors',
    'dogs allowed',
    'doorman',
    'dishwasher',
    'no fee',
    'laundry in building',
    'fitness center',
]

# One 0/1 indicator column per amenity in both datasets. newColumn adds the
# column to the frame it is given, so its return value is not needed here.
for frame in (train_df, test_df):
    for name in facilities:
        newColumn(name, frame, frame['features'])

### Features after extraction

In [None]:
# Amenity tags of the first training listing (already lower-cased and stripped above).
print(train_df['features'].iloc[0])

### DATA VISUALIZATION

In [None]:
# Number of training listings per interest level.
plt.figure(figsize=(8,4))
# The column is passed by keyword (x=..., data=...): seaborn deprecated passing
# the data vector positionally, and newer releases no longer treat it as `x`.
sns.countplot(x='interest_level', data=train_df, alpha=0.8)
plt.title("INTEREST LEVEL COMPARE")
plt.xlabel('Interest level', fontsize=12)
plt.show()

- As we can see, listings with low interest are far more common than listings with medium or high interest.

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*";
# use whichever name this installation provides so the cell runs on both
# older and newer Matplotlib releases.
for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# Pairwise correlation of the engineered numeric columns and the target.
plt.figure(figsize=(12,8))
plt.title("CORRELATION BETWEEN NUMERICAL VALUES")
num_col = ["rooms", "num_photos", "num_features", "has_email", "has_phone", "price", "target"]
sns.heatmap(train_df[num_col].corr(), annot = True, fmt = ".2f")
plt.show()

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*";
# use whichever name this installation provides so the cell runs on both
# older and newer Matplotlib releases.
for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# Pairwise correlation of the amenity indicator columns and the target.
plt.figure(figsize=(12,8))
plt.title("CORRELATION BETWEEN NUMERICAL VALUES")
num_col = ['elevator', 'cats allowed', 'hardwood floors', 'dogs allowed', 'doorman', 'dishwasher', 'no fee', 'laundry in building', 'fitness center', 'target']
sns.heatmap(train_df[num_col].corr(), annot = True, fmt = ".2f")
plt.show()

In [None]:
### Rent interest graph of New-York
sns.lmplot(x="longitude", y="latitude", fit_reg=False, hue='interest_level',
           hue_order=['low', 'medium', 'high'], size=9, scatter_kws={'alpha':0.4,'s':30},
           data=train_df[(train_df.longitude>train_df.longitude.quantile(0.1))
                        &(train_df.longitude<train_df.longitude.quantile(0.9))
                        &(train_df.latitude>train_df.latitude.quantile(0.1))                           
                        &(train_df.latitude<train_df.latitude.quantile(0.9))]);
plt.xlabel('Longitude');
plt.ylabel('Latitude');

In [None]:
### Price exploration
prices=train_df.groupby('interest_level', as_index=False)['price'].mean()
colors = ['lightcoral','gold','lightblue']

fig=plt.figure(figsize=(8,6))
plt.bar(prices.interest_level, prices.price, color=colors, width=0.5, alpha=0.8)
#set titles
plt.xlabel('Interest level')
plt.ylabel('Average price')
plt.title('Average price across interest level')
plt.show()

In [None]:
# Non-null counts of every other column per (building, manager, interest level) combination.
train_df.groupby(['building_id', 'manager_id', 'interest_level']).count()

- A word cloud shows the most frequent words in the dataset: the more often a word occurs, the larger it is drawn.

In [None]:
#WORDCLOUD FOR FEATURES, DISPLAY ADDRESS, DESCRIPTION AND STREET ADDRESS

def _as_single_token(phrase):
    """Strip `phrase` and join its space-separated words with underscores.

    Presumably so that a multi-word tag or address is counted as one word
    by the word cloud.
    """
    return "_".join(phrase.strip().split(" "))


def _draw_wordcloud(corpus, title):
    """Draw a word cloud built from the string `corpus` under `title`."""
    plt.figure(figsize=(12,6))
    wordcloud = WordCloud(background_color='white', width=600, height=300, max_font_size=50, max_words=40).generate(corpus)
    wordcloud.recolor(random_state=0)
    plt.imshow(wordcloud)
    plt.title(title, fontsize=30)
    plt.axis("off")
    plt.show()


#Preprocessing
# Each corpus is built with a single join over the column. Growing the
# strings row by row inside an iterrows() loop copies the whole accumulated
# text on every row, which is quadratic in the size of the corpus.
text = " ".join(_as_single_token(feature)
                for features in train_df['features']
                for feature in features).strip()
text_da = " ".join(_as_single_token(address)
                   for address in train_df['display_address']).strip()
text_desc = " ".join(train_df['description']).strip()
text_str = " ".join(train_df['street_address']).strip()

# One figure per text source.
_draw_wordcloud(text, "Wordcloud for features")
_draw_wordcloud(text_da, "Wordcloud for Display Address")
_draw_wordcloud(text_desc, "Wordcloud for Description")
_draw_wordcloud(text_str, "Wordcloud for Street Address")

### DROP UNNECESSARY COLUMNS

In [None]:
# Columns removed from both datasets before modelling.
shared_drop_cols = ['created', 'description', 'features', 'photos']

# TRAINING DATASET
# 'interest_level' is dropped as well; its numeric encoding lives in 'target'.
train_df.drop(columns=['interest_level'] + shared_drop_cols, inplace=True)

# TEST DATASET
test_df.drop(columns=shared_drop_cols, inplace=True)

### LABEL ENCODING FOR CATEGORICAL VARIABLES

In [None]:
categorical = ["display_address", "manager_id", "building_id", "street_address"]

for col_name in categorical:
    # Columns that are no longer of object dtype are left untouched.
    if train_df[col_name].dtype != 'object':
        continue
    # The encoder is fitted on train and test values together, so every
    # value that occurs in either dataset gets a code.
    encoder = preprocessing.LabelEncoder()
    encoder.fit(list(train_df[col_name].values) + list(test_df[col_name].values))
    train_df[col_name] = encoder.transform(list(train_df[col_name].values))
    test_df[col_name] = encoder.transform(list(test_df[col_name].values))

## XGBOOST

In [None]:
# Target vector and feature matrix (every column except the target).
y = train_df['target']
X = train_df.drop(columns=['target'])

# 70/30 split, stratified on the target, with a fixed seed.
split_options = dict(test_size=.3, random_state=5, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# The same split in XGBoost's native DMatrix format.
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test, label=y_test)

In [None]:
# 5-fold cross-validated accuracy of a fixed-parameter XGBoost baseline,
# computed on the training part of the split.
kf = KFold(n_splits=5, shuffle=False)

# Array views under new names. Rebinding X_train / y_train to their .values
# would make this cell fail when it is run a second time, because arrays
# have no .values attribute.
X_train_arr = np.asarray(X_train)
y_train_arr = np.asarray(y_train)
scores = []

for fit_idx, val_idx in kf.split(X_train_arr, y_train_arr):
    model = XGBClassifier(n_estimators=1000, learning_rate=0.05, max_depth = 10)
    model.fit(X_train_arr[fit_idx], y_train_arr[fit_idx])
    # XGBClassifier.score is the accuracy on the given fold
    scores.append(model.score(X_train_arr[val_idx], y_train_arr[val_idx]))

In [None]:
def objective(trial):
    """Optuna objective: train a booster with sampled parameters and score it.

    Parameters
    ----------
    trial : optuna trial
        Used to sample `booster`, `learning_rate`, `max_depth`, `subsample`
        and `colsample_bytree`.

    Returns
    -------
    float
        Micro-averaged F1 of the predicted classes against `y_test`
        (higher is better).

    Reads the module-level `dtrain`, `dvalid` and `y_test`.
    """
    params = {
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart', 'gblinear']),
        'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int("max_depth", 3, 11),
        # XGBoost requires both sampling ratios to be greater than 0; the
        # lower bound also keeps trials from training on almost no rows/columns.
        'subsample': trial.suggest_float("subsample", 0.1, 1.0),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.1, 1.0),
    }

    # The target has the three classes 0/1/2, so train a multiclass model.
    # Without an explicit objective xgb.train fits a regression and the
    # outputs would have to be rounded into class labels.
    train_params = dict(params, objective='multi:softmax', num_class=3)

    bst = xgb.train(train_params, dtrain)
    # multi:softmax predicts the class index directly (as a float)
    pred_labels = bst.predict(dvalid).astype(int)
    return f1_score(y_test, pred_labels, average='micro')

In [None]:
# Search budget: stop after N_TRIALS trials or TIMEOUT_SECONDS seconds,
# whichever is reached first.
N_TRIALS = 100
TIMEOUT_SECONDS = 600

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=N_TRIALS, timeout=TIMEOUT_SECONDS)

In [None]:
new_params = study.best_params

# Fit only on the rows that are NOT in the hold-out split. X_test is a subset
# of X, so fitting on all of X and then scoring on X_test would evaluate the
# model on rows it has already seen.
X_fit = X.drop(index=X_test.index)
y_fit = y.drop(index=y_test.index)

new_model = XGBClassifier(**new_params)
new_model.fit(X_fit, y_fit)
preds = new_model.predict(X_test)

print('Optimized SuperLearner accuracy: ', accuracy_score(y_test, preds))
print('Optimized SuperLearner f1-score: ', f1_score(y_test, preds, average='micro'))

In [None]:
# Summary of the hyper-parameter search.
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")
print("  Params: ")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")

In [None]:
# Per-fold accuracies collected in `scores`, followed by their mean.
print("All of accuracies", scores, sep="\n")
print("Mean of accuracies", np.mean(scores), sep="\n")

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : 2-D numpy array; rows are labelled 'True label' and columns
        'Predicted label' on the plot.
    classes : sequence of tick labels, used for both axes.
    normalize : if true, every row is divided by its row sum before
        printing and plotting, and cells are shown with two decimals;
        otherwise cells are formatted as integers.
    title : title of the plot.
    cmap : colour map passed to `plt.imshow`.

    Draws on the current matplotlib figure and returns None.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=45)
    plt.yticks(tick_positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # cells above half of the maximum get white text, the others black
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            cell = cm[row, col]
            text_color = "white" if cell > half_max else "black"
            plt.text(col, row, format(cell, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Score the X_test / y_test split rather than every row of X: X contains the
# rows the model was fitted on, which makes the matrix look better than the
# model really is.
# NOTE(review): this is only a true hold-out estimate if new_model was fitted
# without the X_test rows - confirm in the cell that fits new_model.
y_pred = new_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)

# Tick labels in the order of the numeric target codes 0, 1, 2.
class_names = ['low', 'medium', 'high']
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cm, classes=class_names,
                      title='Confusion matrix')

### FEATURE IMPORTANCE BY LOFO

In [None]:
# Target column (three classes, coded 0/1/2) and the candidate features:
# every other column of train_df.
target = "target"
features = [col for col in train_df.columns if col != target]
dataset = Dataset(df=train_df, target=target, features=features)

# Validation scheme and scorer: 4 shuffled folds, scored with the mean
# absolute error between predicted and true class code (lower is better).
cv = KFold(n_splits=4, shuffle=True, random_state=0)
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# LOFO importance computed with the tuned XGBoost model.
lofo_imp = LOFOImportance(dataset, scoring=scorer, model=new_model, cv=cv)

# Mean and standard deviation of the importances as a pandas DataFrame.
importance_df = lofo_imp.get_importance()

# Plot the means and standard deviations of the importances.
plot_importance(importance_df)