In [1]:
from sklearn.pipeline import Pipeline
from functools import partial
import numpy as np
import re
import pandas as pd
import xgboost
import pickle
import copy
from scipy.spatial import distance
from functools import partial
import re,string
import json



In [68]:
class GenerateDescriptionFeature():
    """
    Add one boolean feature column per regex pattern searched in a column
    whose entries are iterables of strings.

    Parameters
    ----------
    fromCol : str
        Name of the column to search. Each entry is iterated and every
        element is lower-cased before matching.
    dictionary : dict
        Maps a regex pattern to the name of the feature column to create.
        Patterns are matched against lower-cased text.
    """
    def __init__(self, fromCol, dictionary):
        # Shallow copy so later edits to the caller's dict do not affect this object.
        self.dict = copy.copy(dictionary)
        self.fromCol = fromCol
    
    def transform(self, DF):
        """
        Add the feature columns to DF in place and return DF.

        For every (pattern, column name) pair, the new column is True where
        at least one element of the row's entry matches the pattern.
        """
        print("Creating Feature from unit features")
        for pattern, featureName in self.dict.items():
            PartialSearch = partial(self._SearchStringInRow, string=pattern)
            # Series.map applies the search element-wise; DataFrame.applymap
            # (used previously) is deprecated in recent pandas releases.
            DF[featureName] = DF[self.fromCol].map(PartialSearch)
        return DF
    
    def fit(self, DF):
        """Same as transform; returns the transformed DF."""
        return self.transform(DF)
    
    def _SearchStringInRow(self, entry, string):
        """
        Return True if any element of entry matches the regex `string`.

        Elements are lower-cased before matching. The result depends on
        whether a match occurred, not on the truthiness of the matched
        element, so an empty string that matches still counts.
        """
        matcher = re.compile(string)
        return any(matcher.search(element.lower()) is not None for element in entry)

In [88]:
class GenerateNearestNeighborFeature():
    """
    Aggregate a value column over the reference rows that lie within a
    distance threshold of each row being transformed.

    Parameters
    ----------
    initDF : DataFrame
        Reference frame; its positions and values are copied at construction.
    positionCol : list of str
        Columns holding the coordinates passed to scipy's cdist.
    valueCol : str
        Column of initDF whose values are aggregated.
    maxDist : float
        A reference row is a neighbour when its distance is strictly less
        than this value.
    funs : callable, default np.mean
        Aggregation applied to the neighbours' values; its __name__ is used
        in the generated column name.
    metric : str, default 'chebyshev'
        Distance metric name forwarded to cdist.

    Notes
    -----
    When transform is applied to the same rows that were used as initDF,
    every row is its own neighbour (distance 0), so the statistic includes
    the row's own value.
    """
    def __init__(self, initDF, positionCol, valueCol, maxDist, funs=np.mean, metric='chebyshev'):
        self.position = copy.copy(initDF[positionCol])
        self.value = copy.copy(initDF[valueCol])
        self.positionCol = positionCol
        self.valueCol = valueCol
        self.metric = metric
        self.maxDist = maxDist
        self.funs = funs
        
    def transform(self, DF):
        """
        Add the neighbour statistic column to DF in place and return DF.

        DF only needs the position columns. The aggregated values always
        come from the reference data stored at construction, so DF may have
        a different number of rows and need not contain valueCol.
        """
        print("Generate nearest neighbor features")
        # Rows: rows of DF; columns: rows of the reference positions.
        pairwise = distance.cdist(np.asarray(DF[self.positionCol]),
                                  np.asarray(self.position), metric=self.metric)
        print("Pairwise distance calculated")
        indexNeighbor = pairwise < self.maxDist
        print("Neightbor list obtained")
        del pairwise
        newColName = 'feature_{0}_{1}_nbr_{2}'.format(self.valueCol, self.funs.__name__, self.maxDist)
        # Each mask has one flag per reference row, so it must select from the
        # stored reference values, not from the frame being transformed.
        DF[newColName] = [self.funs(self.value[mask]) for mask in indexNeighbor]
        return DF
    
    def fit(self, DF):
        """Same as transform; returns the transformed DF."""
        return self.transform(DF)

In [70]:
class GenerateRawColAsFeature():
    """
    Duplicate existing columns under prefixed names so they are picked up
    as features.

    fromcols lists the source columns; featureColHeader is the prefix put
    in front of each source column name.
    """
    def __init__(self, fromcols, featureColHeader = "feature_"):
        self.fromcols = fromcols
        self.featureColHeader = featureColHeader
    
    def transform(self, DF):
        """Add the prefixed copies to DF in place and return DF."""
        print("Use Raw Columns as Feature")
        prefix = self.featureColHeader
        prefixedNames = []
        for sourceName in self.fromcols:
            prefixedNames.append(prefix + sourceName)
        DF[prefixedNames] = DF[self.fromcols]
        return DF
    
    def fit(self, DF):
        """Same as transform; returns the transformed DF."""
        return self.transform(DF)

In [71]:
class KeepFeature():
    """
    Keep only the columns whose name starts with a given prefix.

    Parameters
    ----------
    featureColHeader : str, default "feature_"
        Literal prefix that marks a feature column.
    """
    def __init__(self, featureColHeader = "feature_"):
        self.featureColHeader = featureColHeader
        
    def transform(self, DF):
        """
        Return DF restricted to the columns starting with the prefix.

        The prefix is compared literally (str.startswith), so characters
        such as '.' or '(' in the prefix are not treated as regex syntax.
        Non-string column labels are never kept.
        """
        prefix = self.featureColHeader
        keepCols = [col for col in DF.columns
                    if isinstance(col, str) and col.startswith(prefix)]
        return DF[keepCols]
    
    def fit(self, DF):
        """Same as transform; returns the filtered DF."""
        return self.transform(DF)

In [91]:
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
train_path = "/home/weiwen/Documents/projects/Kaggle/rental_listing_inquiries/data/train.json"
# Decode explicitly as UTF-8 so the result does not depend on the platform's default locale encoding.
with open(train_path, encoding="utf-8") as file:
    train = json.load(file)

def ConvertJsonToDF(json, cols=None):
    """
    Convert a column-oriented mapping {column: {row_key: value}} into a DataFrame.

    Parameters
    ----------
    json : dict
        Maps each column name to a dict of row key -> value.
    cols : list of str, optional
        Columns to convert. When omitted or empty, all columns are used.

    Returns
    -------
    DataFrame
        Indexed by the row keys of the first selected column (in that
        column's key order), with one column per selected name. An empty
        DataFrame is returned when there are no columns at all.

    Raises
    ------
    AssertionError
        If cols names a column that is not present in json.
    ValueError
        If a selected column does not have the same set of row keys as the
        first selected column.
    """
    if cols:
        cols = list(cols)
        # Membership test per column; comparing two lists with `<` would be a
        # lexicographic comparison, not a subset check.
        missing = [col for col in cols if col not in json]
        assert not missing, "Unknown column(s): {0}".format(missing)
    else:
        cols = list(json.keys())
    if not cols:
        return pd.DataFrame()
    rowKey = list(json[cols[0]].keys())
    expectedKeys = set(rowKey)
    returnDF = pd.DataFrame(index=rowKey, columns=cols)
    for col in cols:
        if set(json[col].keys()) != expectedKeys:
            raise ValueError("Column {0!r} does not share the row keys of {1!r}".format(col, cols[0]))
        # Look values up by row key so columns stored in a different key order stay aligned.
        returnDF[col] = [json[col][key] for key in rowKey]
    return returnDF

# Build the training frame, then replace the categorical interest level with an integer code.
trainDF = ConvertJsonToDF(train)
interest_cat_to_int = {
    'medium': 1,
    'low': 0,
    'high': 2,
}
# A label outside the mapping raises KeyError.
trainDF['interest_level'] = list(map(interest_cat_to_int.__getitem__, trainDF['interest_level']))

In [73]:
# Regex pattern (matched against each lower-cased feature string) -> feature column name.
featureDict = {
    "laundry": "feature_laundry",
    "prewar|pre-war": "feature_prewar",
    # Pet words as whole words, but only in strings that do not contain the
    # standalone word "no". The previous pattern ended in " !no", which a regex
    # reads as literal text rather than negation, and its "^pets*" branch matched
    # any string starting with "pet" (e.g. "petite").
    # NOTE(review): phrasings such as "pets not allowed" are still counted as pet friendly.
    r"^(?!.*\bno\b).*\b(?:cat|dog|pet)s?\b": "feature_petfriendly",
    "no fee": "feature_nofee",
    "elevator": "feature_elevator",
    "wood": "feature_woodenfloor",
    "garden|patio": "feature_gardenpatio",
    "dishwasher": "feature_dishwasher",
    "fitness": "feature_fitness",
    "dining": "feature_diningroom",
    "pool": "feature_pool",
    "garage": "feature_garage",
    "doorman": "feature_doorman",
}
GenerateDescriptionFeatureParamed = GenerateDescriptionFeature(dictionary=featureDict, fromCol='features')

In [92]:
# Neighbour statistic of interest_level over reference rows within 0.001 of each row's
# (latitude, longitude); the aggregation and metric are left at the constructor defaults.
GenerateNearestNeighborFeatureParamed = GenerateNearestNeighborFeature(
    initDF=trainDF,
    positionCol=['latitude', 'longitude'],
    valueCol='interest_level',
    maxDist=0.001,
)

In [93]:
# Display the value column name stored on the configured neighbour-feature object.
GenerateNearestNeighborFeatureParamed.valueCol

'interest_level'

In [76]:
# Copy these raw columns into columns carrying the default "feature_" prefix.
GenerateRawColAsFeatureParamed = GenerateRawColAsFeature(
    fromcols=["bathrooms", "bedrooms", "price"],
)
# Column filter using the default "feature_" prefix.
KeepFeatureParamed = KeepFeature()

In [85]:
# Feature-engineering steps in execution order; the last step drops every non-feature column.
modelPipeline = Pipeline([
    ('unitfeature', GenerateDescriptionFeatureParamed),
    ('nearestneighbor', GenerateNearestNeighborFeatureParamed),
    ('rawcol', GenerateRawColAsFeatureParamed),
    ('filtercol', KeepFeatureParamed),
])

In [97]:
# trainDF2 = modelPipeline.transform(trainDF)
# NOTE(review): the pipeline transform above is commented out and the feature matrix is read
# from a pickle instead; presumably a cached result of that transform. Confirm it matches
# trainDF row for row before pairing it with labels taken from trainDF.
# pickle.load can execute arbitrary code; only load files you created yourself.
with open("train_X.pickle", "rb") as file:
    train_X = pickle.load(file)

In [116]:
# Boosted-tree classifier: 200 estimators, maximum tree depth 3, learning rate 0.05.
xgb = xgboost.XGBClassifier(max_depth=3, n_estimators=200, learning_rate=0.05)

In [100]:
# Target vector: the interest_level column of trainDF (integer-coded when trainDF was built).
train_y = trainDF['interest_level']

In [114]:
from sklearn.model_selection import train_test_split

In [115]:
# Hold out a test split (default proportions). A fixed random_state makes the split,
# and every metric computed from it, reproducible across kernel restarts.
# NOTE(review): assumes train_X and train_y are row-aligned; confirm against how train_X was produced.
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, random_state=0)

In [117]:
# Fit the classifier on the training split.
xgb.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [170]:
# Predicted class labels for the held-out split (predict returns labels, not probabilities).
y_test_pred = xgb.predict(X_test)

In [160]:
# One-hot encode the true test labels: one column per distinct label present in y_test.
y_test_array = np.array(pd.get_dummies(y_test))

In [168]:
# Multi-class log loss needs per-class probabilities. The hard labels in y_test_pred
# (from predict) are neither shaped like the one-hot matrix nor valid inputs to log.
# Probabilities are clipped away from 0 so a confident miss gives a large finite penalty.
# NOTE(review): assumes the columns of y_test_array are in the same class order as the
# predict_proba columns and that every class occurs in y_test.
y_test_proba = np.clip(xgb.predict_proba(X_test), 1e-15, 1.0)
logloss = -np.sum(np.multiply(y_test_array, np.log(y_test_proba))) / len(y_test)

In [178]:
# Rows of trainDF, selected by index label, whose test-set prediction differs from the true label.
failedPrediction = trainDF.loc[y_test.index[y_test_pred!=y_test]]


In [182]:
# Attach the predicted label to each misclassified row, using the same mismatch mask
# so the values line up with the rows selected for failedPrediction.
failedPrediction['predicted'] = y_test_pred[y_test_pred!=y_test]


In [186]:
# Write the misclassified rows to disk for manual inspection.
failedPrediction.to_csv('failed_prediction.csv')

In [327]:
# Display the fitted model's per-feature importance scores.
xgb.feature_importances_

array([ 0.03306878,  0.05767196,  0.01534392,  0.06878307,  0.02142857,
        0.04047619,  0.00343915,  0.01481481,  0.01216931,  0.01772487,
        0.0010582 ,  0.01904762,  0.05925926,  0.42671958,  0.11111111,
        0.0978836 ], dtype=float32)

In [330]:
# Pair each training column name with its importance score.
varImp = pd.DataFrame({
    'Variables': list(X_train.columns),
    'Importance': xgb.feature_importances_,
})

Unnamed: 0,Variables,Importance
13,feature_interest_level_mean_nbr_0.001,0.42672
14,feature_bathrooms,0.111111
15,feature_bedrooms,0.097884
3,feature_nofee,0.068783
12,feature_doorman,0.059259
1,feature_prewar,0.057672
5,feature_woodenfloor,0.040476
0,feature_laundry,0.033069
4,feature_elevator,0.021429
11,feature_garage,0.019048


In [261]:
# Scratch check: evaluate an arithmetic expression assembled from variable names.
# The expression string is built from literals in this cell, not from external input.
x = np.arange(10)
y = np.arange(10, 20)
eval("{0}+{1}".format("x", "y"))

array([10, 12, 14, 16, 18, 20, 22, 24, 26, 28])

In [260]:
# Direct element-wise sum of the two arrays.
x+y

array([10, 12, 14, 16, 18, 20, 22, 24, 26, 28])

In [263]:
# Build one "DF[<quoted column>]" expression string per column name.
test = ["DF[{0}]".format(x) for x in ["'col1'", "'col2'"]]

In [264]:
# Join the two column expressions into a single "<a>+<b>" expression string.
"{0}+{1}".format(*test)

"DF['col1']+DF['col2']"

In [310]:
# Re-import the local helper module so edits made on disk are picked up without a kernel restart.
# importlib.reload replaces imp.reload; the imp module is deprecated and absent from Python 3.12+.
import feature_funs
from importlib import reload
reload(feature_funs)

<module 'feature_funs' from '/home/weiwen/Documents/projects/Kaggle/rental_listing_inquiries/rental-listing-kaggle/FUNS/feature_funs.py'>

In [268]:
# Empty scratch frame. NOTE: rebinds the name `test`, which earlier held a list of strings.
test = pd.DataFrame()

In [322]:
# Two integer columns for the scratch frame; the second name deliberately contains a dot.
test['col1'] = np.arange(10)
test['col2.01'] = np.arange(10, 20)

In [324]:
# Configure a row-wise feature from two columns and an operation template with {0}/{1} placeholders.
# NOTE(review): 'col2.01' contains a dot; presumably the helper builds attribute-style
# expressions (DF.<col>), which such a name would break. Confirm in feature_funs.
GRWF = feature_funs.GenerateRowWiseFeature(fromcols=['col1', 'col2.01'], operations='{0}*{1}**2-{0}', identifier="testCols")

In [325]:
# Apply the row-wise feature to the scratch frame (the recorded run of this cell ended in a SyntaxError).
GRWF.transform(test)

SyntaxError: invalid syntax (<string>, line 1)

In [306]:
# Show the expression string produced by filling the operation template with the column names.
GRWF.operations.format(*GRWF.fromcols)

'DF.col1*DF.col2'

In [309]:
# Evaluate an attribute-style column expression directly against the scratch frame.
# NOTE(review): requires `test` to have columns named col1 and col2.
eval('test.col1*test.col2')

0      0
1     11
2     24
3     39
4     56
5     75
6     96
7    119
8    144
9    171
dtype: int64

In [321]:
# Scratch check: wrapping a list in another list makes the loop run once, over the whole list.
testList = ["good", "bad"]
for l in [testList]:
    print(l)

['good', 'bad']


In [339]:
# Number of photos attached to each listing, in row order.
[len(photos) for photos in trainDF['photos']]


[12,
 6,
 6,
 5,
 4,
 5,
 7,
 5,
 4,
 11,
 4,
 5,
 4,
 6,
 10,
 10,
 7,
 7,
 5,
 3,
 4,
 6,
 7,
 6,
 4,
 7,
 4,
 6,
 4,
 4,
 8,
 3,
 3,
 5,
 9,
 13,
 4,
 5,
 8,
 9,
 9,
 8,
 6,
 3,
 6,
 7,
 6,
 12,
 5,
 3,
 5,
 5,
 3,
 5,
 10,
 13,
 8,
 5,
 3,
 1,
 3,
 3,
 5,
 8,
 6,
 7,
 7,
 6,
 5,
 7,
 45,
 1,
 9,
 7,
 5,
 8,
 0,
 1,
 3,
 6,
 6,
 5,
 4,
 6,
 8,
 8,
 2,
 12,
 6,
 4,
 8,
 7,
 4,
 14,
 6,
 4,
 5,
 7,
 0,
 5,
 8,
 12,
 8,
 6,
 7,
 5,
 3,
 6,
 2,
 5,
 10,
 6,
 8,
 7,
 12,
 4,
 4,
 6,
 4,
 7,
 5,
 10,
 7,
 3,
 4,
 3,
 15,
 6,
 7,
 9,
 4,
 3,
 9,
 0,
 6,
 2,
 5,
 5,
 5,
 6,
 6,
 6,
 7,
 4,
 9,
 8,
 15,
 6,
 6,
 6,
 4,
 4,
 3,
 0,
 8,
 8,
 8,
 3,
 9,
 5,
 6,
 6,
 5,
 5,
 12,
 8,
 0,
 6,
 5,
 4,
 3,
 7,
 5,
 6,
 37,
 3,
 5,
 13,
 3,
 4,
 0,
 4,
 5,
 7,
 2,
 0,
 6,
 0,
 7,
 7,
 6,
 3,
 6,
 6,
 0,
 5,
 7,
 5,
 4,
 4,
 5,
 4,
 4,
 6,
 5,
 1,
 8,
 5,
 1,
 4,
 3,
 11,
 6,
 7,
 3,
 6,
 0,
 5,
 6,
 7,
 8,
 0,
 8,
 13,
 8,
 4,
 3,
 7,
 1,
 5,
 9,
 5,
 4,
 0,
 3,
 8,
 1,
 13,
 5,
 2,
 5,
 3,
 6,
 4,
 