In [2]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import label_binarize
from geopy.distance import vincenty
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from scipy.spatial import distance
import re
from nltk.stem import PorterStemmer
from nltk.metrics import edit_distance

In [3]:
class RentalListing:
    """ This class performs all the feature engineering and builds the four models (LightGBM, GBM, XGBoost,
        RandomForest) that we decided on, along with their submission and pickle files """
        
    def __init__(self, trainFile, testFile):
        """Remember the input locations and create the four untrained classifiers.

        trainFile -- handed to pd.read_json by read_data to load the training listings
        testFile  -- handed to pd.read_json for the test listings; when it is None,
                     read_data holds out part of the training listings instead
        """
        self.trainFile, self.testFile = trainFile, testFile

        # one keyword per line; the hyper-parameter values themselves are unchanged
        self.__gbm = GradientBoostingClassifier(
            learning_rate = 0.1,
            n_estimators = 260,
            max_depth = 9,
            max_features = 18,
            min_samples_split = 2300,
            min_samples_leaf = 60,
        )
        self.__lgb = LGBMClassifier(
            objective = 'multiclass',
            random_state = 10,
            learning_rate = 0.07,
            n_estimators = 280,
            max_depth = 10,
            min_child_samples = 22,
            feature_fraction = 0.75,
            max_bin = 265,
        )
        self.__xgb = XGBClassifier(
            objective = 'multi:softprob',
            learning_rate = 0.05,
            n_estimators = 1300,
            max_depth = 3,
            min_child_weight = 1,
            subsample = 0.7,
            colsample_bytree = 0.85,
            reg_alpha = 0.075,
            gamma = 0.3,
        )
        self.__rfc = RandomForestClassifier(
            n_estimators = 1050,
            max_depth = 30,
            min_samples_leaf = 1,
            min_samples_split = 10,
            max_features = 'sqrt',
            n_jobs = -1,
        )

        # filled in later by read_data, the feature-engineering steps and the report helpers
        self.train_data = self.train_labels = None
        self.train_labels_pred = self.train_labels_pred_prob = None

        self.test_data = self.test_listing_id = self.test_labels = None
        self.test_labels_pred = self.test_labels_pred_prob = None
    
    # read in the data - where magic happens
    def read_data(self):
        train_data = pd.read_json(self.trainFile)
        train_labels = train_data.interest_level
        if (self.testFile is not None):
            test_data = pd.read_json(self.testFile)
            if ('interest_level' in test_data.columns):
                test_labels = test_data.interest_level
            else:
                test_labels = None
        else:
            train_data, test_data, train_labels, test_labels = train_test_split(train_data, 
                            train_data['interest_level'], stratify = train_data['interest_level'], 
                            random_state = 10, test_size = 0.3)
        self.train_data = train_data
        self.train_labels = train_labels
        self.test_data = test_data
        self.test_labels = test_labels
        self.test_listing_id = test_data.listing_id
        
    # static methods - helper methods for feature engineering
    def cap_share(x):
        return sum(1 for c in x if c.isupper())/float(len(x)+1)

    @staticmethod
    def distance_park(x):
        """Return the Vincenty distance from Central Park, in miles multiplied by 100.

        x -- indexable whose items 0 and 1 are latitude and longitude

        Declared static because it takes no instance and is used as
        ``RentalListing.distance_park``; without the decorator Python 2 rejects
        the call as an unbound method.
        NOTE(review): ``vincenty`` is not available in newer geopy releases -
        confirm the installed version before upgrading.
        """
        central_park = (40.7829, -73.9654)
        current = (x[0], x[1])
        return (vincenty(central_park, current).miles) * 100

    @staticmethod
    def distance_city(x):
        """Return the Vincenty distance from the city centre, in miles multiplied by 100.

        x -- indexable whose items 0 and 1 are latitude and longitude

        Declared static because it takes no instance and is used as
        ``RentalListing.distance_city``; without the decorator Python 2 rejects
        the call as an unbound method.
        NOTE(review): ``vincenty`` is not available in newer geopy releases -
        confirm the installed version before upgrading.
        """
        city_center = (40.7128, -74.0060)
        current = (x[0], x[1])
        return (vincenty(city_center, current).miles) * 100

    # feature engineering stuff
    def basicFeatureEngineering(self):
        """Add the hand-crafted columns to both frames and keep only the model inputs.

        The same derivations are applied to the training frame and then to the
        test frame. Frequency counts (manager, street address, building) are
        computed within each frame separately. Afterwards ``self.train_data`` and
        ``self.test_data`` are replaced by the selection listed in ``X_vars``.
        """
        X_vars = ['bathrooms', 'total_rooms', 'latitude', 'longitude', 'price', 'num_photos',
                  'num_features',
                  'price_bed', 'created_day', 'total_days', 'num_description_words', 'no_lines_desc',
                  'description', 'num_cap_share',
                  'distance_park', 'distance_center', 'price_latitude', 'diff_rank', 'manager_count',
                  'street_count', 'building_count', 'manager_id', 'features', 'price_var']

        # (source column, new column) pairs for the "how often does this value occur" features
        frequency_features = (
            ("manager_id", "manager_count"),
            ("street_address", "street_count"),
            ("building_id", "building_count"),
        )

        def derive(frame):
            # size of the photo and feature lists
            frame["num_photos"] = frame.photos.apply(len)
            frame["num_features"] = frame.features.apply(len)

            # room totals and price per bedroom (+1 keeps studios from dividing by zero)
            frame["total_rooms"] = frame["bathrooms"] + frame["bedrooms"]
            frame["price_bed"] = frame["price"] / (frame["bedrooms"] + 1)

            # creation timestamp and its month / day / hour parts
            frame["created"] = pd.to_datetime(frame.created)
            stamp = frame.created.dt
            frame["created_month"] = stamp.month
            frame["created_day"] = stamp.day
            frame["created_hour"] = stamp.hour
            frame["total_days"] = ((frame["created_month"] - 4.0) * 30
                                   + frame["created_day"]
                                   + frame["created_hour"] / 25.0)

            # description statistics: word count, capital-letter share, paragraph breaks
            text = frame['description']
            frame["num_description_words"] = text.apply(lambda x: len(x.split(" ")))
            frame['num_cap_share'] = text.apply(RentalListing.cap_share)
            frame['no_lines_desc'] = text.apply(lambda x: x.count('<br /><br />'))

            # distances from Central Park and from the city centre
            coords = frame[['latitude', 'longitude']].copy()
            frame['distance_park'] = coords.apply(RentalListing.distance_park, axis = 1)
            frame['distance_center'] = coords.apply(RentalListing.distance_city, axis = 1)

            # occurrences of each manager / street address / building inside this frame
            for source, target in frequency_features:
                occurrences = frame[source].value_counts()
                frame[target] = [occurrences[key] for key in frame[source]]

            # copy of the price, adjusted later by addPriceVariance
            frame["price_var"] = frame.price

            frame["price_latitude"] = frame["price"] / (frame["latitude"] + 1.0)
            frame["diff_rank"] = frame["total_days"] / (frame["listing_id"] + 1)

            return frame[X_vars]

        self.train_data = derive(self.train_data)
        self.test_data = derive(self.test_data)
        
    def addManagerSkill(self):
        # computing manager skill 
        temp = pd.concat([self.train_data["manager_id"], pd.get_dummies(self.train_labels)], axis = 1).groupby("manager_id").mean()
        temp.rename(columns = {"high": "manager_id_high_frac", "medium": "manager_id_medium_frac", "low": "manager_id_low_frac"}, inplace = True)
        temp['manager_id_count'] = self.train_data.groupby("manager_id").count().iloc[:, 1]
        temp['manager_id_skill'] = temp['manager_id_high_frac'] * 2 + temp['manager_id_medium_frac']
        mean = temp.loc[temp['manager_id_count'] >= 6, 'manager_id_skill'].mean()
        temp.loc[temp['manager_id_count'] < 6, 'manager_id_skill'] = mean
        
        # adding for training data
        self.train_data = pd.merge(left = self.train_data, right = temp, how = 'left', left_on = "manager_id", right_index = True)
        self.train_data["manager_id_skill"].fillna(mean, inplace = True)
        
        # adding for testing data
        self.test_data = pd.merge(left = self.test_data, right = temp, how = 'left', left_on = "manager_id", right_index = True)
        self.test_data["manager_id_skill"].fillna(mean, inplace = True)
        
        # dropping un-necessary columns
        self.train_data.drop(columns = ['manager_id', 'manager_id_count', 'manager_id_high_frac', 
                                'manager_id_medium_frac', 'manager_id_low_frac'], inplace = True)
        self.test_data.drop(columns = ['manager_id', 'manager_id_count', 'manager_id_high_frac', 
                                'manager_id_medium_frac', 'manager_id_low_frac'], inplace = True)
    
    # preprocessing for feature column
    def cleanupListingFeatures(x):
        x = x.replace('dryer_in_building', 'dryer')
        x = x.replace('dryer_in_unit', 'dryer')
        x = x.replace('_dryer', 'dryer')
        
        x = x.replace('elevator_bldg_', 'elevator')
        x = x.replace('_elev', 'elevator')
        
        x = x.replace('gym_in_building', 'gym')
        x = x.replace('fitness_center', 'gym')
        x = x.replace('fitness', 'gym')
        x = x.replace('health_club', 'gym')
        
        x = x.replace('hardwood_floors', 'hfloors')
        x = x.replace('hardwood', 'hfloors')
        
        x = x.replace('high_ceilings', 'high_ceiling')
        
        x = x.replace('in_kitchen_', 'in_kitchen')
        
        x = x.replace('site_laundry', 'laundry')
        x = x.replace('laundry_in_building', 'laundry')
        x = x.replace('laundry_in_unit', 'laundry')
        x = x.replace('laundry_on_floor', 'laundry')
        x = x.replace('laundry_room', 'laundry')
        x = x.replace('_lndry_bldg_', 'laundry')
        x = x.replace('lndry_bldg_', 'laundry')
        x = x.replace('laundry_', 'laundry')
        
        x = x.replace('fireplaces', 'fireplace')
        
        x = x.replace('high_speed_internet', 'internet')
        x = x.replace('wifi_access', 'internet')
        x = x.replace('wifi', 'internet')
        x = x.replace('speed_internet', 'internet')
        
        x = x.replace('high_rise', 'hrise')
        x = x.replace('highrise', 'hrise')
        x = x.replace('hi_rise', 'hrise')
        x = x.replace('high', 'hrise ')
        
        x = x.replace('_washer', 'washer')
        x = x.replace('_dishwasher', 'washer')
        x = x.replace('dishwasher', 'washer')
        x = x.replace('unit_washer', 'washer')
        x = x.replace('washer_in_unit', 'washer')
        x = x.replace('washer_', 'washer')
        
        x = x.replace('wheelchair_ramp', 'wheelchair_access')
        
        x = x.replace('outdoor_roof_overlookingnew_constructionyork_harbor_and_battery_park', 'roof')
        x = x.replace('roof_deck', 'roof')
        x = x.replace('roofdeck', 'roof')
        x = x.replace('rooftop_deck', 'roof')
        x = x.replace('rooftop_terrace', 'roof')
        x = x.replace('common_terrace', 'roof')
        x = x.replace('common_roof_deck', 'roof')
        x = x.replace('terraces_', 'roof')
        x = x.replace('terrace', 'roof')
        
        x = x.replace('cats_allowed', 'pet_ok')
        x = x.replace('_pets_ok_', 'pet_ok')
        x = x.replace('cats_allowed', 'pet_ok')
        x = x.replace('dogs_allowed', 'pet_ok')
        x = x.replace('pet_friendly', 'pet_ok')
        x = x.replace('pets_allowed', 'pet_ok')
        x = x.replace('pets_on_approval', 'pet_ok')
        x = x.replace('pets', 'pet_ok ')
        
        x = x.replace('newly_renovated', 'new_construction')
        x = x.replace('_new_', 'new_construction')
        
        x = x.replace('central_ac', 'ac')
        x = x.replace('central_air', 'ac')
        x = x.replace('air_conditioning', 'ac')
        x = x.replace('central_a', 'ac')
        
        x = x.replace('parking_available', 'parking')
        x = x.replace('parking_space', 'parking')
        x = x.replace('site_parking_lot', 'parking')
        x = x.replace('site_parking', 'parking')
        x = x.replace('parking_lot', 'parking')
        x = x.replace('common_parking', 'parking')
        
        x = x.replace('7_doorman', 'doorman')
        x = x.replace('time_doorman', 'doorman')
        x = x.replace('ft_doorman', 'doorman')
        
        x = x.replace('7_concierge', 'concierge')
        x = x.replace('concierge_service', 'concierge')
        
        x = x.replace('site_garage', 'garage')
        x = x.replace('full_service_garage', 'garage')
        
        x = x.replace('residents_garden', 'garden')
        x = x.replace('common_garden', 'garden')
    
        x = x.replace('indoor_pool', 'pool')
        x = x.replace('swimming_pool', 'pool')
        x = x.replace('outdoor_pool', 'pool')
        
        x = x.replace('post_war', 'post')
        
        x = x.replace('prewar', 'pre')
        
        x = x.replace('valet_services_including_dry_cleaning', 'valet')
        x = x.replace('valet_services', 'valet')
        x = x.replace('valet_parking', 'valet')
    
        return x

    def addListingFeatures(self):
        """Replace the ``features`` list column with token-count columns.

        Each listing's feature list is lower-cased, the words of every feature are
        joined with underscores, the features are joined with spaces and the
        result is normalised by ``cleanupListingFeatures``. A CountVectorizer
        (English stop words, 50 most frequent tokens) is fitted on the training
        and test strings together and its counts are appended as one column per
        token. Both frames come back with a fresh 0..n-1 index and without the
        ``features`` column.
        """
        def to_document(tags):
            return " ".join(["_".join(tag.lower().split(" ")) for tag in tags])

        train_docs = self.train_data["features"].apply(to_document).apply(RentalListing.cleanupListingFeatures)
        test_docs = self.test_data["features"].apply(to_document).apply(RentalListing.cleanupListingFeatures)

        # fitting the count vectorizer on train and test together
        feature_transform = CountVectorizer(stop_words = 'english', max_features = 50)
        feature_transform.fit(list(train_docs) + list(test_docs))
        token_columns = sorted(feature_transform.vocabulary_.keys())

        def with_counts(frame, documents):
            # densify the whole count matrix once instead of building one Series per row
            counts = pd.DataFrame(feature_transform.transform(documents).toarray(), columns = token_columns)
            combined = pd.concat([frame.reset_index(), counts.reset_index()], axis = 1)
            # reset_index added an 'index' column on both sides; drop them with the raw list column
            return combined.drop(columns = ['index', 'features'])

        self.train_data = with_counts(self.train_data, train_docs)
        self.test_data = with_counts(self.test_data, test_docs)
    
    @staticmethod
    def cleanupListingDescription(x):
        """Normalise one description: letters only, lower-cased, stemmed, short stems dropped.

        Every character other than a-z, A-Z or a space is replaced by a space,
        the text is lower-cased and split on single spaces, each piece is stemmed
        with a Porter stemmer, and only stems longer than two characters are
        kept. The result is the kept stems joined by single spaces.

        Declared static because it takes no instance and is used as
        ``RentalListing.cleanupListingDescription``; without the decorator
        Python 2 rejects the call as an unbound method.
        """
        stemmer = PorterStemmer()

        # keeping only alphabets and spaces
        letters_only = re.sub('[^a-zA-Z ]', ' ', x).lower()
        stems = [stemmer.stem(word) for word in letters_only.split(" ")]

        # keeping stems that have length greater than 2
        return " ".join([stem.strip() for stem in stems if len(stem) > 2])
        
    def addListingDescription(self):
        """Replace the ``description`` text column with token-count columns.

        Descriptions are normalised with ``cleanupListingDescription``. A
        CountVectorizer (English stop words, 25 most frequent tokens) is fitted
        on the training and test texts together and its counts are appended as
        one column per token. Both frames come back with a fresh 0..n-1 index and
        without the ``description`` column.
        """
        # basic cleanup on the description column
        train_docs = self.train_data.description.apply(RentalListing.cleanupListingDescription)
        test_docs = self.test_data.description.apply(RentalListing.cleanupListingDescription)

        # fit the count vectorizer on train and test together
        desc_transform = CountVectorizer(stop_words = 'english', max_features = 25)
        desc_transform.fit(list(train_docs) + list(test_docs))
        token_columns = sorted(desc_transform.vocabulary_.keys())

        def with_counts(frame, documents):
            # densify the whole count matrix once instead of building one Series per row
            counts = pd.DataFrame(desc_transform.transform(documents).toarray(), columns = token_columns)
            combined = pd.concat([frame.reset_index(), counts.reset_index()], axis = 1)
            # reset_index added an 'index' column on both sides; drop them with the raw text column
            return combined.drop(columns = ['index', 'description'])

        self.train_data = with_counts(self.train_data, train_docs)
        self.test_data = with_counts(self.test_data, test_docs)
        
    def addClusterLabels(self):
        """Add a ``label`` column holding the K-Means location cluster of every listing.

        K-Means (60 clusters, random_state 2301) is fitted on the training
        latitude/longitude only. Training rows receive the fitted labels; test
        rows receive the index of the nearest cluster centre by Euclidean
        distance. A test row whose coordinates are NaN gets label -1.

        The nearest centre is found with one vectorised distance matrix instead
        of a Python loop over every (row, centre) pair.
        """
        train_coords = pd.DataFrame()
        train_coords['lat'] = self.train_data.latitude
        train_coords['long'] = self.train_data.longitude
        kmeans = KMeans(n_clusters = 60, random_state = 2301, n_init = 60, max_iter = 500).fit(train_coords)

        # adding cluster labels for training data
        self.train_data['label'] = kmeans.labels_

        # adding cluster labels for testing data: rows x centres distance matrix
        test_coords = self.test_data[['latitude', 'longitude']].values.astype(float)
        gaps = distance.cdist(test_coords, kmeans.cluster_centers_)
        # argmin returns the first minimum, matching a strict "<" scan over the centres
        nearest = gaps.argmin(axis = 1)
        # NaN coordinates never compare as closer to any centre
        nearest[np.isnan(gaps).all(axis = 1)] = -1
        self.test_data['label'] = nearest
        
    def addPriceVariance(self):
        # variance addition
        temp_label = self.train_data.groupby(['label'])['price'].median()

        # for training data
        for i, mean in zip(range(60), temp_label):
            cluster = np.where(self.train_data['label'] == i)[0]
            self.train_data.price_var[cluster] -= mean

        # for testing data
        for i, mean in zip(range(60), temp_label):
            cluster = np.where(self.test_data['label'] == i)[0]
            self.test_data.price_var[cluster] -= mean
        
    def removeDupNames(self):
        df_cols = self.train_data.columns 
    
        seen = set()
        un_list = []
        for item in df_cols:
            fudge = 1
            newitem = item
            while newitem in seen:
                fudge += 1
                newitem = "{}_{}".format(item, fudge)
            un_list = un_list + [newitem]
            seen.add(newitem)

        # df_cols has unique names
        # assigning them back
        self.train_data.columns = un_list
        self.test_data.columns = un_list
        
    def featureEngineering(self):
        # basic feature engineering
        self.basicFeatureEngineering()
        
        # adding manager skill
        self.addManagerSkill()
        
        # adding feature columns
        self.addListingFeatures()
        
        # adding description columns
        self.addListingDescription()
        
        # adding k means stuff
        self.addClusterLabels()
        
        # add price variance based on the cluster labels
        self.addPriceVariance()
        
        # rename duplicate columns
        self.removeDupNames()
    
    # lightGBM stuff
    def trainLightGBM(self):
        self.__lgb.fit(self.train_data, self.train_labels)
        
    def printTestResultsLightGBM(self):
        """Print accuracy, log loss and macro F-score of the LightGBM model.

        The training report is always printed. The test report is printed only
        when test labels are available; otherwise a notice is printed instead.
        Predictions and class probabilities are stored on the instance.
        """
        def report(heading, truth, predicted, probabilities):
            print (heading)
            print ("Accuracy: " + str(metrics.accuracy_score(truth, predicted) * 100.0))
            print ("Log Loss: " + str(metrics.log_loss(truth, probabilities)))
            print ("F-Score: " + str(metrics.f1_score(truth, predicted, average = "macro") * 100.0))

        classifier = self.__lgb

        # training set
        self.train_labels_pred = classifier.predict(self.train_data)
        self.train_labels_pred_prob = classifier.predict_proba(self.train_data)
        report("Model Report - Training Data", self.train_labels,
               self.train_labels_pred, self.train_labels_pred_prob)

        # nothing to score against without test labels
        if self.test_labels is None:
            print ("\nTest Data - Labels Missing")
            return

        # test set
        self.test_labels_pred = classifier.predict(self.test_data)
        self.test_labels_pred_prob = classifier.predict_proba(self.test_data)
        report("\nModel Report - Test Data", self.test_labels,
               self.test_labels_pred, self.test_labels_pred_prob)
        
    def getSubmissionFileLightGBM(self):
        self.test_labels_pred_prob = self.__lgb.predict_proba(self.test_data)
        y = self.test_labels_pred_prob
        y_df = pd.DataFrame()
        y_df["listing_id"] = self.test_listing_id
        labels2idx = {label: i for i, label in enumerate(self.__lgb.classes_)}
        for label in ["high", "medium", "low"]:
            y_df[label] = y[:, labels2idx[label]]
        y_df.to_csv("submission_light_gbm.csv", index = False)
    
    def getPickleFileLightGBM(self):
        pickle_out = open("model_lgb.pickle","wb")
        pickle.dump(self.__lgb, pickle_out)
        pickle_out.close()
    
    # Gradient Boosting stuff
    def trainGBM(self):
        self.__gbm.fit(self.train_data, self.train_labels)
        
    def printTestResultsGBM(self):
        """Print accuracy, log loss and macro F-score of the gradient boosting model.

        The training report is always printed. The test report is printed only
        when test labels are available; otherwise a notice is printed instead.
        Predictions and class probabilities are stored on the instance.
        """
        def report(heading, truth, predicted, probabilities):
            print (heading)
            print ("Accuracy: " + str(metrics.accuracy_score(truth, predicted) * 100.0))
            print ("Log Loss: " + str(metrics.log_loss(truth, probabilities)))
            print ("F-Score: " + str(metrics.f1_score(truth, predicted, average = "macro") * 100.0))

        classifier = self.__gbm

        # training set
        self.train_labels_pred = classifier.predict(self.train_data)
        self.train_labels_pred_prob = classifier.predict_proba(self.train_data)
        report("Model Report - Training Data", self.train_labels,
               self.train_labels_pred, self.train_labels_pred_prob)

        # nothing to score against without test labels
        if self.test_labels is None:
            print ("\nTest Data - Labels Missing")
            return

        # test set
        self.test_labels_pred = classifier.predict(self.test_data)
        self.test_labels_pred_prob = classifier.predict_proba(self.test_data)
        report("\nModel Report - Test Data", self.test_labels,
               self.test_labels_pred, self.test_labels_pred_prob)
        
    def getSubmissionFileGBM(self):
        self.test_labels_pred_prob = self.__gbm.predict_proba(self.test_data)
        y = self.test_labels_pred_prob
        y_df = pd.DataFrame()
        y_df["listing_id"] = self.test_listing_id
        labels2idx = {label: i for i, label in enumerate(self.__gbm.classes_)}
        for label in ["high", "medium", "low"]:
            y_df[label] = y[:, labels2idx[label]]
        y_df.to_csv("submission_gbm.csv", index = False)
    
    def getPickleFileGBM(self):
        pickle_out = open("model_gbm.pickle","wb")
        pickle.dump(self.__gbm, pickle_out)
        pickle_out.close()
        
    # xgboost stuff
    def trainXGB(self):
        self.__xgb.fit(self.train_data, self.train_labels)
        
    def printTestResultsXGB(self):
        """Print accuracy, log loss and macro F-score of the XGBoost model.

        The training report is always printed. The test report is printed only
        when test labels are available; otherwise a notice is printed instead.
        Predictions and class probabilities are stored on the instance.
        """
        def report(heading, truth, predicted, probabilities):
            print (heading)
            print ("Accuracy: " + str(metrics.accuracy_score(truth, predicted) * 100.0))
            print ("Log Loss: " + str(metrics.log_loss(truth, probabilities)))
            print ("F-Score: " + str(metrics.f1_score(truth, predicted, average = "macro") * 100.0))

        classifier = self.__xgb

        # training set
        self.train_labels_pred = classifier.predict(self.train_data)
        self.train_labels_pred_prob = classifier.predict_proba(self.train_data)
        report("Model Report - Training Data", self.train_labels,
               self.train_labels_pred, self.train_labels_pred_prob)

        # nothing to score against without test labels
        if self.test_labels is None:
            print ("\nTest Data - Labels Missing")
            return

        # test set
        self.test_labels_pred = classifier.predict(self.test_data)
        self.test_labels_pred_prob = classifier.predict_proba(self.test_data)
        report("\nModel Report - Test Data", self.test_labels,
               self.test_labels_pred, self.test_labels_pred_prob)
        
    def getSubmissionFileXGB(self):
        self.test_labels_pred_prob = self.__xgb.predict_proba(self.test_data)
        y = self.test_labels_pred_prob
        y_df = pd.DataFrame()
        y_df["listing_id"] = self.test_listing_id
        labels2idx = {label: i for i, label in enumerate(self.__xgb.classes_)}
        for label in ["high", "medium", "low"]:
            y_df[label] = y[:, labels2idx[label]]
        y_df.to_csv("submission_xgb.csv", index = False)
    
    def getPickleFileXGB(self):
        pickle_out = open("model_xgb.pickle","wb")
        pickle.dump(self.__xgb, pickle_out)
        pickle_out.close()
        
    # random forest stuff
    def trainRFC(self):
        self.__rfc.fit(self.train_data, self.train_labels)
        
    def printTestResultsRFC(self):
        """Print accuracy, log loss and macro F-score of the random forest model.

        The training report is always printed. The test report is printed only
        when test labels are available; otherwise a notice is printed instead.
        Predictions and class probabilities are stored on the instance.
        """
        def report(heading, truth, predicted, probabilities):
            print (heading)
            print ("Accuracy: " + str(metrics.accuracy_score(truth, predicted) * 100.0))
            print ("Log Loss: " + str(metrics.log_loss(truth, probabilities)))
            print ("F-Score: " + str(metrics.f1_score(truth, predicted, average = "macro") * 100.0))

        classifier = self.__rfc

        # training set
        self.train_labels_pred = classifier.predict(self.train_data)
        self.train_labels_pred_prob = classifier.predict_proba(self.train_data)
        report("Model Report - Training Data", self.train_labels,
               self.train_labels_pred, self.train_labels_pred_prob)

        # nothing to score against without test labels
        if self.test_labels is None:
            print ("\nTest Data - Labels Missing")
            return

        # test set
        self.test_labels_pred = classifier.predict(self.test_data)
        self.test_labels_pred_prob = classifier.predict_proba(self.test_data)
        report("\nModel Report - Test Data", self.test_labels,
               self.test_labels_pred, self.test_labels_pred_prob)
        
    def getSubmissionFileRFC(self):
        self.test_labels_pred_prob = self.__rfc.predict_proba(self.test_data)
        y = self.test_labels_pred_prob
        y_df = pd.DataFrame()
        y_df["listing_id"] = self.test_listing_id
        labels2idx = {label: i for i, label in enumerate(self.__rfc.classes_)}
        for label in ["high", "medium", "low"]:
            y_df[label] = y[:, labels2idx[label]]
        y_df.to_csv("submission_rfc.csv", index = False)
    
    def getPickleFileRFC(self):
        pickle_out = open("model_rfc.pickle","wb")
        pickle.dump(self.__rfc, pickle_out)
        pickle_out.close()
        
    def print_test(self):
        print (self.train_data.columns)
        print ("------XXXXXXX------")
        print (self.test_data.columns)

In [4]:
# input files, given relative to the notebook's working directory
train_data_file = "../input/train.json"
test_data_file = "../input/test.json"
# uncomment the next line to score on a 30% stratified hold-out of the training file instead
# test_data_file = None

# load the data, then run every feature-engineering step
model = RentalListing(train_data_file, test_data_file)
model.read_data()
model.featureEngineering()

TypeError: unbound method cap_share() must be called with RentalListing instance as first argument (got unicode instance instead)

In [None]:
# model 1 - LightGBM: fit, print the metric report,
# then write submission_light_gbm.csv and model_lgb.pickle
model.trainLightGBM()
model.printTestResultsLightGBM()
model.getSubmissionFileLightGBM()
model.getPickleFileLightGBM()

In [None]:
# model 2 - Gradient Boosting Classifier: fit, print the metric report,
# then write submission_gbm.csv and model_gbm.pickle
model.trainGBM()
model.printTestResultsGBM()
model.getSubmissionFileGBM()
model.getPickleFileGBM()

In [None]:
# model 3 - XGBoost: fit, print the metric report,
# then write submission_xgb.csv and model_xgb.pickle
model.trainXGB()
model.printTestResultsXGB()
model.getSubmissionFileXGB()
model.getPickleFileXGB()

In [None]:
# model 4 - random forest: fit, print the metric report,
# then write submission_rfc.csv and model_rfc.pickle
model.trainRFC()
model.printTestResultsRFC()
model.getSubmissionFileRFC()
model.getPickleFileRFC()