In [1]:
import time; start_time = time.time()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, preprocessing
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import log_loss
from sklearn import pipeline
import pandas as pd
import numpy as np
from nltk.stem.porter import *
stemmer = PorterStemmer()
#from bs4 import BeautifulSoup
import random; random.seed(7)
import xgboost as xgb
import datetime as dt

# Load the training listings; the target column and the row count are kept so
# the combined frame built later can be split back into train and test.
# The file handles are opened in `with` blocks so they are closed as soon as
# parsing finishes instead of being left to the garbage collector.
with open("../input/train.json", "r") as train_file:
    train = pd.read_json(train_file)
y = train.interest_level.values
n = len(train)

# Load the test listings; listing_id is kept for the submission file.
with open("../input/test.json", "r") as test_file:
    test = pd.read_json(test_file)
listing_id = test.listing_id.values

# Columns used as model inputs: everything except the id, the target and the
# street address.
excluded = ['listing_id', 'interest_level', 'street_address']
col = [name for name in train.columns if name not in excluded]
print(col)
print(len(train),len(test))

def str_stem(s):
    """Return *s* lower-cased with every whitespace-separated token stemmed.

    Tokens are stemmed with the module-level ``stemmer`` and re-joined with
    single spaces. Anything that is not a ``str`` yields an empty string.

    The previous version called ``b.get_text(...)`` on a name that was only
    assigned in a commented-out BeautifulSoup line, so every string input
    raised ``NameError``; that call has been removed.
    """
    if not isinstance(s, str):
        return ""
    # split() with no argument collapses any run of whitespace and drops
    # leading/trailing blanks, so no empty tokens reach the stemmer.
    tokens = s.lower().split()
    return " ".join(stemmer.stem(token) for token in tokens)

class cust_regression_vals(BaseEstimator, TransformerMixin):
    """Transformer that passes through every column except the two text ones.

    ``fit`` learns nothing; ``transform`` drops the ``xdescription`` and
    ``ydescription`` columns from the incoming frame and returns what is left
    as a plain array (``DataFrame.values``).
    """

    def fit(self, x, y=None):
        # Stateless: nothing to learn from the data.
        return self

    def transform(self, df):
        text_columns = ['xdescription', 'ydescription']
        remaining = df.drop(text_columns, axis=1)
        return remaining.values

class cust_txt_col(BaseEstimator, TransformerMixin):
    """Transformer that selects one column and converts each value to ``str``.

    ``key`` is the column label looked up in the object passed to
    ``transform``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        # Stateless: nothing to learn from the data.
        return self

    def transform(self, data_dict):
        selected = data_dict[self.key]
        return selected.apply(str)
    
# Stack train and test rows so every engineered feature below is computed
# once and identically for both sets; the first n rows are the training rows.
df_all = pd.concat((train[col], test[col]), axis=0, ignore_index=True)
# Rebind the raw frames so their memory can be reclaimed.
train = []
test = []

# Replace the photo list with the number of photos.
df_all['photos'] = df_all.photos.apply(len)

# Price per bedroom / per bathroom.
# NOTE(review): rows with 0 bedrooms or bathrooms presumably produce inf here;
# confirm the downstream model tolerates that.
df_all["price_be"] = df_all["price"]/df_all["bedrooms"]
df_all["price_ba"] = df_all["price"]/df_all["bathrooms"]

# Calendar components of the creation timestamp.
df_all["created"] = pd.to_datetime(df_all["created"])
created_parts = df_all["created"].dt
df_all["created_year"] = created_parts.year
df_all["created_month"] = created_parts.month
df_all["created_day"] = created_parts.day
df_all['created_hour'] = created_parts.hour
df_all['created_weekday'] = created_parts.weekday
# NOTE(review): `.dt.week` is deprecated in newer pandas releases; switch to
# `.dt.isocalendar().week` once the minimum supported pandas allows it.
df_all['created_week'] = created_parts.week
df_all['created_quarter'] = created_parts.quarter
# Weekend means Saturday (5) or Sunday (6). The previous expression combined
# `== 5` and `== 6` with `&`, which can never both hold, so the flag was
# always False.
df_all['created_weekend'] = df_all['created_weekday'].isin([5, 6])
df_all['created_wd'] = ~df_all['created_weekend']


def _to_serial_days(ts):
    """Return *ts* as fractional days elapsed since 1899-12-30."""
    # presumably chosen to match spreadsheet-style serial dates; verify
    delta = ts - dt.datetime(1899, 12, 30)
    return float(delta.days) + (float(delta.seconds) / 86400)


df_all['created'] = df_all['created'].map(_to_serial_days)

# Coordinates rounded to 5, 4, 3 and 2 decimals (x = latitude, y = longitude).
for digits in (5, 4, 3, 2):
    df_all['x%d' % digits] = df_all['latitude'].map(lambda v, d=digits: round(v, d))
    df_all['y%d' % digits] = df_all['longitude'].map(lambda v, d=digits: round(v, d))

# One indicator column per distinct lower-cased feature string.
# NOTE(review): this creates one dense column per distinct feature across all
# rows and is a likely contributor to the MemoryError seen in the run output;
# consider limiting it to frequent features.
dummies = df_all['features'].str.join(sep=',').str.lower().str.get_dummies(sep=',')
df_all = pd.concat([df_all, dummies], axis=1)
# Rebind so the standalone indicator frame can be reclaimed.
dummies = []
# Replace the feature list with the number of features.
df_all['features'] = df_all.features.apply(len)

def _word_count(text):
    """Number of single-space-separated pieces in the stripped text."""
    return len(text.strip().split(' '))


def _char_count(text):
    """Length of the text after stripping surrounding whitespace."""
    return len(text.strip())


# Columns that get word/length statistics and are then label-encoded in place.
cat = ['building_id',  'description', 'display_address', 'manager_id']
lbl = preprocessing.LabelEncoder()
for c in cat:
    raw = df_all[c]
    if c == 'description':
        # Keep two text copies for the TF-IDF branches: stemmed and raw.
        df_all['x'+c] = raw.map(str_stem)
        df_all['y'+c] = raw.values
    df_all['words_of_'+c] = raw.map(_word_count)
    df_all['len_of_'+c] = raw.map(_char_count)
    # The encoder is refitted for each column; only the last fit survives.
    df_all[c] = lbl.fit_transform(list(raw.values))
    print(c, len(lbl.classes_))

# Split the combined frame back into its train (first n rows) and test parts.
train, test = df_all.iloc[:n], df_all.iloc[n:]
#df_all = []

# Each text branch gets its OWN vectorizer and SVD instance. Previously both
# branches shared a single `tfidf` and a single `tsvd` object, so whenever the
# union is fitted without separate worker processes the second branch's fit
# overwrites the state the first branch later transforms with.
tfidf = TfidfVectorizer(stop_words ='english', max_df=0.9)
tsvd = TruncatedSVD(n_components=25, random_state = 7)
tfidf_raw = TfidfVectorizer(stop_words ='english', max_df=0.9)
tsvd_raw = TruncatedSVD(n_components=25, random_state = 7)

# Feature union of three branches:
#   cst  - every column except the two text columns, as-is
#   txt1 - stemmed description -> TF-IDF -> 25 SVD components
#   txt2 - raw description     -> TF-IDF -> 25 SVD components
clf = pipeline.Pipeline([
        ('union', FeatureUnion(
                    transformer_list = [
                        ('cst',  cust_regression_vals()),
                        ('txt1', pipeline.Pipeline([('s1', cust_txt_col(key='xdescription')), ('tfidf1', tfidf), ('tsvd1', tsvd)])),
                        ('txt2', pipeline.Pipeline([('s2', cust_txt_col(key='ydescription')), ('tfidf2', tfidf_raw), ('tsvd2', tsvd_raw)]))
                        ],
                    transformer_weights = {
                        'cst': 1.0,
                        'txt1': 1.0,
                        'txt2': 1.0
                        },
                # NOTE(review): n_jobs=-1 runs the branches in parallel workers;
                # presumably each receives its own copy of the input frame —
                # confirm memory headroom.
                n_jobs = -1
                ))])

# Encode the target labels as integers; `lbl` is refitted on the target here,
# so it can later map class indices back to label names.
y_val = lbl.fit_transform(y)

# Fit the feature pipeline on the training rows and wrap the numeric result,
# together with the encoded labels, in an xgboost DMatrix.
xtrain = clf.fit_transform(train)
xtrain = pd.DataFrame(xtrain).apply(pd.to_numeric)
xtrain = xgb.DMatrix(xtrain.values, y_val)

# Apply the already-fitted pipeline to the test rows (no labels).
xtest = clf.transform(test)
xtest = pd.DataFrame(xtest).apply(pd.to_numeric)
xtest = xgb.DMatrix(xtest.values)

# xgboost training parameters for a 3-class probability model scored with
# multi-class log loss.
param = {
    'objective': 'multi:softprob',
    'eta': 0.1,
    # 'max_depth': 4,  (disabled; no explicit depth is set)
    'silent': True,
    'num_class': 3,
    'eval_metric': "mlogloss",
    'min_child_weight': 1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'seed': 7,
}
# The same settings as a list of (name, value) pairs.
plst = list(param.items())
# Cross-validation fold count and upper bound on boosting rounds.
nfolds, nrounds = 5, 100

# Cross-validate to choose the number of boosting rounds. The result has one
# row per boosting round.
cv_history = xgb.cv(plst, xtrain, nrounds, nfolds, early_stopping_rounds=20, verbose_eval=25)
# argmin gives the 0-based row of the lowest mean test log loss; row i is the
# score after i + 1 rounds, so add 1 to get a round count. Without the + 1 the
# final model was trained one round short (zero rounds if the best row was the
# first). `.values` keeps the argmin positional.
best_rounds = int(np.argmin(cv_history['test-mlogloss-mean'].values)) + 1

# Retrain on all training rows with the selected number of rounds.
model = xgb.train(plst, xtrain, best_rounds)
# Log loss on the training data itself (not a held-out estimate).
print(log_loss(y_val, model.predict(xtrain)))

# Predict class probabilities for the test rows and write the submission,
# naming each probability column with its original label.
preds = model.predict(xtest)
out_df = pd.DataFrame(preds)
out_df.columns = lbl.inverse_transform(out_df.columns)
out_df["listing_id"] = listing_id
out_df.to_csv("z09submission01.csv", index=False)
print('Done...',(time.time()-start_time)/60)

[u'bathrooms', u'bedrooms', u'building_id', u'created', u'description', u'display_address', u'features', u'latitude', u'longitude', u'manager_id', u'photos', u'price']
(49352, 74659)
('building_id', 11635)


MemoryError: 