In [2]:
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import CountVectorizer

train_df = pd.read_json("../input/train.json")
test_df = pd.read_json("../input/test.json")

def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=321, num_rounds=2000):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.02
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20,verbose_eval=50)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

import lightgbm as lgbm
def runlgbm(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=500000,e_stoping_r=50): 
    t4_params = {
        'boosting_type': 'gbdt', 'objective': 'multiclass', 'nthread': -1, 'silent': True,
        'num_leaves': 6, 'learning_rate': 0.03, 'max_depth': 6,
        'max_bin': 255, 'subsample_for_bin': 50000,
        'subsample': 0.6, 'subsample_freq': 1, 'colsample_bytree': 0.6, 'reg_alpha':1, 'reg_lambda':0,
        'min_split_gain': 0.5, 'min_child_weight': 1, 'min_child_samples': 10, 'scale_pos_weight': 1}

    # they can be used directly to build a LGBMClassifier (which is wrapped in a sklearn fashion)
    model = lgbm.sklearn.LGBMClassifier(n_estimators=num_rounds, seed=0, **t4_params)
    
    if test_y is not None:
        model.fit(train_X,train_y,eval_set=[(train_X,train_y),(test_X, test_y)],verbose=100,early_stopping_rounds=e_stoping_r)
    else:
        model.fit(train_X,train_y)
    pred_test_y = model.predict_proba(test_X)
    return pred_test_y, model


test_df["bathrooms"].loc[19671] = 1.5
test_df["bathrooms"].loc[22977] = 2.0
test_df["bathrooms"].loc[63719] = 2.0
train_df["price"] = train_df["price"].clip(upper=13000)

train_df["logprice"] = np.log(train_df["price"])
test_df["logprice"] = np.log(test_df["price"])

train_df["price_t"] =train_df["price"]/train_df["bedrooms"]
test_df["price_t"] = test_df["price"]/test_df["bedrooms"] 

train_df["room_sum"] = train_df["bedrooms"]+train_df["bathrooms"] 
test_df["room_sum"] = test_df["bedrooms"]+test_df["bathrooms"] 

train_df['price_per_room'] = train_df['price']/train_df['room_sum']
test_df['price_per_room'] = test_df['price']/test_df['room_sum']

train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])
train_df["created_year"] = train_df["created"].dt.year
test_df["created_year"] = test_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
test_df["created_month"] = test_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour


train_df["pos"] = train_df.longitude.round(3).astype(str) + '_' + train_df.latitude.round(3).astype(str)
test_df["pos"] = test_df.longitude.round(3).astype(str) + '_' + test_df.latitude.round(3).astype(str)

vals = train_df['pos'].value_counts()
dvals = vals.to_dict()
train_df["density"] = train_df['pos'].apply(lambda x: dvals.get(x, vals.min()))
test_df["density"] = test_df['pos'].apply(lambda x: dvals.get(x, vals.min()))

features_to_use=["bathrooms", "bedrooms", "latitude", "longitude", "price","price_t","price_per_room", "logprice", "density",
"num_photos", "num_features", "num_description_words","listing_id", "created_year", "created_month", "created_day", "created_hour"]

index=list(range(train_df.shape[0]))
random.shuffle(index)
a=[np.nan]*len(train_df)
b=[np.nan]*len(train_df)
c=[np.nan]*len(train_df)

for i in range(5):
    building_level={}
    for j in train_df['manager_id'].values:
        building_level[j]=[0,0,0]
    
    test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
    train_index=list(set(index).difference(test_index))
    
    for j in train_index:
        temp=train_df.iloc[j]
        if temp['interest_level']=='low':
            building_level[temp['manager_id']][0]+=1
        if temp['interest_level']=='medium':
            building_level[temp['manager_id']][1]+=1
        if temp['interest_level']=='high':
            building_level[temp['manager_id']][2]+=1
            
    for j in test_index:
        temp=train_df.iloc[j]
        if sum(building_level[temp['manager_id']])!=0:
            a[j]=building_level[temp['manager_id']][0]*1.0/sum(building_level[temp['manager_id']])
            b[j]=building_level[temp['manager_id']][1]*1.0/sum(building_level[temp['manager_id']])
            c[j]=building_level[temp['manager_id']][2]*1.0/sum(building_level[temp['manager_id']])
            
train_df['manager_level_low']=a
train_df['manager_level_medium']=b
train_df['manager_level_high']=c

a=[]
b=[]
c=[]
building_level={}
for j in train_df['manager_id'].values:
    building_level[j]=[0,0,0]

for j in range(train_df.shape[0]):
    temp=train_df.iloc[j]
    if temp['interest_level']=='low':
        building_level[temp['manager_id']][0]+=1
    if temp['interest_level']=='medium':
        building_level[temp['manager_id']][1]+=1
    if temp['interest_level']=='high':
        building_level[temp['manager_id']][2]+=1

for i in test_df['manager_id'].values:
    if i not in building_level.keys():
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        a.append(building_level[i][0]*1.0/sum(building_level[i]))
        b.append(building_level[i][1]*1.0/sum(building_level[i]))
        c.append(building_level[i][2]*1.0/sum(building_level[i]))
test_df['manager_level_low']=a
test_df['manager_level_medium']=b
test_df['manager_level_high']=c

features_to_use.append('manager_level_low') 
features_to_use.append('manager_level_medium') 
features_to_use.append('manager_level_high')

categorical = ["display_address", "manager_id", "building_id"]
for f in categorical:
        if train_df[f].dtype=='object':
            lbl = LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            features_to_use.append(f)

train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))

tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

In [3]:
import math
def cart2rho(x, y):
    rho = np.sqrt(x**2 + y**2)
    return rho


def cart2phi(x, y):
    phi = np.arctan2(y, x)
    return phi


def rotation_x(row, alpha):
    x = row['latitude']
    y = row['longitude']
    return x*math.cos(alpha) + y*math.sin(alpha)


def rotation_y(row, alpha):
    x = row['latitude']
    y = row['longitude']
    return y*math.cos(alpha) - x*math.sin(alpha)


def add_rotation(degrees, df):
    namex = "rot" + str(degrees) + "_X"
    namey = "rot" + str(degrees) + "_Y"

    df['num_' + namex] = df.apply(lambda row: rotation_x(row, math.pi/(180/degrees)), axis=1)
    df['num_' + namey] = df.apply(lambda row: rotation_y(row, math.pi/(180/degrees)), axis=1)

    return df

def operate_on_coordinates(tr_df, te_df):
    for df in [tr_df, te_df]:
        #polar coordinates system
        df["num_rho"] = df.apply(lambda x: cart2rho(x["latitude"] - 40.78222222, x["longitude"]+73.96527777), axis=1)
        df["num_phi"] = df.apply(lambda x: cart2phi(x["latitude"] - 40.78222222, x["longitude"]+73.96527777), axis=1)
        #rotations
        for angle in [15,30,45,60]:
            df = add_rotation(angle, df)

    return tr_df, te_df

allcols=train_df.columns.tolist()
train_df, test_df = operate_on_coordinates(train_df, test_df)

import re

def cap_share(x):
    return sum(1 for c in x if c.isupper())/float(len(x)+1)

for df in [train_df, test_df]:
    # do you think that users might feel annoyed BY A DESCRIPTION THAT IS SHOUTING AT THEM?
    df['num_cap_share'] = df['description'].apply(cap_share)
    
    # how long in lines the desc is?
    df['num_nr_of_lines'] = df['description'].apply(lambda x: x.count('<br /><br />'))
   
    # is the description redacted by the website?        
    df['num_redacted'] = 0
    df['num_redacted'].ix[df['description'].str.contains('website_redacted')] = 1

    
    # can we contact someone via e-mail to ask for the details?
    df['num_email'] = 0
    df['num_email'].ix[df['description'].str.contains('@')] = 1
    
    #and... can we call them?
    
    reg = re.compile(".*?(\(?\d{3}\D{0,3}\d{3}\D{0,3}\d{4}).*?", re.S)
    def try_and_find_nr(description):
        if reg.match(description) is None:
            return 0
        return 1

    df['num_phone_nr'] = df['description'].apply(try_and_find_nr)

gpdand_desnewfeaure=[col for col in train_df.columns if col not in allcols]
features_to_use.extend(gpdand_desnewfeaure)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [4]:
import s2sphere
lat = train_df.latitude.values.tolist()
lon = train_df.longitude.values.tolist()

cellId1 = []
cellId2 = []
cellId3 = []
for i in range(0,len(lat)):
    p1 = s2sphere.LatLng.from_degrees(lat[i], lon[i])
    cell = s2sphere.CellId.from_lat_lng(p1)
    cid = str(cell.id())
    #print(cid)
    ##cid is a 19 digit number so python storing it as Object, not number
    ##So I am converting it into 3 numbers
    cellId1.append(int(cid[:6]))
    cellId2.append(int(cid[6:12]))
    cellId3.append(int(cid[12:19]))
    

se = pd.Series(cellId1)
train_df['cellId1'] = se.values

se = pd.Series(cellId2)
train_df['cellId2'] = se.values

se = pd.Series(cellId3)
train_df['cellId3'] = se.values

lat = test_df.latitude.values.tolist()
lon = test_df.longitude.values.tolist()

cellId1 = []
cellId2 = []
cellId3 = []
for i in range(0,len(lat)):
    p1 = s2sphere.LatLng.from_degrees(lat[i], lon[i])
    cell = s2sphere.CellId.from_lat_lng(p1)
    cid = str(cell.id())
    #print(cid)
    cellId1.append(int(cid[:6]))
    cellId2.append(int(cid[6:12]))
    cellId3.append(int(cid[12:19]))
    

se = pd.Series(cellId1)
test_df['cellId1'] = se.values

se = pd.Series(cellId2)
test_df['cellId2'] = se.values

se = pd.Series(cellId1)
test_df['cellId3'] = se.values
features_to_use.extend(['cellId1','cellId2','cellId3'])

In [5]:
from textblob import TextBlob
train_df['Des_TextBlob']=train_df.description.apply(lambda x: TextBlob(x))
test_df['Des_TextBlob']=test_df.description.apply(lambda x: TextBlob(x))

In [6]:
train_df['des_sentiment']=train_df['Des_TextBlob'].apply(lambda x: x.sentiment.polarity)
test_df['des_sentiment']=test_df['Des_TextBlob'].apply(lambda x: x.sentiment.polarity)

train_df['des_subjectivity']=train_df['Des_TextBlob'].apply(lambda x: x.sentiment.polarity)
test_df['des_subjectivity']=test_df['Des_TextBlob'].apply(lambda x: x.sentiment.polarity)

features_to_use.extend(['des_sentiment','des_subjectivity'])

In [7]:
train_df['noun_phrases_count']=train_df.Des_TextBlob.apply(lambda x: len(x.noun_phrases))
test_df['noun_phrases_count']=test_df.Des_TextBlob.apply(lambda x: len(x.noun_phrases))
features_to_use.append('noun_phrases_count') 

In [8]:
#train_df['num_IMPORTANT']=0
train_df['num_NOTICE']=0
#test_df['num_IMPORTANT']=0
test_df['num_NOTICE']=0

#train_df['num_IMPORTANT'].ix[train_df['description'].str.contains('IMPORTANT')] = 1
test_df['num_NOTICE'].ix[test_df['description'].str.contains('NOTICE')] = 1

#train_df['num_IMPORTANT'].ix[train_df['description'].str.contains('IMPORTANT')] = 1
test_df['num_NOTICE'].ix[test_df['description'].str.contains('NOTICE')] = 1

In [9]:
features_to_use.extend(['num_NOTICE']) 

In [10]:
#magic features
image_date = pd.read_csv("../input/listing_image_time.csv")

# rename columns so you can join tables later on
image_date.columns = ["listing_id", "time_stamp"]

# reassign the only one timestamp from April, all others from Oct/Nov
image_date.loc[80240,"time_stamp"] = 1478129766 

image_date["img_date"]                  = pd.to_datetime(image_date["time_stamp"], unit="s")
image_date["img_days_passed"]           = (image_date["img_date"].max() - image_date["img_date"]).astype("timedelta64[D]").astype(int)
image_date["img_date_month"]            = image_date["img_date"].dt.month
image_date["img_date_week"]             = image_date["img_date"].dt.week
image_date["img_date_day"]              = image_date["img_date"].dt.day
image_date["img_date_dayofweek"]        = image_date["img_date"].dt.dayofweek
image_date["img_date_dayofyear"]        = image_date["img_date"].dt.dayofyear
image_date["img_date_hour"]             = image_date["img_date"].dt.hour
image_date["img_date_monthBeginMidEnd"] = image_date["img_date_day"].apply(lambda x: 1 if x<10 else 2 if x<20 else 3)

train_df = pd.merge(train_df, image_date, on="listing_id", how="left")
test_df = pd.merge(test_df, image_date, on="listing_id", how="left")

In [11]:
megic_features=['img_days_passed','img_date_month','img_date_week','img_date_day','img_date_dayofweek','img_date_dayofyear','img_date_hour','img_date_monthBeginMidEnd']
features_to_use.extend(megic_features)

In [16]:
temp = pd.concat([train_df.manager_id,pd.get_dummies(y_train)], axis = 1).groupby('manager_id').mean()
temp.columns = ['high_frac','low_frac', 'medium_frac']
temp['count'] = train_df.groupby('manager_id').count().iloc[:,1]
# compute skill
temp['manager_skill'] = temp['high_frac']*2 + temp['medium_frac']

# get ixes for unranked managers...
unranked_managers_ixes = temp['count']<20
# ... and ranked ones
ranked_managers_ixes = ~unranked_managers_ixes

# compute mean values from ranked managers and assign them to unranked ones
mean_values = temp.loc[ranked_managers_ixes, ['high_frac','low_frac', 'medium_frac','manager_skill']].mean()
temp.loc[unranked_managers_ixes,['high_frac','low_frac', 'medium_frac','manager_skill']] = mean_values.values

train_df = train_df.merge(temp.reset_index(),how='left', left_on='manager_id', right_on='manager_id')

test_df = test_df.merge(temp.reset_index(),how='left', left_on='manager_id', right_on='manager_id')

features_to_use.extend(['high_frac','low_frac', 'medium_frac','manager_skill'])

In [25]:
savecol=[col for col in train_df.columns if col not in ['Des_TextBlob']]
train_df[savecol].to_pickle("train.pkl")
savecol.remove('interest_level')
test_df[savecol].to_pickle("test.pkl")

In [43]:
features_to_use.extend(['high_frac','low_frac', 'medium_frac','manager_skill'])

In [41]:
features_to_use= [col for col in features_to_use if col not in ['high_frac','low_frac', 'medium_frac','manager_skill']]

In [14]:
test_df['interest_level']='test'

In [15]:
train_test=pd.concat([train_df,test_df],axis=0)

In [16]:
managers_count = train_test['manager_id'].value_counts()

train_test['top_10_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 90)] else 0)
train_test['top_25_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 75)] else 0)
train_test['top_5_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 95)] else 0)
train_test['top_50_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 50)] else 0)
train_test['top_1_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 99)] else 0)
train_test['top_2_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 98)] else 0)
train_test['top_15_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 85)] else 0)
train_test['top_20_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 80)] else 0)
train_test['top_30_manager'] = train_test['manager_id'].apply(lambda x: 1 if x in managers_count.index.values[
    managers_count.values >= np.percentile(managers_count.values, 70)] else 0)

buildings_count = train_test['building_id'].value_counts()

train_test['top_10_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 90)] else 0)
train_test['top_25_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 75)] else 0)
train_test['top_5_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 95)] else 0)
train_test['top_50_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 50)] else 0)
train_test['top_1_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 99)] else 0)
train_test['top_2_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 98)] else 0)
train_test['top_15_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 85)] else 0)
train_test['top_20_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 80)] else 0)
train_test['top_30_building'] = train_test['building_id'].apply(lambda x: 1 if x in buildings_count.index.values[
    buildings_count.values >= np.percentile(buildings_count.values, 70)] else 0)

In [17]:
top_manager_features=['top_10_manager','top_25_manager','top_5_manager','top_50_manager','top_1_manager','top_2_manager','top_15_manager','top_20_manager','top_30_manager']
top_buildings_features=['top_10_building','top_25_building','top_5_building','top_50_building','top_1_building','top_2_building','top_15_building','top_20_building','top_30_building']

In [18]:
features_to_use.extend(top_manager_features)
features_to_use.extend(top_buildings_features)

In [19]:
train_df=train_test[train_test['interest_level']!='test']
test_df=train_test[train_test['interest_level']=='test']

savecol=[col for col in train_df.columns if col not in ['Des_TextBlob']]
train_df[savecol].to_pickle("train.pkl")
savecol.remove('interest_level')
test_df[savecol].to_pickle("test.pkl")

In [41]:
train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

#preds, model = runlgbm(train_X, train_y)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, test_size=0.20, random_state=42)
preds, model = runXGB(X_train, y_train,X_test,y_test,num_rounds=500000)

[0]	train-mlogloss:1.08457	test-mlogloss:1.08492
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[50]	train-mlogloss:0.727437	test-mlogloss:0.741543
[100]	train-mlogloss:0.609018	test-mlogloss:0.633575
[150]	train-mlogloss:0.558608	test-mlogloss:0.591715
[200]	train-mlogloss:0.528126	test-mlogloss:0.56888
[250]	train-mlogloss:0.507305	test-mlogloss:0.555635
[300]	train-mlogloss:0.489207	test-mlogloss:0.546051
[350]	train-mlogloss:0.47378	test-mlogloss:0.538923
[400]	train-mlogloss:0.461052	test-mlogloss:0.533878
[450]	train-mlogloss:0.449469	test-mlogloss:0.529993
[500]	train-mlogloss:0.439263	test-mlogloss:0.52698
[550]	train-mlogloss:0.42956	test-mlogloss:0.524603
[600]	train-mlogloss:0.420904	test-mlogloss:0.522613
[650]	train-mlogloss:0.412925	test-mlogloss:0.521006
[700]	train-mlogloss:0.405077	test-mlogloss:0.519513
[750]	train-mlogloss:0.398104	test-mlogloss:0.518395
[800]	trai

In [42]:
preds, model = runXGB(train_X, train_y,test_X,num_rounds=1339)
#preds=model.predict_proba(test_X)
out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("xgb_startersinglemodel.csv", index=False)

In [43]:
interest_levels = ['low', 'medium', 'high']

tau_train = {
    'low': 0.694683, 
    'medium': 0.227529,
    'high': 0.077788, 
}

tau_test = {
    'low': 0.69195995, 
    'medium': 0.23108864,
    'high': 0.07695141, 
}

def correct(df, train=True, verbose=False):
    if train:
        tau = tau_train
    else:
        tau = tau_test
        
    df_sum = df[interest_levels].sum(axis=1)
    df_correct = df[interest_levels].copy()
    
    if verbose:
        y = df_correct.mean()
        a = [tau[k] / y[k]  for k in interest_levels]
        print( a)
    
    for c in interest_levels:
        df_correct[c] /= df_sum

    for i in range(20):
        for c in interest_levels:
            df_correct[c] *= tau[c] / df_correct[c].mean()

        df_sum = df_correct.sum(axis=1)

        for c in interest_levels:
            df_correct[c] /= df_sum
    
    if verbose:
        y = df_correct.mean()
        a = [tau[k] / y[k]  for k in interest_levels]
        print( a)

    return df_correct

In [44]:
df=correct(out_df)

In [45]:
df.to_csv("better_prior.csv",index=False)

In [33]:
train_stack=pd.concat([train_df[['interest_level']+features_to_use],pd.DataFrame(tr_sparse.todense())],axis=1)
train_stack.to_csv("processed_train.csv",header=False,index=False)
test_stack=pd.concat([test_df[features_to_use],pd.DataFrame(tr_sparse.todense())],axis=1)
test_stack.to_csv("processed_test.csv",header=False,index=False)

In [39]:
train_stack['interest_level']=train_df['interest_level'].apply(lambda x: target_num_map[x])

In [40]:
train_stack.to_csv("processed_train.csv",header=False,index=False)

In [48]:
test_stack.shape,train_stack.shape

((74659, 271), (49352, 272))

In [55]:
stackpred=pd.read_csv("sigma_stack_pred.csv")

In [57]:
stackpred['listing_id']=test_stack.listing_id

In [59]:
stackpred[["listing_id","high", "medium", "low"]].to_csv("preds_stack.csv",index=False)

In [60]:
stackpred.shape

(74659, 4)