In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import time
import matplotlib
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer

import warnings
# Silence DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Seed passed as `random_state` to every train_test_split call in this notebook.
RANDOM_STATE = 1992

In [2]:
# Read the train and test tables from data/ using identical CSV options.
csv_options = {'encoding': 'utf-8', 'sep': ','}
train = pd.read_csv("data/train_ft_sel_30.csv", **csv_options)
test = pd.read_csv("data/test_ft_sel_30.csv", **csv_options)

In [3]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,OverallQual,floor,LotArea,GarageArea,BsmtFinSF1,1stFlrSF,TotalBsmtSF,GrLivArea,OpenPorchSF,BsmtUnfSF,...,2ndFlrSF,SaleCondition,kn_128,WoodDeckSF,MSZoning,Quality,ScreenPorch,kn_32,Id,SalePrice
0,7,0.368719,-0.205996,0.352525,0.579345,-0.793908,-0.457576,0.372562,0.220747,-0.943764,...,1.162628,4,34,-0.752932,3,0.0,-0.271106,5,1,5.319106
1,6,-0.533735,-0.090876,-0.058502,1.176868,0.260994,0.469865,-0.479767,-0.702973,-0.640635,...,-0.794728,4,102,1.623585,3,0.0,-0.271106,28,2,5.258877
2,7,0.562513,0.074297,0.63277,0.096054,-0.627618,-0.311378,0.517154,-0.066969,-0.301312,...,1.190131,4,34,-0.752932,3,0.0,-0.271106,5,3,5.349278
3,7,0.416924,-0.095881,0.791576,-0.497076,-0.521089,-0.68601,0.38588,-0.17297,-0.061524,...,0.938013,0,20,-0.752932,3,0.1008,-0.271106,24,4,5.146128
4,8,1.575728,0.375612,1.697704,0.467309,-0.043005,0.202598,1.300993,0.569034,-0.174632,...,1.618733,4,85,0.778247,3,0.0,-0.271106,1,5,5.39794


In [4]:
# Keep every column of the test table except 'Id'; these names select the
# model inputs when splitting the data and when predicting.
not_id = test.columns != 'Id'
column_list = test.columns[not_id]
column_list

Index(['OverallQual', 'floor', 'LotArea', 'GarageArea', 'BsmtFinSF1',
       '1stFlrSF', 'TotalBsmtSF', 'GrLivArea', 'OpenPorchSF', 'BsmtUnfSF',
       'YearBuilt', 'LtArea-Frontage', 'OverallCond', 'FullBath',
       'LotFrontage', 'GarageYrBlt', 'kn_256', 'YearRemodAdd', 'kn_512',
       'MasVnrArea', 'MoSold', 'Neighborhood', '2ndFlrSF', 'SaleCondition',
       'kn_128', 'WoodDeckSF', 'MSZoning', 'Quality', 'ScreenPorch', 'kn_32'],
      dtype='object')

In [5]:
# Extremes of the SalePrice column; they size the class bins built later.
sale_price = train['SalePrice']
min_sp, max_sp = sale_price.min(), sale_price.max()

In [6]:
# Full spread of SalePrice; divided by the class count to get the bin width.
diff = max_sp-min_sp

In [7]:
# Scratch check: min_sp plus eight steps of width diff/8, which is
# algebraically min_sp + diff (up to floating-point rounding).
diff/8*8+min_sp

5.8779469516291885

In [8]:
# Display the smallest SalePrice value in the training table.
min_sp

4.54282542695918

In [9]:
# Target columns to create and the number of equal-width classes for each.
lista = ['class_4', 'class_8', 'class_16', 'class_32']
clases = [4, 8, 16, 32]

# Discretise SalePrice into m classes of width diff/m, labelled 0..m-1, and
# store each result as a new column of `train`.
for column_name, m in zip(lista, clases):
    step = diff/m
    labeles = list(range(m))
    # Only the m-1 interior edges come from the data. The outer edges are
    # open-ended instead of the former hard-coded 0 and 10, so a value outside
    # that window is assigned to the first/last class rather than becoming NaN.
    # For prices inside (0, 10] the resulting labels are unchanged.
    ranges = [-np.inf] + [min_sp+(step*i) for i in range(1, m)] + [np.inf]
    train[column_name] = pd.cut(train['SalePrice'], ranges, labels=labeles)

In [10]:
# Preview the class columns named in `lista`.
train.loc[:,lista].head()

Unnamed: 0,class_4,class_8,class_16,class_32
0,2,4,9,18
1,2,4,8,17
2,2,4,9,19
3,1,3,7,14
4,2,5,10,20


In [16]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

In [12]:
# 70/30 split of the feature columns against the 4-class target.
split_class_4 = train_test_split(
    train[column_list],
    list(train['class_4']),
    test_size=0.3,
    random_state=RANDOM_STATE,
)
X_train, X_test, y_train, y_test = split_class_4

In [24]:
# Hyper-parameter grid searched for every target; it does not depend on the
# target column, so it is built once before the loop.
param_xgb = {
    'objective': ['multi:softmax'],
    "learning_rate": [0.1, 0.3],
    "max_depth": [5, 8, 10],
    "n_estimators": [300, 400, 600],
    'subsample': [0.6, 0.8, 0.5],
    "colsample_bytree": [0.8],
    "colsample_bylevel": [0.8],
}

# For each class column: split, grid-search an XGBClassifier with 5-fold CV,
# report micro-F1 on the 30% hold-out split, then predict that class for `test`.
for num in lista:
    X_train, X_test, y_train, y_test = train_test_split(
        train[column_list],
        list(train[num]),
        test_size=0.3,
        random_state=RANDOM_STATE,
    )

    estimator_xgb = XGBClassifier()
    xgboost_model = GridSearchCV(
        estimator=estimator_xgb,
        param_grid=param_xgb,
        n_jobs=-1,
        cv=5,
        verbose=0,
    )
    xgboost_model.fit(X_train, y_train)

    # Reuse the refitted best model for both the hold-out score and `test`.
    best_xgb = xgboost_model.best_estimator_
    preds = best_xgb.predict(X_test)
    f1s = f1_score(y_test, preds, average='micro')
    print('XGBoost {} f1_score in train data:'.format(num),f1s)

    # Store the predicted class as a new column of the test table.
    test[num] = best_xgb.predict(test[column_list])

XGBoost class_4 f1_score in train data: 0.8646788990825689
XGBoost class_8 f1_score in train data: 0.7155963302752294




XGBoost class_16 f1_score in train data: 0.48394495412844035




XGBoost class_32 f1_score in train data: 0.25688073394495414


In [21]:
# Micro-averaged F1 for whichever `y_test` / `preds` were assigned last.
# NOTE(review): both names are overwritten on every iteration of the
# `for num in lista` loop, so this value depends on cell execution order --
# confirm which target it is meant to report.
f1s = f1_score(y_test, preds, average='micro')
print('XGBoost f1_score:',f1s)

XGBoost f1_score: 0.8646788990825689


In [29]:
# Preview the feature columns of the test table.
test.loc[:,column_list].head()

Unnamed: 0,OverallQual,floor,LotArea,GarageArea,BsmtFinSF1,1stFlrSF,TotalBsmtSF,GrLivArea,OpenPorchSF,BsmtUnfSF,...,MoSold,Neighborhood,2ndFlrSF,SaleCondition,kn_128,WoodDeckSF,MSZoning,Quality,ScreenPorch,kn_32
0,5,-1.429815,0.363929,1.185921,0.063428,-0.654561,-0.370716,-1.215588,-0.701628,-0.650461,...,-0.038281,12,-0.775254,4,89,0.366678,2,0.20736,1.81896,2
1,6,-0.341955,0.897861,-0.741235,1.063511,0.433298,0.63923,-0.323539,-0.178826,-0.339225,...,-0.038281,12,-0.775254,4,72,2.347867,3,0.165888,-0.301543,2
2,5,0.317779,0.809646,0.042537,0.773377,-0.574165,-0.266784,0.294508,-0.207871,-0.954831,...,-1.140614,8,0.891944,4,90,0.930495,3,0.072,-0.301543,5
3,6,0.258053,0.032064,-0.012788,0.357958,-0.57919,-0.271303,0.243004,-0.178826,-0.526882,...,-0.038281,8,0.837243,4,90,2.089451,3,0.0,-0.301543,5
4,8,-0.465062,-0.971808,0.153187,-0.38716,0.310192,0.52852,-0.424487,0.489198,1.059048,...,-1.875504,22,-0.775254,4,29,-0.729632,3,0.0,2.24306,3
