In [1]:
import pandas as pd
import numpy as np

In [2]:
column_names = ['TPR','TNR','FPR','FNR','AUC','G-Mean','F1', 'G-Measure']
report = pd.DataFrame(columns = column_names)
columns = ['Año',
 'Kilometros',
 'Combustible',
 'Mano',
 'Consumo',
 'Motor_CC',
 'Potencia',
 'Asientos',
 'Descuento']

In [3]:
import math

def cmdata_report(cm, classifier, data):# add a df as parameter
    total = cm.sum()
    TP = cm[0,0]
    FN = cm[0,1]
    FP = cm[1,0]
    TN = cm[1,1]
    TPR = TP/(TP+FN)
    TNR = TN/(TN+FP)
    FPR = FP/(FP+TN)
    FNR = FN/(FN+TP)
    ratio = (cm[0,0]+cm[1,1])/total
    PPV = TP/(TP+FP)
    AUC = (1 + TPR - FPR)/2
    GMEAN = math.sqrt(TNR*TPR)
    F1 = 2*TP/(2*TP+FP+FN)
    GMEASURE = math.sqrt(PPV*TPR)
    
    s = pd.Series([TPR,TNR,FPR,FNR,AUC,GMEAN,F1,GMEASURE], name=classifier, index=report.columns)
    df = data.append(s)
    return df

In [4]:
def get_classification_report(y_test, y_pred):
    from sklearn import metrics
    report = metrics.classification_report(y_test, y_pred, output_dict=True)
    df_classification_report = pd.DataFrame(report).transpose()
    df_classification_report = df_classification_report.sort_values(by=['f1-score'], ascending=False)
    return df_classification_report

In [5]:
train = pd.read_csv("../csv/train.csv", na_values=["?"])
test = pd.read_csv("../csv/test.csv", na_values=["?"])
print(test.shape)

(1159, 13)


In [6]:
cols = train.columns.tolist()
print(cols)

['id', 'Nombre', 'Ciudad', 'Año', 'Kilometros', 'Combustible', 'Tipo_marchas', 'Mano', 'Consumo', 'Motor_CC', 'Potencia', 'Asientos', 'Descuento', 'Precio_cat']


In [7]:
cols = train.columns.tolist()
for i in cols:
    print(i)
    print(train[i].unique())

id
[1.000e+00 2.000e+00 3.000e+00 ... 4.817e+03 4.818e+03 4.819e+03]
Nombre
['Maruti Swift Dzire ZDI' 'Maruti Wagon R LXI Optional'
 'Mahindra KUV 100 mFALCON D75 K8' ...
 'Mercedes-Benz E-Class 280 Elegance' 'Honda City Corporate Edition'
 'Porsche Panamera Diesel 250hp']
Ciudad
['G' 'I' 'F' 'E' 'H' 'C' 'J' 'L' 'B' 'K' 'D' nan]
Año
[2012. 2016. 2013. 2017. 2009. 2014. 2011. 2015. 2008. 2010. 2018. 2006.
   nan 2019. 2005. 2007. 2003. 2004. 2001. 1998. 2002. 2000. 1999.]
Kilometros
[83000.  4800. 26000. ... 60033. 40158. 65743.]
Combustible
['Diesel' 'Petrol' nan 'CNG' 'LPG' 'Electric']
Tipo_marchas
['Manual' 'Automatic' nan]
Mano
['First' 'Second' 'Third' nan 'Fourth & Above']
Consumo
['23.4 kmpl' '20.51 kmpl' '25.32 kmpl' '18.5 kmpl' '18.7 kmpl' '17.8 kmpl'
 '16.8 kmpl' '18.6 kmpl' '25.2 kmpl' '10.91 kmpl' '12.99 kmpl' '17.5 kmpl'
 '25.8 kmpl' '18.25 kmpl' '10.0 kmpl' '11.5 kmpl' '19.1 kmpl' '14.21 kmpl'
 '14.16 kmpl' '17.43 kmpl' '18.0 kmpl' '15.7 kmpl' '16.0 kmpl'
 '21.43 kmpl' '21

In [8]:
train.head()

Unnamed: 0,id,Nombre,Ciudad,Año,Kilometros,Combustible,Tipo_marchas,Mano,Consumo,Motor_CC,Potencia,Asientos,Descuento,Precio_cat
0,1.0,Maruti Swift Dzire ZDI,G,2012.0,83000.0,Diesel,Manual,First,23.4 kmpl,1248 CC,74 bhp,5.0,,3
1,2.0,Maruti Wagon R LXI Optional,I,2016.0,4800.0,Petrol,Manual,First,20.51 kmpl,998 CC,67.04 bhp,5.0,,3
2,3.0,Mahindra KUV 100 mFALCON D75 K8,F,2016.0,26000.0,Diesel,Manual,First,25.32 kmpl,1198 CC,77 bhp,6.0,,3
3,4.0,Hyundai i20 1.2 Magna,E,2013.0,56127.0,Petrol,Manual,First,18.5 kmpl,1197 CC,80 bhp,5.0,,3
4,5.0,Honda Jazz 1.2 SV i VTEC,H,2017.0,41981.0,Petrol,Manual,First,18.7 kmpl,1199 CC,88.7 bhp,5.0,,3


# Preprocessing data

First of all, we get all numbers from string format

In [9]:
import re
import math

for index, row in train.iterrows():
    #print(row.Consumo)
    consumo = row.Consumo
    motor = row.Motor_CC
    potencia = row.Potencia
    
    if type(consumo) == str:
        train.loc[index,'Consumo'] = float(re.findall(r"^\d+\.\d+", consumo)[0])
    if type(motor) == str:
        train.loc[index,'Motor_CC'] = float(re.findall(r"^\d+", motor)[0])
    if type(potencia) == str:
        train.loc[index,'Potencia'] = float(re.findall(r"^\d+", potencia)[0])
        
for index, row in test.iterrows():
    #print(row.Consumo)
    consumo = row.Consumo
    motor = row.Motor_CC
    potencia = row.Potencia
    
    if type(consumo) == str:
        test.loc[index,'Consumo'] = float(re.findall(r"^\d+\.\d+", consumo)[0])
    if type(motor) == str:
        test.loc[index,'Motor_CC'] = float(re.findall(r"^\d+", motor)[0])
    if type(potencia) == str:
        test.loc[index,'Potencia'] = float(re.findall(r"^\d+", potencia)[0])
        
train['Consumo'] = train['Consumo'].astype(float)
train['Motor_CC'] = train['Motor_CC'].astype(float)
train['Potencia'] = train['Potencia'].astype(float)

test['Consumo'] = test['Consumo'].astype(float)
test['Motor_CC'] = test['Motor_CC'].astype(float)
test['Potencia'] = test['Potencia'].astype(float)

train.head()

Unnamed: 0,id,Nombre,Ciudad,Año,Kilometros,Combustible,Tipo_marchas,Mano,Consumo,Motor_CC,Potencia,Asientos,Descuento,Precio_cat
0,1.0,Maruti Swift Dzire ZDI,G,2012.0,83000.0,Diesel,Manual,First,23.4,1248.0,74.0,5.0,,3
1,2.0,Maruti Wagon R LXI Optional,I,2016.0,4800.0,Petrol,Manual,First,20.51,998.0,67.0,5.0,,3
2,3.0,Mahindra KUV 100 mFALCON D75 K8,F,2016.0,26000.0,Diesel,Manual,First,25.32,1198.0,77.0,6.0,,3
3,4.0,Hyundai i20 1.2 Magna,E,2013.0,56127.0,Petrol,Manual,First,18.5,1197.0,80.0,5.0,,3
4,5.0,Honda Jazz 1.2 SV i VTEC,H,2017.0,41981.0,Petrol,Manual,First,18.7,1199.0,88.0,5.0,,3


In [10]:
train.Descuento = train.Descuento.fillna(0)
test.Descuento = test.Descuento.fillna(0)

In [11]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
data_without_nan = train.copy()
data_without_nan = data_without_nan.dropna()
data_without_nan['Combustible'] = le.fit_transform(data_without_nan['Combustible'])
data_without_nan['Mano'] = le.fit_transform(data_without_nan['Mano'])

test['Combustible'] = le.fit_transform(test['Combustible'])
test['Mano'] = le.fit_transform(test['Mano'])

In [12]:
data_without_nan.head()

Unnamed: 0,id,Nombre,Ciudad,Año,Kilometros,Combustible,Tipo_marchas,Mano,Consumo,Motor_CC,Potencia,Asientos,Descuento,Precio_cat
0,1.0,Maruti Swift Dzire ZDI,G,2012.0,83000.0,1,Manual,0,23.4,1248.0,74.0,5.0,0.0,3
1,2.0,Maruti Wagon R LXI Optional,I,2016.0,4800.0,3,Manual,0,20.51,998.0,67.0,5.0,0.0,3
2,3.0,Mahindra KUV 100 mFALCON D75 K8,F,2016.0,26000.0,1,Manual,0,25.32,1198.0,77.0,6.0,0.0,3
3,4.0,Hyundai i20 1.2 Magna,E,2013.0,56127.0,3,Manual,0,18.5,1197.0,80.0,5.0,0.0,3
4,5.0,Honda Jazz 1.2 SV i VTEC,H,2017.0,41981.0,3,Manual,0,18.7,1199.0,88.0,5.0,0.0,3


# erase all missing data

In [13]:
data_without_nan = data_without_nan.drop('Tipo_marchas', axis=1).drop('Nombre', axis=1).drop('id', axis=1).drop('Ciudad', axis=1)
test_id = test.id
test = test.drop('Tipo_marchas', axis=1).drop('Nombre', axis=1).drop('id', axis=1).drop('Ciudad', axis=1)
target = data_without_nan.Precio_cat

X = data_without_nan.drop('Precio_cat', axis=1)
y = data_without_nan['Precio_cat']

Oversampling

In [14]:
'''
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
'''

'\nfrom imblearn.over_sampling import SMOTE\noversample = SMOTE()\nX, y = oversample.fit_resample(X, y)\n'

Now, train the model

In [15]:
import random
# get random seed parameter for classifiers
random_seed = random.randrange(2**32 - 1)
seed = random_seed
# seed used in first kaggle submission
seed = 2232527064
print("random_seed: "+str(random_seed))
from sklearn.model_selection import StratifiedKFold
cv_ = StratifiedKFold(n_splits=5, shuffle=True)
print("Actual seed: "+str(seed))

random_seed: 2526981992
Actual seed: 2232527064


In [16]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

#model = RandomForestClassifier(random_state=random_seed)
model = XGBClassifier()

from sklearn.model_selection import cross_val_predict
y_pred = cross_val_predict(model, X, y, cv=cv_, n_jobs=-1)

from sklearn.metrics import classification_report, confusion_matrix
rf_cmatrix = confusion_matrix(y, y_pred)
rf_report = get_classification_report(y, y_pred)
report = cmdata_report(rf_cmatrix, 'Random Forest', report)

In [17]:
report

Unnamed: 0,TPR,TNR,FPR,FNR,AUC,G-Mean,F1,G-Measure
Random Forest,0.613757,0.881029,0.118971,0.386243,0.747393,0.735348,0.678363,0.682152


In [18]:
rf_report

Unnamed: 0,precision,recall,f1-score,support
5,0.91746,0.920382,0.918919,628.0
3,0.823406,0.876529,0.849138,1798.0
accuracy,0.805373,0.805373,0.805373,0.805373
weighted avg,0.801825,0.805373,0.802596,3946.0
4,0.802532,0.767554,0.784653,826.0
macro avg,0.775563,0.740549,0.756004,3946.0
1,0.72956,0.585859,0.64986,198.0
2,0.604857,0.552419,0.57745,496.0


And get the prediction

In [19]:
model.fit(data_without_nan.drop('Precio_cat', axis=1), target)
prediction = model.predict(test)





In [20]:
prediction

array([5, 2, 2, ..., 5, 5, 5])

In [21]:
test_id

0       4820
1       4821
2       4822
3       4823
4       4824
        ... 
1154    5974
1155    5975
1156    5976
1157    5977
1158    5978
Name: id, Length: 1159, dtype: int64

Get csv submission

In [22]:
submit = pd.DataFrame(columns = ['id', 'Precio_cat'])
submit['Precio_cat'] = prediction.tolist()
submit['id'] = test_id.tolist()

In [23]:
submit

Unnamed: 0,id,Precio_cat
0,4820,5
1,4821,2
2,4822,2
3,4823,3
4,4824,3
...,...,...
1154,5974,2
1155,5975,3
1156,5976,5
1157,5977,5


In [24]:
submit.to_csv('submission.csv', index=False)

In [25]:
test.columns.tolist()

['Año',
 'Kilometros',
 'Combustible',
 'Mano',
 'Consumo',
 'Motor_CC',
 'Potencia',
 'Asientos',
 'Descuento']