In [1]:
import math
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings("ignore")

### Read Data and Drop location features and concatenate model_info with additional_description

In [2]:
df_train = pd.read_csv("Train.csv")
df_test = pd.read_csv("Test.csv")

df_train.drop(["Additional_Description"], axis=1, inplace=True)
df_test.drop(["Additional_Description"], axis=1, inplace=True)

df_train.head()
# df_test.head()

Unnamed: 0,Brand,Model_Info,Locality,City,State,Price
0,1,name0 name234 64gb space grey,878,8,2,15000
1,1,phone 7 name42 name453 new condition box acce...,1081,4,0,18800
2,1,name0 x 256gb leess used good condition,495,11,4,50000
3,1,name0 6s plus 64 gb space grey,287,10,7,16500
4,1,phone 7 sealed pack brand new factory outet p...,342,4,0,26499


### Functions to create new features

In [3]:
def extract_num(text1, text2):
    # So rom is either current or previous
    try:
        space = int(''.join(filter(str.isdigit, text2)))
    except:
        space = int(''.join(filter(str.isdigit, text1)))
    return space

def get_rom(model_info):
    rom = list()
    for index, text in enumerate(model_info):
        space = ""
        for i in range(len(text)):
            if "gb" in text[i] or "gig" in text[i]:
                # Make sure "ram" is not to the right
                if i < len(text)-1 and "ram" not in text[i+1]:
                    space = extract_num(text[i-1], text[i])
                elif i == len(text)-1:
                    space = extract_num(text[i-1], text[i])
                elif i == 0:
                    space = extract_num(None, text[i])
        if type(space) == int:
            rom.append(space)
        else:
            rom.append(0)
    return rom


def get_ram(model_info):
    ram = list()
    for index, text in enumerate(model_info):
        space = ""
        for i in range(len(text)):
            if "gb" in text[i] or "gig" in text[i]:
                # Make sure "ram" is to the right
                if i < len(text)-1 and "ram" in text[i+1]:
                    space = extract_num(text[i-1], text[i])
        if type(space) == int:
                ram.append(space)
        else:
            ram.append(0)
    return ram


def get_warranty(model_info):
    warranty = list()
    for text in model_info:
        if "war" in text:
            warranty.append(1)
        else:
            warranty.append(0)
    return warranty

def get_cash(model_info):
    cash = list()
    for text in model_info:
        if "cas" in text:
            cash.append(1)
        else:
            cash.append(0)
    return cash

def get_apple_product(brand_type, model_info):
    iphone_type, iwatch_type = list(), list()
    for brand, text in zip(brand_type, model_info):
        if brand == 1:
            if "watch" in text or "iwatch" in text:
                iphone_type.append(0)
                if "5" in text:
                    iwatch_type.append(5)
                elif "4" in text:
                    iwatch_type.append(4)
                elif "3" in text:
                    iwatch_type.append(3)
                elif "2" in text:
                    iwatch_type.append(2)
                elif "1" in text:
                    iwatch_type.append(1)
                else:
                    iwatch_type.append(1)
            else:
                iwatch_type.append(0)
                if ("11" in text or "eleven" in text or "elven" in text) and "pro" in text and "max" in text:
                    iphone_type.append(29)
                elif ("11" in text or "eleven" in text or "elven" in text) and "pro" in text:
                    iphone_type.append(28)
                elif "11" in text or "eleven" in text or "elven" in text:
                    iphone_type.append(27)
                elif "xs" in text and "max" in text:
                    iphone_type.append(26)
                elif "xs" in text:
                    iphone_type.append(25)
                elif "x" in text:
                    iphone_type.append(24)
                elif ("8s" in text and "plus" in text) or ("8" in text and "s" in text and "plus" in text) or "8splus" in text:
                    iphone_type.append(23)
                elif ("8" in text and "plus" in text) or "8plus" in text:
                    iphone_type.append(22)
                elif ("8" in text and "s" in text) or "8s" in text:
                    iphone_type.append(21)
                elif "8" in text:
                    try:
                        next_string = text[text.index("8")+1]
                        if "month" not in next_string or "year" not in next_string or "time" not in next_string:
                            iphone_type.append(20)
                    except:
                        iphone_type.append(20)
                elif ("7s" in text and "plus" in text) or ("7" in text and "s" in text and "plus" in text) or "7splus" in text:
                    iphone_type.append(19)
                elif ("7" in text and "plus" in text) or "7plus" in text:
                    iphone_type.append(18)
                elif ("7" in text and "s" in text) or "7s" in text:
                    iphone_type.append(17)
                elif "7" in text:
                    try:
                        next_string = text[text.index("7")+1]
                        if "month" not in next_string or "year" not in next_string or "time" not in next_string:
                            iphone_type.append(16)
                    except:
                        iphone_type.append(16)
                elif ("6s" in text and "plus" in text) or ("6" in text and "s" in text and "plus" in text) or "6splus" in text:
                    iphone_type.append(15)
                elif ("6" in text and "plus" in text) or "6plus" in text:
                    iphone_type.append(14)
                elif ("6" in text and "s" in text) or "6s" in text:
                    iphone_type.append(13)
                elif "6" in text:
                    try:
                        next_string = text[text.index("6")+1]
                        if "month" not in next_string or "year" not in next_string or "time" not in next_string:
                            iphone_type.append(12)
                    except:
                        iphone_type.append(12)
                elif ("5s" in text and "plus" in text) or ("5" in text and "s" in text and "plus" in text) or "5splus" in text:
                    iphone_type.append(11)
                elif ("5" in text and "plus" in text) or "5plus" in text:
                    iphone_type.append(10)
                elif ("5" in text and "s" in text) or "5s" in text:
                    iphone_type.append(9)
                elif ("5" in text and "c" in text) or "5c" in text:
                    iphone_type.append(8)
                elif "5" in text:
                    try:
                        next_string = text[text.index("5")+1]
                        if "month" not in next_string or "year" not in next_string or "time" not in next_string:
                            iphone_type.append(7)
                    except:
                        iphone_type.append(7)
                elif ("4s" in text and "plus" in text) or ("4" in text and "s" in text and "plus" in text) or "4splus" in text:
                    iphone_type.append(6)
                elif ("4" in text and "plus" in text) or "4plus" in text:
                    iphone_type.append(5)
                elif ("4" in text and "s" in text) or "4s" in text:
                    iphone_type.append(4)
                elif ("4" in text and "c" in text) or "4c" in text:
                    iphone_type.append(3)
                elif "4" in text:
                    try:
                        next_string = text[text.index("4")+1]
                        if "month" not in next_string or "year" not in next_string or "time" not in next_string:
                            iphone_type.append(2)
                    except:
                        iphone_type.append(2)
                else:
                    iphone_type.append(1)
        else:
            iphone_type.append(0)
            iwatch_type.append(0)
    return iphone_type, iwatch_type

### Create new features

In [4]:
df_train["Rom"] = get_rom([i.lower().split() for i in df_train["Model_Info"].values])
df_test["Rom"] = get_rom([i.lower().split() for i in df_test["Model_Info"].values])

df_train["Ram"] = get_ram([i.lower().split() for i in df_train["Model_Info"].values])
df_test["Ram"] = get_ram([i.lower().split() for i in df_test["Model_Info"].values])

df_train["Warranty"] = get_warranty(df_train["Model_Info"].values)
df_test["Warranty"] = get_warranty(df_test["Model_Info"].values)

df_train["Cash"] = get_cash(df_train["Model_Info"].values)
df_test["Cash"] = get_cash(df_test["Model_Info"].values)

df_train["iphone_type"], df_train["iwatch_type"] = get_apple_product(df_train["Brand"].values, [i.lower().split() for i in df_train["Model_Info"].values])
df_test["iphone_type"], df_test["iwatch_type"] = get_apple_product(df_test["Brand"].values, [i.lower().split() for i in df_test["Model_Info"].values])

df_train.head()

Unnamed: 0,Brand,Model_Info,Locality,City,State,Price,Rom,Ram,Warranty,Cash,iphone_type,iwatch_type
0,1,name0 name234 64gb space grey,878,8,2,15000,64,0,0,0,1,0
1,1,phone 7 name42 name453 new condition box acce...,1081,4,0,18800,0,0,0,0,16,0
2,1,name0 x 256gb leess used good condition,495,11,4,50000,256,0,0,0,24,0
3,1,name0 6s plus 64 gb space grey,287,10,7,16500,64,0,0,0,15,0
4,1,phone 7 sealed pack brand new factory outet p...,342,4,0,26499,0,0,0,0,16,0


### Adding My Features

In [30]:
df_train["phone_color"]=999
df_test["phone_color"]=999
for i,j in enumerate(df_train["Model_Info"]):
    if "red" in j:
        df_train["phone_color"][i]=1
    if "gold" in j:
        df_train["phone_color"][i]=2
    if "white" in j:
        df_train["phone_color"][i]=3
    if "black" in j:
        df_train["phone_color"][i]=4
    if "grey" in j:
        df_train["phone_color"][i]=5
    if "name103" in j:
        df_train["phone_color"][i]=6
    if "name182" in j:
        df_train["phone_color"][i]=7
    if "name114" in j:
        df_train["phone_color"][i]=8
    if "name120" in j:
        df_train["phone_color"][i]=9
    if "silver" in j:
        df_train["phone_color"][i]=10
for i,j in enumerate(df_test["Model_Info"]):
    if "red" in j:
        df_test["phone_color"][i]=1
    if "gold" in j:
        df_test["phone_color"][i]=2
    if "white" in j:
        df_test["phone_color"][i]=3
    if "black" in j:
        df_test["phone_color"][i]=4
    if "grey" in j:
        df_test["phone_color"][i]=5
    if "name103" in j:
        df_test["phone_color"][i]=6
    if "name182" in j:
        df_test["phone_color"][i]=7
    if "name114" in j:
        df_test["phone_color"][i]=8
    if "name120" in j:
        df_test["phone_color"][i]=9
    if "silver" in j:
        df_test["phone_color"][i]=10

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
tv = CountVectorizer(min_df=2,ngram_range=(1,2))
tv_matrix = tv.fit_transform(df_train["Model_Info"])
tv_matrix = tv_matrix.toarray()
# vocab = tv.get_feature_names()
# blah=pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)
# blah

tv_matrix2 = tv.transform(df_test["Model_Info"])
tv_matrix2 = tv_matrix2.toarray()
# vocab2 = tv.get_feature_names()
# blah2=pd.DataFrame(np.round(tv_matrix2, 2), columns=vocab2)
# blah

### Convert to numpy arrays

In [31]:
brand_ohe = OneHotEncoder(sparse=False, drop="first")

brand_train = brand_ohe.fit_transform(np.reshape(df_train["Brand"].values, (-1, 1)))
rom_train = np.reshape(df_train["Rom"].values, (-1, 1))
ram_train = np.reshape(df_train["Ram"].values, (-1, 1))
warranty_train = np.reshape(df_train["Warranty"].values, (-1, 1))
cash_train = np.reshape(df_train["Cash"].values, (-1, 1))
iphone_type_train = np.reshape(df_train["iphone_type"].values, (-1, 1))
phone_color_train = np.reshape(df_train["phone_color"].values, (-1, 1))
X_train = np.concatenate((brand_train, rom_train, ram_train, warranty_train,
                          cash_train, iphone_type_train,tv_matrix,phone_color_train), axis=1)

brand_test = brand_ohe.transform(np.reshape(df_test["Brand"].values, (-1, 1)))
rom_test = np.reshape(df_test["Rom"].values, (-1, 1))
ram_test = np.reshape(df_test["Ram"].values, (-1, 1))
warranty_test = np.reshape(df_test["Warranty"].values, (-1, 1))
cash_test = np.reshape(df_test["Cash"].values, (-1, 1))
iphone_type_test = np.reshape(df_test["iphone_type"].values, (-1, 1))
phone_color_test = np.reshape(df_test["phone_color"].values, (-1, 1))
X_test = np.concatenate((brand_test, rom_test, ram_test, warranty_test,
                         cash_test, iphone_type_test,tv_matrix2,phone_color_test), axis=1)

Y = df_train["Price"].values
# Y_classes = [math.ceil(i/20000)-1 for i in df_train["Price"].values]
Y_classes = pd.cut(df_train['Price'], bins=5).values
# Y_classes = pd.qcut(df_train['Price'], q=3, precision=0).values
print(Y_classes.value_counts())
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y_classes = le.fit_transform(Y_classes)
print(X_train.shape, X_test.shape, Y.shape)

(269.401, 26318.8]      1604
(26318.8, 52238.6]       449
(52238.6, 78158.4]       191
(78158.4, 104078.2]       64
(104078.2, 129998.0]      18
dtype: int64
(2326, 1728) (997, 1728) (2326,)


### Cross-Validation 1

In [33]:
from imblearn.combine import SMOTETomek,SMOTEENN
from imblearn.over_sampling import *
from collections import Counter

kfold, scores = StratifiedKFold(n_splits=3, shuffle=True, random_state=27), list()
for train, test in kfold.split(X_train,Y_classes):
    x_train, x_test = X_train[train], X_train[test]
    y_train, y_test = np.log(Y[train]), Y[test]
    y_train_bins, y_test_bins = Y_classes[train], Y_classes[test]
    
    model = CatBoostRegressor(random_state=27, verbose=500)
    #, cat_features=[0])
    
    X_ = np.concatenate((x_train,np.reshape(y_train,(-1,1))),axis=1)
    
    num_class1, num_class2 = Counter(y_train_bins)[0],Counter(y_train_bins)[1]
    num_class3, num_class4 = Counter(y_train_bins)[2],Counter(y_train_bins)[3]
    num_class5 = Counter(y_train_bins)[4]
    smote = RandomOverSampler(random_state=27, sampling_strategy={0: int(1.2*num_class1),
                                                   1: int(1.3*num_class2),
                                                   2: int(1.4*num_class3),
                                                   3: int(1.7*num_class4),
                                                   4: int(2*num_class5)})
    X_,_ = smote.fit_resample(X_,y_train_bins)
    
#     model = LGBMRegressor(random_state=27, verbose=500)#, max_depth=4, n_estimators=400)
    model.fit(X_[:,:-1], X_[:,-1])
    preds = np.exp(model.predict(x_test))
#     model = XGBRegressor(random_state=27, n_jobs=-1, objective="reg:squarederror", max_depth=6, n_estimators=100)
#     model.fit(x_train, y_train)
#     preds = np.exp(model.predict(x_test))
    
    score = np.sqrt(mean_squared_log_error(y_test, preds))
#     score = mean_squared_log_error(y_test, preds)
    print(score)
    scores.append(score)
print("Average: ", sum(scores)/len(scores))

Learning rate set to 0.043933
0:	learn: 0.8423206	total: 17.5ms	remaining: 17.5s
500:	learn: 0.3433378	total: 8.49s	remaining: 8.46s
999:	learn: 0.2943750	total: 16.9s	remaining: 0us
0.4128482830568511
Learning rate set to 0.043941
0:	learn: 0.8574670	total: 18.1ms	remaining: 18.1s
500:	learn: 0.3234352	total: 9.75s	remaining: 9.71s
999:	learn: 0.2652190	total: 18.6s	remaining: 0us
0.49292488780943694
Learning rate set to 0.043941
0:	learn: 0.8271087	total: 19.4ms	remaining: 19.4s
500:	learn: 0.3213596	total: 8.78s	remaining: 8.75s
999:	learn: 0.2704506	total: 17.5s	remaining: 0us
0.460954038164052
Average:  0.4555757363434467


### Cross-Validation 2

In [23]:
kfold, scores = KFold(n_splits=10, shuffle=True, random_state=0), list()
for train, test in kfold.split(X_train):
    x_train, x_test = X_train[train], X_train[test]
    y_train, y_test = np.log(Y[train]), Y[test]
    y_train_bins, y_test_bins = Y_classes[train], Y_classes[test]

    
    model = CatBoostRegressor(random_state=27, verbose=500)
    X_ = np.concatenate((x_train,np.reshape(y_train,(-1,1))),axis=1)
    
    num_class1, num_class2 = Counter(y_train_bins)[0],Counter(y_train_bins)[1]
    num_class3, num_class4 = Counter(y_train_bins)[2],Counter(y_train_bins)[3]
    num_class5 = Counter(y_train_bins)[4]
    smote = RandomOverSampler(random_state=27, sampling_strategy={0: int(1.2*num_class1),
                                                   1: int(1.3*num_class2),
                                                   2: int(1.4*num_class3),
                                                   3: int(1.7*num_class4),
                                                   4: int(1.7*num_class5)})
    X_,_ = smote.fit_resample(X_,y_train_bins)
#     model = LGBMRegressor(random_state=27, verbose=500)#, max_depth=6, n_estimators=600)
    model.fit(X_[:,:-1], X_[:,-1])
    preds = np.exp(model.predict(x_test))
    
    score = np.sqrt(mean_squared_log_error(y_test, preds))
    print(score)
    scores.append(score)
print("Average: ", sum(scores)/len(scores))

Learning rate set to 0.046323
0:	learn: 0.8335673	total: 23.2ms	remaining: 23.2s
500:	learn: 0.3407032	total: 10.9s	remaining: 10.9s
999:	learn: 0.2932891	total: 23.5s	remaining: 0us
0.48695191744306526
Learning rate set to 0.046336
0:	learn: 0.8396687	total: 19.5ms	remaining: 19.5s
500:	learn: 0.3383543	total: 10.9s	remaining: 10.8s
999:	learn: 0.2879318	total: 21.6s	remaining: 0us
0.48861008512982856
Learning rate set to 0.046336
0:	learn: 0.8459203	total: 19.7ms	remaining: 19.7s
500:	learn: 0.3463794	total: 10.4s	remaining: 10.3s
999:	learn: 0.2937971	total: 23.4s	remaining: 0us
0.40928880308911303
Learning rate set to 0.04633
0:	learn: 0.8313428	total: 20.4ms	remaining: 20.3s
500:	learn: 0.3376197	total: 11.4s	remaining: 11.3s
999:	learn: 0.2869870	total: 23.6s	remaining: 0us
0.4669624077278755
Learning rate set to 0.046345
0:	learn: 0.8348115	total: 19.7ms	remaining: 19.7s
500:	learn: 0.3410871	total: 9.68s	remaining: 9.64s
999:	learn: 0.2954361	total: 19.2s	remaining: 0us
0.46825

In [None]:
# Catboost() CV1: 0.5771441890189664  CV2: 0.5672534125994342  Score: 0.57921
# Catboost() CV1: 0.48683637837005284  CV2: 0.4764979580938755
# Catboost() CV1:  0.4576250349953917  CV2:0.4446713988713829 Score:0.43458
#1000:0.44285958261312197
#500:0.4398350220581794

### Train Final model

In [34]:

X_ = np.concatenate((X_train,np.reshape(Y,(-1,1))),axis=1)

num_class1, num_class2 = Counter(Y_classes)[0],Counter(Y_classes)[1]
num_class3, num_class4 = Counter(Y_classes)[2],Counter(Y_classes)[3]
num_class5 = Counter(Y_classes)[4]
smote = RandomOverSampler(random_state=27, sampling_strategy={0: int(1.2*num_class1),
                                                   1: int(1.3*num_class2),
                                                   2: int(1.4*num_class3),
                                                   3: int(1.7*num_class4),
                                                   4: int(2*num_class5)})


X_,_ = smote.fit_resample(X_,Y_classes)
X_.shape

(2918, 1729)

In [36]:


model = CatBoostRegressor(random_state=27, verbose=500)
# model.fit(X_train, np.log(Y))
model.fit(X_[:,:-1], np.log(X_[:,-1]))
preds = np.exp(model.predict(X_test))

Learning rate set to 0.047227
0:	learn: 0.8304719	total: 20.9ms	remaining: 20.9s
500:	learn: 0.3418092	total: 10.2s	remaining: 10.2s
999:	learn: 0.2959748	total: 20.5s	remaining: 0us


In [37]:
preds[:10]

array([13407.21397175, 10086.2058145 , 13534.19506231, 44049.35631442,
        6769.29956538, 28627.281658  , 19896.47536505, 12107.60218017,
       13683.62002516, 17619.81339833])

### Make final submission

In [38]:
df_submit = pd.DataFrame({'Price': preds})
df_submit.to_excel("submit4.xlsx", index=False)

In [None]:
#0.49862