In [1]:
from typing import Optional, Any, Set
from numbers import Number
import json
from pathlib import Path
from difflib import SequenceMatcher

from pandarallel import pandarallel
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
from pylab import rcParams

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

In [2]:
%%capture
cd ..

In [3]:
import similarity as sim
import constants as const
import cleaner

In [4]:
# Enable tqdm's pandas integration (per the tqdm docs this registers the
# `progress_apply` family of methods on pandas objects).
tqdm.pandas()
# Start pandarallel with its progress bar switched on. On start-up it logs the
# number of workers and the data-transfer mode it selected (see cell output).
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [6]:
# Read the raw train and test tables from the CSV paths configured in
# `constants`, then preview the first rows of the training data.
df_train_raw, df_test_raw = (
    pd.read_csv(csv_path) for csv_path in (const.TRAIN_PATH, const.TEST_PATH)
)
df_train_raw.head()

Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,...,mileage,omv,arf,opc_scheme,lifespan,eco_category,features,accessories,indicative_price,price
0,1030324,BMW 3 Series 320i Gran Turismo M-Sport,bmw,320i,1 owner! 320i gt m-sports model! big brake kit...,2013.0,,09-dec-2013,luxury sedan,"parf car, premium ad car, low mileage car",...,73000.0,45330.0,50462.0,,,uncategorized,"5 doors gt, powerful and fuel efficient 2.0l t...","bmw i-drive, navigation, bluetooth/aux/usb inp...",,71300.0
1,1021510,Toyota Hiace 3.0M,,hiace,high loan available! low mileage unit. wear an...,2014.0,,26-jan-2015,van,premium ad car,...,110112.0,27502.0,1376.0,,25-jan-2035,uncategorized,low mileage unit. well maintained vehicle. vie...,factory radio setting. front recording camera....,,43800.0
2,1026909,Mercedes-Benz CLA-Class CLA180,mercedes-benz,cla180,1 owner c&c unit. full agent service with 1 mo...,2016.0,,25-jul-2016,luxury sedan,"parf car, premium ad car",...,80000.0,27886.0,26041.0,,,uncategorized,responsive and fuel efficient 1.6l inline 4 cy...,dual electric/memory seats. factory fitted aud...,,95500.0
3,1019371,Mercedes-Benz E-Class E180 Avantgarde,mercedes-benz,e180,"fully agent maintained, 3 years warranty 10 ye...",2019.0,,17-nov-2020,luxury sedan,"parf car, almost new car, consignment car",...,9800.0,46412.0,56977.0,,,uncategorized,"1.5l inline-4 twin scroll turbocharged engine,...",64 colour ambient lighting. active parking ass...,,197900.0
4,1031014,Honda Civic 1.6A VTi,,civic,"kah motor unit! 1 owner, lowest 1.98% for full...",2019.0,,20-sep-2019,mid-sized sedan,parf car,...,40000.0,20072.0,20101.0,,,uncategorized,"1.6l i-vtec engine, 123 bhp, earth dreams cvt ...","s/rims, premium leather seats, factory touch s...",,103200.0


In [10]:
# Apply the shared preliminary cleaning step to both raw frames; the test
# frame is passed with is_test=True. Inputs are left untouched by name: the
# cleaned results are bound to new variables.
df_train, df_test = (
    cleaner.clean_preliminary(df_train_raw),
    cleaner.clean_preliminary(df_test_raw, is_test=True),
)

### Train

In [11]:
%%time

if const.GENERATE_SIM_DF:
    # Wall time: 3h 59min 32s
    sim_df = sim.compute_similarities(df_train, df_train)
    sim_df.to_pickle(const.MOST_SIMILIAR_TRAIN_PATH)
else:
    sim_df = pd.read_pickle(const.MOST_SIMILIAR_TRAIN_PATH)

CPU times: user 1.34 ms, sys: 355 ms, total: 357 ms
Wall time: 356 ms


In [12]:
# Fill missing values in the cleaned training frame using the precomputed
# train similarity table; verbose=True turns on the progress output.
replaced_train = sim.replace_nan_with_most_similar(
    df_train,
    sim_df=sim_df,
    verbose=True,
)

Getting top k most similar...


Computing weights: 100%|██████████| 5219/5219 [00:02<00:00, 1801.20it/s]
Replacing NaN rows: 100%|██████████| 5219/5219 [00:59<00:00, 88.17it/s]


In [None]:
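# Optional: uncomment to persist the filled training frame. Note that to_csv
# writes the index as an extra column unless index=False is passed.
# replaced_train.to_csv('data/train_sim_filled.csv')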

### Test

In [13]:
%%time

if const.GENERATE_SIM_DF:
    # Wall time: 2h 39min 6s
    sim_df_test = sim.compute_similarities(df_test, df_test)
    sim_df_test.to_pickle(const.MOST_SIMILIAR_TEST_PATH)
else:
    sim_df_test = pd.read_pickle(const.MOST_SIMILIAR_TEST_PATH)

CPU times: user 1.37 ms, sys: 378 ms, total: 380 ms
Wall time: 379 ms


In [14]:
# Fill missing values in the cleaned test frame; df_train and the test
# similarity table are passed positionally, verbose=True turns on progress output.
# NOTE(review): sim_df_test is built from (df_test, df_test), yet df_train is
# supplied here as the second argument -- confirm that the similarity table
# refers to rows of the frame the helper actually draws replacement values from.
replaced_test = sim.replace_nan_with_most_similar(
    df_test,
    df_train,
    sim_df_test,
    verbose=True,
)

Getting top k most similar...


Computing weights: 100%|██████████| 5000/5000 [00:03<00:00, 1634.40it/s]
Replacing NaN rows: 100%|██████████| 5000/5000 [00:37<00:00, 133.35it/s]


In [None]:
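# Optional: uncomment to persist the filled test frame. Note that to_csv
# writes the index as an extra column unless index=False is passed.
# replaced_test.to_csv('data/test_sim_filled.csv')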