In [3]:
import pandas as pd
import numpy as np
from datetime import datetime
import httpx
from bs4 import BeautifulSoup
import re
import time

In [4]:
# Columns cleaned by the helper functions below: original_reg_date, reg_date, lifespan, fuel_type, opc_scheme

In [5]:
def handle_date_fields(dataF, now=None):
    """Consolidate the registration-date columns and derive ``car_age``.

    * Builds one fully populated ``registered_date`` column: ``reg_date``
      where present, otherwise ``original_reg_date``. Both source columns
      are then dropped.
    * Removes rows whose ``registered_date`` lies after ``now``.
    * Drops ``lifespan`` because it is sparsely populated. Author's notes:
      ~1500 rows have ``lifespan - registered_date`` equal to 7304 days and
      22 rows have a larger, unique gap. An alternative would be to fill the
      missing values with ``registered_date + 7304 days``.
    * Adds ``car_age`` (``now.year - manufactured``) and removes rows whose
      age is negative or above 50 (e.g. a bad ``manufactured`` of 2925).

    Parameters
    ----------
    dataF : pandas.DataFrame
        Must contain ``lifespan``, ``reg_date``, ``original_reg_date`` and
        ``manufactured``.
    now : datetime.datetime, optional
        Reference point for "future" dates and for the age calculation.
        Defaults to ``datetime.now()``; pass a fixed value for reproducible
        results.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``dataF`` is not modified. The original index labels
        are preserved.
    """
    df = dataF.copy()
    if now is None:
        now = datetime.now()

    reg_date = pd.to_datetime(df["reg_date"])
    original_reg_date = pd.to_datetime(df["original_reg_date"])

    # Fix NaNs across original_reg_date / reg_date by merging them.
    df["registered_date"] = reg_date.fillna(original_reg_date)
    df = df.drop(columns=["reg_date", "original_reg_date", "lifespan"])

    df["car_age"] = now.year - df["manufactured"]

    # Filter with boolean masks instead of dropping by index label: a
    # label-based drop removes *every* row sharing a label, which silently
    # discards valid rows when the index is not unique.
    registered_in_future = df["registered_date"] > pd.Timestamp(now)
    implausible_age = (df["car_age"] > 50) | (df["car_age"] < 0)

    return df[~(registered_in_future | implausible_age)].copy()

In [6]:
def handle_opc(dataF):
    """Binarise ``opc_scheme`` into the string flags "0" and "1".

    Missing values become "0" (non-OPC vehicle); every other value that is
    not already "0" becomes "1" (OPC vehicle). Returns a new frame and leaves
    ``dataF`` untouched.
    """
    frame = dataF.copy()

    scheme = frame["opc_scheme"].fillna("0")
    # Anything that is not the "0" marker counts as an OPC vehicle.
    frame["opc_scheme"] = scheme.mask(scheme != "0", "1")

    return frame

In [7]:
def handle_make(dataF):
    """Fill missing ``make`` values with the first word of ``title``.

    Author's note: every row has a model, but not every row has a make; the
    make can be recovered from the listing title. The first space-separated
    token of the lower-cased title is used as the fallback.

    Rows whose ``title`` is itself missing keep a missing ``make`` instead of
    raising, because the vectorised ``.str`` accessor propagates NaN where
    ``apply(str.lower)`` would fail on a non-string.

    Returns a new frame; ``dataF`` is not modified.
    """
    df = dataF.copy()

    first_title_word = df["title"].str.lower().str.split(" ").str[0]
    df["make"] = df["make"].fillna(first_title_word)

    return df
    
    

In [None]:
def handle_fuel_type(dataF, fueltype_path="fueltype.csv"):
    """Fill missing ``fuel_type`` values from the scraped fuel-type CSV.

    The CSV is expected to hold the row labels in its first column and the
    scraped value in a ``fuel`` column. The first column is used as the index
    so that ``fillna`` aligns each scraped value with the row it was scraped
    for. Previously the ``set_index`` result was discarded, so values were
    aligned on the file's 0..n-1 positions instead of the saved labels.

    Parameters
    ----------
    dataF : pandas.DataFrame
        Frame with a ``fuel_type`` column.
    fueltype_path : str, optional
        Location of the scraped CSV. Defaults to ``"fueltype.csv"``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``dataF`` is not modified.
    """
    df = dataF.copy()

    fueltype = pd.read_csv(fueltype_path, index_col=0)
    df["fuel_type"] = df["fuel_type"].fillna(fueltype["fuel"])

    return df

In [None]:
def get_carCode_from_listing(listing_id):
    """Scrape the ``CarCode`` referenced by a used-car listing page.

    Fetches the listing page, finds the first link next to the
    ``twoRow_info`` element and extracts the value of its ``CarCode=`` query
    parameter.

    Returns
    -------
    str or bytes
        The car code on success. If the page was fetched with status 200 but
        could not be parsed, the raw response body is returned so the failure
        can be inspected. In every other failure case, including the request
        itself failing, the sentinel ``"MISSING"`` is returned.
    """
    # Initialised up front so the error path can tell "request failed" from
    # "request succeeded but parsing failed" without hitting an unbound name.
    r = None
    try:
        r = httpx.get("https://www.sgcarmart.com/used_cars/info.php?ID=" + str(listing_id))
        content = BeautifulSoup(r.content, 'html.parser')
        parent = content.find(class_="twoRow_info")
        link = parent.parent.find('a')
        carCode = re.search(r'CarCode=(.+?)"', str(link)).group(1).replace("'", "")
    except Exception:
        if r is not None and r.status_code == 200:
            carCode = r.content
        else:
            carCode = "MISSING"
    return carCode

In [None]:
# Module-level cache used by get_subcode_from_carCode: maps a carCode to the
# value that function returned for it, so each code is requested only once.
subCode_data ={}

In [None]:
def get_subcode_from_carCode(carCode):
    """Return the submodel links listed on a car's new-car specs page.

    Results are memoised in the module-level ``subCode_data`` dict.

    Returns
    -------
    * ``"MISSING"`` when ``carCode`` is the ``"MISSING"`` sentinel.
    * The list of ``<a>`` tags inside ``#submodels_ul_link`` on success.
    * The whole parsed page when the page was fetched but has no submodel
      list. This result is cached, as before.
    * An empty list when the request itself fails. This is not cached, so a
      transient network error does not poison later lookups. Previously this
      case crashed on an unbound ``content`` variable.
    """
    if carCode == "MISSING":
        return "MISSING"
    if carCode in subCode_data:
        return subCode_data[carCode]

    try:
        r = httpx.get("https://www.sgcarmart.com/new_cars/newcars_specs.php?CarCode=" + str(carCode))
        content = BeautifulSoup(r.content, 'html.parser')
    except Exception:
        return []

    try:
        listElement = content.find(id="submodels_ul_link").find_all('a')
    except Exception:
        # No submodel list on the page: keep the page itself for inspection.
        subCode_data[carCode] = content
        return content

    subCode_data[carCode] = listElement
    return listElement

In [None]:
def get_all_carCodes_subCarCodes(dataF):
    """Return a copy of ``dataF`` with a scraped ``CarCode`` column added.

    Each ``listing_id`` is passed to ``get_carCode_from_listing``, so this
    performs one HTTP request per row.
    """
    enriched = dataF.copy()
    car_codes = enriched.listing_id.apply(get_carCode_from_listing)
    enriched["CarCode"] = car_codes
    return enriched

In [None]:
# Load the training data and run the cleaning steps in order.
df = (
    pd.read_csv("train.csv")
    .pipe(handle_date_fields)
    .pipe(handle_opc)
    .pipe(handle_make)
)

# Timing harness around the (currently disabled) car-code scrape.
a = time.time()
# dh = get_all_carCodes_subCarCodes(df)
elapsed_seconds = time.time() - a
print(elapsed_seconds)
len(df)

In [None]:
# Show how often each fuel_type value occurs in df.
df.fuel_type.value_counts()

In [None]:
# Fill missing fuel types from the scraped CSV, then re-check the distribution.
df = handle_fuel_type(df)
df.fuel_type.value_counts()

In [None]:
# Pair every cached car code with its normalised fuel category.
# NOTE(review): dic_f and get_fuel_value are defined in later cells, so this
# cell only works after those cells have been run.
fuel_list = [(car_code, get_fuel_value(raw_fuel)) for car_code, raw_fuel in dic_f.items()]

In [None]:
# Normalise the scraped car codes: anything that is not a purely numeric
# string (missing values, raw response bodies, ...) becomes "MISSING".
dh = pd.read_csv("buffer_carCode.csv")
# .eq(True) yields a real boolean mask with NaN -> False. Inverting the
# object-dtype result of fillna(False) is only a logical NOT when pandas
# silently downcasts it to bool, which newer pandas versions no longer do.
is_numeric_code = dh.CarCode.str.isnumeric().eq(True)
dh.loc[~is_numeric_code, "CarCode"] = "MISSING"
dh = dh.set_index("Unnamed: 0")
dh[["CarCode"]].to_csv("carcodes.csv")

In [None]:
# Load the car codes from carcodes.csv into the working frame dhf.
dhf = pd.read_csv("carcodes.csv")

In [None]:
# Look up the submodel links for every car code (results are cached per code
# inside get_subcode_from_carCode) and print the elapsed wall-clock seconds.
a = time.time()
dhf["Subcode"] = dhf.CarCode.apply(get_subcode_from_carCode)
print(time.time()-a)

In [None]:
# Cache: (carCode, subcode) -> fuel type text scraped from the specs page.
dic = {}


def get_fuel_type(carCode, subCode, fuel_type):
    """Scrape the fuel type for a car's first submodel.

    Parameters
    ----------
    carCode : str
        Car code, or the sentinel ``"MISSING"``.
    subCode : list
        Sequence of ``(subcode, model)`` pairs; only the first pair is used.
    fuel_type : str
        Current marker for the row. ``"NOT"`` means "do not look this up".

    Returns
    -------
    str
        ``"NOT"`` when the lookup is skipped (``fuel_type`` is ``"NOT"``,
        ``carCode`` is ``"MISSING"`` or ``subCode`` is empty), the scraped
        fuel-type text on success, or ``"MISSING"`` when anything goes wrong.
    """
    if fuel_type == "NOT" or carCode == "MISSING" or subCode == []:
        return "NOT"

    # Bound before the try block so the error handler can always print it,
    # even when the failure happens before the URL has been built.
    query = None
    try:
        cache_key = (carCode, subCode[0][0])
        if cache_key in dic:
            return dic[cache_key]
        # Plain "&" separator: the HTML-escaped "&amp;" made the server see
        # a parameter called "amp;Subcode" instead of "Subcode".
        query = (
            "https://www.sgcarmart.com/new_cars/newcars_specs.php?CarCode="
            + str(carCode)
            + "&Subcode="
            + str(subCode[0][0])
        )
        r = httpx.get(query)
        content = BeautifulSoup(r.content, 'html.parser')
        fuel_type = content.find('td', text='Fuel type').nextSibling.nextSibling.text
        dic[cache_key] = fuel_type
    except Exception:
        print(carCode, subCode, query)
        fuel_type = "MISSING"

    return fuel_type

In [None]:
import ast
# Optional restart point: dhf = pd.read_csv("carcodes_subcodes.csv")
dhf = dhf.set_index("Unnamed: 0")
dhf["title"] = df.title

# Rows start as "NOT" (do not scrape) and are promoted to "CAN" only when
# exactly one (subcode, model) pair was extracted for them.
dhf["fuel_type"] = "NOT"
dhf["ExtractedSubcode"] = dhf.Subcode.apply(extract_subcode_and_model)
has_single_submodel = dhf.ExtractedSubcode.apply(len) == 1
dhf.loc[has_single_submodel, "fuel_type"] = "CAN"

In [None]:
# Count how many rows carry each fuel_type marker in dhf.
dhf.fuel_type.value_counts()

In [None]:
# Spot-check the parser on the Subcode value of the row labelled 0.
# NOTE(review): extract_subcode_and_model is defined in a later cell.
extract_subcode_and_model(dhf.loc[0,"Subcode"])

In [None]:
def extract_subcode_and_model(entry):
    """Pull ``(subcode, model name)`` pairs out of a collection of links.

    Every item of ``entry`` is converted to text and searched for a
    ``Subcode=...`` value in an attribute followed by the anchor's text.
    Items that do not match are skipped.

    Returns a list of ``(subcode, model)`` tuples in input order.
    """
    searches = (
        re.search("Subcode=(.+?)\".*>(.+?)</a>", str(item))
        for item in entry
    )
    return [(found[1], found[2]) for found in searches if found]

In [None]:
# Display the working frame for inspection.
dhf

In [None]:
# First element of the first extracted (subcode, model) pair for row 0.
dhf.loc[0,"ExtractedSubcode"][0][0]

In [None]:
# Resolve the fuel type row by row via get_fuel_type and print the elapsed
# wall-clock seconds.
a = time.time()
dhf.fuel_type = dhf.apply(lambda x: get_fuel_type(x.CarCode, x.ExtractedSubcode, x.fuel_type), axis=1)
print(time.time()-a)

In [None]:
# Cache: carCode -> fuel type text scraped from the car's specs page.
dic_f = {}


def get_fuel_type_only_on_carCode(carCode):
    """Scrape the fuel type from a car's specs page using only its car code.

    Returns ``"NOT"`` for the ``"MISSING"`` sentinel, the cached or freshly
    scraped fuel-type text on success, and ``"MISSING"`` (after printing the
    requested URL) when the lookup fails. Failures are not cached.
    """
    if carCode == "MISSING":
        return "NOT"

    try:
        if carCode in dic_f:
            return dic_f[carCode]
        query = "https://www.sgcarmart.com/new_cars/newcars_specs.php?CarCode="+str(carCode)
        response = httpx.get(query)
        page = BeautifulSoup(response.content, 'html.parser')
        label_cell = page.find('td', text='Fuel type')
        # The value cell is two siblings after the "Fuel type" label cell.
        scraped = label_cell.nextSibling.nextSibling.text
        dic_f[carCode] = scraped
        return scraped
    except Exception as e:
        print(query)
        return "MISSING"

In [None]:
# Add a "fuel" column by looking each row's CarCode up on its specs page.
dhf["fuel"] = dhf.apply(lambda x: get_fuel_type_only_on_carCode(x.CarCode), axis=1)

In [None]:
# Distribution of the raw scraped fuel values.
dhf.fuel.value_counts()

In [None]:
# Take an independent copy: this frame's "fuel" column is overwritten later,
# and assigning into a selection of dhf triggers SettingWithCopyWarning.
extracted_fuel_data = dhf[["fuel"]].copy()

In [None]:
def get_fuel_value(x):
    """Map a free-text fuel description to one of four fuel categories.

    The match is case-insensitive and checked in priority order:
    "petrol-electric", then "diesel", then "electric". Any other text falls
    back to "petrol".

    NOTE(review): because of the fallback, sentinel strings such as
    "MISSING" or "NOT" are also reported as "petrol". Confirm that is
    intended.
    """
    lowered = str.lower(x)
    for category in ("petrol-electric", "diesel", "electric"):
        if category in lowered:
            return category
    return "petrol"

In [None]:
# Display the extracted fuel column for inspection.
extracted_fuel_data

In [None]:
# Replace each raw scraped fuel text with its category from get_fuel_value.
extracted_fuel_data.fuel = extracted_fuel_data.apply(lambda x: get_fuel_value(x.fuel), axis=1)

In [None]:
# Distribution of the fuel categories in extracted_fuel_data.
extracted_fuel_data.fuel.value_counts()

In [None]:
# Persist the result; handle_fuel_type reads this file back.
extracted_fuel_data.to_csv("fueltype.csv")

In [None]:
############################## Experiment/EDA below this ################################

# Titles split into words, longest title (by word count) first.
tit = df.title
revlen = sorted(tit.str.split(" "), key=len, reverse=True)

# Number of distinct first words across all titles.
f = {words[0] for words in revlen}
len(f)

## To check with similarity setup whether it's populating correctly or not.

longest_title = " ".join(revlen[0])
df[df.title == longest_title][:3]

title_fuel_type_list = df[["title", "fuel_type"]].copy()

from googleapiclient.discovery import build
from time import sleep
from random import randint

import os

# Credentials are read from the environment rather than committed to the
# notebook. The key that used to be hard-coded here must be treated as leaked
# and revoked.
my_api_key = os.environ.get("GOOGLE_API_KEY", "")  # The API key you acquired
my_cse_id = os.environ.get("GOOGLE_CSE_ID", "")  # The search-engine ID you created


def google_search(search_term, api_key, cse_id, **kwargs):
    """Run a Google Custom Search query and return its result items.

    Extra keyword arguments are forwarded to the ``list`` call of the search
    API. On any failure, the function does not raise; it returns the string
    ``"ERROR"`` followed by the exception text.
    """
    try:
        service = build("customsearch", "v1", developerKey=api_key)
        request = service.cse().list(q=search_term, cx=cse_id, **kwargs)
        return request.execute()['items']
    except Exception as e:
        return "ERROR" + str(e)

def find_fuel_type(title):
    """Search for a listing title's sgcarmart specs page.

    Issues one custom-search request, pauses 0.5 to 1.0 seconds, and returns
    the first result item. If the search failed, the ``"ERROR..."`` string
    produced by ``google_search`` is returned unchanged.
    """
    res = google_search(title + " sgcarmart newcars_specs.php fuel type", my_api_key, my_cse_id, num =1)
    # Random pause between consecutive API calls.
    sleep(randint(500,1000)/1000)
    return res if str(res).startswith('ERROR') else res[0]
# Probe: a nonsense title, to see what a failed search returns.
find_fuel_type("DFSDFSDF")

results = []

# Probe: repeat the same query 150 times and print the iteration counter.
# NOTE(review): every iteration spends one API request.
for i in range(150):
    results.append(find_fuel_type("Mercedes-Benz E-Class E180 Avantgarde"))
    print(i)

# Probe: call the search helper directly, then repeat the nonsense-title check.
google_search("dfg newcars_specs.php fuel type", my_api_key, my_cse_id, num=1)
find_fuel_type("DFSDFSDF")
# For every row keep the known fuel type, or search for it when it is missing.
# Each entry is [index label, title, fuel type or search result].
fuel_type_scraped_list = []
for i in title_fuel_type_list.index:
    try:
        row = title_fuel_type_list.loc[[i]]
        title = row.title.iloc[-1]
        known_fuel_type = row.fuel_type.iloc[-1]
        # pd.notna is an explicit missing-value test; "~" is a bitwise
        # operator and only acts as NOT on NumPy booleans.
        fuel_type = known_fuel_type if pd.notna(known_fuel_type) else find_fuel_type(title)
        fuel_type_scraped_list.append([i, title, fuel_type])
    except Exception as e:
        # str(i): index labels are not strings, so concatenating them raised
        # a TypeError inside this handler and hid the real error.
        print("Error at " + str(i) + ": " + repr(e))
        # Stop at the first failure instead of repeating it for every row.
        break

In [None]:
import requests
from random import randint
from time import sleep

# Probe: request the same specs page 1000 times, pausing 0.5 to 1.0 seconds
# between requests, and count how many responses had status 200.
c = 0 
for i in range(1000):
    r = httpx.get("http://www.sgcarmart.com/new_cars/newcars_specs.php?CarCode=12315")
    sleep(randint(50,100)/100)
    if str(r.status_code) == "200":
        c += 1
print(c)

In [None]:
headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8'}
# headers must be passed by keyword: the second positional argument of
# requests.get is `params`, so the dict was being sent as a query string.
r = requests.get("http://www.sgcarmart.com/new_cars/newcars_specs.php?CarCode=12315", headers=headers)

In [None]:
import httpx
from bs4 import BeautifulSoup

# Probe: fetch one specs page and list the links inside #submodels_ul_link.
r = httpx.get("http://www.sgcarmart.com/new_cars/newcars_specs.php?CarCode=12315")
BeautifulSoup(r.content, 'html.parser').find(id="submodels_ul_link").find_all('a')

In [None]:
import time
# Probe: measure how long one random 0.5 to 1.0 second pause takes.
a = time.time()
sleep(randint(50,100)/100)
print(time.time()-a)
print("122")

In [None]:
def get_all_carCodes_subCarCodes(dataF):
    """Return a copy of ``dataF`` with empty ``CarCode`` and ``Subcode`` columns.

    Both columns are filled with the empty string; no scraping happens here.

    NOTE(review): this redefinition replaces the earlier function of the same
    name when the notebook is run top to bottom. Confirm which one is meant
    to survive.
    """
    return dataF.copy().assign(CarCode="", Subcode="")
    

In [None]:
def get_carCode_from_listing(listing_id):
    """Scrape the ``CarCode`` referenced by a used-car listing page.

    ``listing_id`` may be an int or a string; it is converted with ``str``
    before being appended to the URL. Plain concatenation raised a TypeError
    for integer ids.

    Returns
    -------
    str or bytes
        The car code on success. If the page was fetched with status 200 but
        could not be parsed, the raw response body is returned for
        inspection. In every other failure case, including the request
        itself failing, the sentinel ``"MISSING"`` is returned, so one bad
        listing does not abort a whole ``apply`` run.
    """
    r = None
    try:
        r = httpx.get("https://www.sgcarmart.com/used_cars/info.php?ID=" + str(listing_id))
        content = str(BeautifulSoup(r.content, 'html.parser').find(class_="twoRow_info").parent.find('a'))
        carCode = re.search(r'CarCode=(.+?)"', content).group(1).replace("'", "")
    except Exception:
        if r is not None and r.status_code == 200:
            carCode = r.content
        else:
            carCode = "MISSING"
    return carCode

In [None]:
# Probe: run the car-code extraction steps by hand for listing 1021510.
r = httpx.get("https://www.sgcarmart.com/used_cars/info.php?ID="+str(1021510))
# Text of the first <a> under the parent of the "twoRow_info" element.
content = str(BeautifulSoup(r.content, 'html.parser').find(class_="twoRow_info").parent.find('a'))
# Capture the CarCode= value up to the closing quote and strip apostrophes.
carCode = re.search("CarCode=(.+?)\"",content).group(1).replace("'","")


In [None]:
# Display the current value of `content`.
content

In [None]:
import re


In [None]:
# Probe: fetch a single listing page (ID 1034412) and keep the response in r.
r = httpx.get("https://www.sgcarmart.com/used_cars/info.php?ID="+str("1034412"))

In [None]:
# Check which Python type the response's status_code has.
type(r.status_code)

In [None]:
# Reload the raw training data into f.
# NOTE(review): the name f is also used for a set of title words elsewhere in
# this notebook.
f = pd.read_csv("train.csv")

In [None]:
# Rows of the raw data whose lifespan is missing.
f[f.lifespan.isna()]

In [8]:
import os
import smtplib, ssl

# Send a short "run completed" e-mail through Gmail's SSL SMTP endpoint.
port = 465  # For SSL
# The account password is read from the environment, never stored in the
# notebook. The previously committed password must be treated as leaked and
# changed.
password = os.environ.get("NOTIFY_EMAIL_PASSWORD")
user = "itsjustanemailfor@gmail.com"
sender_email = user  # Enter your address
receiver_email = "sdphaye@gmail.com"  # Enter receiver address
message = """\
Subject: Hi there

Run completed"""

# Create a secure SSL context
context = ssl.create_default_context()

if password:
    with smtplib.SMTP_SSL("smtp.gmail.com", port, context=context) as server:
        server.login(user, password)
        server.sendmail(sender_email, receiver_email, message)
else:
    # Without a configured password the notification is skipped rather than
    # failing the run.
    print("NOTIFY_EMAIL_PASSWORD is not set; skipping the completion e-mail.")