# Data Collection

In [1]:
#modules for data collection and pre-processing
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv

#modules for visualization and analysis
import numpy as np
# `sklearn.cross_validation` was removed in scikit-learn 0.20; prefer its
# replacement and only fall back to the legacy module on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
import seaborn as sns

sns.set_style("white")

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline



Scrape the public listing pages and write the listing URLs to a CSV file

In [2]:
def get_ads_urls(first_page=1, last_page=249, output_path="./ads_urls.csv", timeout=30):
    """Collect the ad links from the car listing pages and save them as a CSV.

    Each listing page is downloaded, every ``<div class="item-img">`` is
    inspected, and the ``href`` of its first ``<a>`` is recorded. The result is
    written to ``output_path`` as a one-column CSV with the header ``url``.

    Parameters
    ----------
    first_page, last_page : int
        Inclusive range of listing page numbers to crawl (defaults reproduce
        the original pages 1..249).
    output_path : str
        Destination CSV file.
    timeout : float
        Seconds to wait for each HTTP response; without it a stalled
        connection would block the crawl forever.

    Raises
    ------
    requests.RequestException
        If a page cannot be downloaded (including on timeout).
    """
    basic_url = "https://www.avito.ma/fr/maroc/voitures-à_vendre?mpr=500000000&o="
    urls_list = []
    for page in range(first_page, last_page + 1):
        response = requests.get(basic_url + str(page), timeout=timeout)
        soup = BeautifulSoup(response.text, 'lxml')
        for div in soup.find_all('div', {'class': 'item-img'}):
            anchor = div.find('a')
            # Tiles without a link (or with an anchor lacking href) are skipped
            # so the CSV never contains empty URL rows.
            href = anchor.get('href') if anchor is not None else None
            if href:
                urls_list.append(href)

    df = pd.DataFrame(data={"url": urls_list})
    df.to_csv(output_path, sep=',', index=False)

In [3]:
# Crawl the listing pages and write the collected ad URLs to ./ads_urls.csv.
get_ads_urls()

In [4]:
def scrap_ad_data(ad_url, timeout=30):
    """Download one ad page and return the text of its attribute headings.

    Parameters
    ----------
    ad_url : str
        Address of the ad page.
    timeout : float
        Seconds to wait for the HTTP response, so that one stalled ad cannot
        hang the whole scraping run.

    Returns
    -------
    list of str
        One entry per matched ``<h2>``, in document order, with newline
        characters removed.

    Raises
    ------
    requests.RequestException
        If the page cannot be downloaded (including on timeout).
    """
    response = requests.get(ad_url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    # A list of class names is passed as the class filter.
    # NOTE(review): per the Beautiful Soup docs a list matches ANY of the
    # values, not all of them — confirm this is the intended selection.
    target_component = soup.find_all("h2", {"class": ["font-normal", "fs12", "no-margin", "ln22"]})
    # get_text() concatenates the text nodes of the heading; it replaces the
    # deprecated findAll(text=True) idiom.
    return [heading.get_text().replace('\n', '') for heading in target_component]
    

In [5]:
def write_data_to_csv(data, path="./output.csv"):
    """Write an iterable of rows to a CSV file, replacing any existing file.

    Parameters
    ----------
    data : iterable of iterables
        Rows to write; each inner iterable becomes one CSV line.
    path : str
        Destination file (defaults to the original ``./output.csv``).
    """
    # newline='' is required by the csv module: without it the writer's own
    # "\r\n" terminators are translated again on Windows, producing blank lines.
    # NOTE(review): the text encoding is the platform default, while the
    # notebook later reads this file as ISO-8859-1 — confirm they agree.
    with open(path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(data)

In [None]:
# Scrape every listing URL collected earlier and persist the raw rows.
urls_data = pd.read_csv("./ads_urls.csv")

# Empty CSV cells load as NaN and cannot be requested, so they are skipped.
# Iterating the column directly avoids the per-row overhead of iterrows().
final_result = [scrap_ad_data(ad_url) for ad_url in urls_data['url'].dropna()]

print('Data scrap end')

write_data_to_csv(final_result)

# Data Preprocessing and cleaning

In [None]:
# Column labels for the scraped rows; the file itself is read without a header line.
colnames = [
    'price', 'year_model', 'mileage', 'fuel_type', 'mark',
    'model', 'fiscal_power', 'sector', 'type', 'city',
]

df = pd.read_csv(
    "./output.csv",
    names=colnames,
    header=None,
    sep=",",
    encoding="ISO-8859-1",
)

df.head()

In [None]:
# The filters below assume every scraped cell looks like "<label>:<value>",
# where the labels are the French captions matched in each step. Rows whose
# cell lacks the expected label, or holds the "-" placeholder, are dropped.
#
# Labels are removed as literal substrings. str.lstrip/str.rstrip take a *set
# of characters*, not a prefix/suffix, so they only worked while no value
# happened to start or end with one of the label's letters.
# Each filtered frame is copied so the column assignments that follow never
# write into a view of the previous frame (SettingWithCopyWarning).

# --- price: "<digits with spaces> DH" -> number
df = df[df.price.str.contains("DH", na=False)].copy()
df['price'] = (df.price.str.replace('DH', '', regex=False)
                 .str.replace(' ', '', regex=False))
df['price'] = pd.to_numeric(df.price, errors='coerce', downcast='integer')

# --- year_model: "Année-Modèle:<year>[ ou plus ancien]" -> number
df = df[df.year_model.str.contains("Année-Modèle", na=False)].copy()
df['year_model'] = (df.year_model.str.replace('Année-Modèle:', '', regex=False)
                      .str.replace('ou plus ancien', '', regex=False)
                      .str.strip())
df = df[~df.year_model.isin(['-', ''])].copy()
df['year_model'] = pd.to_numeric(df.year_model, errors='coerce', downcast='integer')

# --- mileage: "Kilométrage:<low> - <high>" or "Kilométrage:Plus de 500 000"
df = df[df.mileage.str.contains("Kilométrage", na=False)].copy()
df['mileage'] = (df.mileage.str.replace('Kilométrage:', '', regex=False)
                   .str.replace('Plus de', '', regex=False)
                   .str.strip())
df = df[df.mileage != '-'].copy()

# Replace each range by its midpoint. partition() always yields three columns
# (text before "-", the separator, text after), so this also works when no row
# contains a range; an open-ended value has an empty upper bound, which is
# capped at 500000 exactly as before.
bounds = df.mileage.str.partition('-')
lower = pd.to_numeric(bounds[0].str.replace(' ', '', regex=False))
upper = pd.to_numeric(bounds[2].str.replace(' ', '', regex=False).replace('', '500000'))
df['mileage'] = (lower + upper) / 2

# --- fuel type, mark, model: drop the caption, keep the value text
df['fuel_type'] = (df.fuel_type.str.replace('Type de carburant:', '', regex=False)
                     .str.lstrip())

df['mark'] = df['mark'].str.replace('Marque:', '', regex=False)
df = df[df.mark != '-'].copy()
df['model'] = df['model'].str.replace('Modèle:', '', regex=False)

# --- fiscal power: "Puissance fiscale:[Plus de ]<n> CV" -> number
df['fiscal_power'] = (df.fiscal_power.str.replace('Puissance fiscale:', '', regex=False)
                        .str.replace('Plus de', '', regex=False)
                        .str.replace('CV', '', regex=False)
                        .str.strip()
                        .str.replace('-', '0', regex=False))
df['fiscal_power'] = pd.to_numeric(df.fiscal_power, errors='coerce', downcast='integer')
# Missing values were encoded as 0 above; replace them by the column mean,
# computed once instead of once per row.
# NOTE(review): as in the original, this mean still includes the placeholder
# zeros, which biases it downwards — confirm whether that is intended.
mean_fiscal_power = df.fiscal_power.mean()
df['fiscal_power'] = df.fiscal_power.where(df.fiscal_power != 0, mean_fiscal_power)

df = df[df.fuel_type != '-']

df = df.drop(columns=['sector', 'type'])

In [None]:
# Preview the first cleaned rows. The call parentheses were missing, so the
# cell displayed the bound method's repr instead of the first five rows.
df.head()

# Exploratory analysis and visualization

price distribution by year_model

In [None]:
# Price against model year, one point per ad.
fig, ax = plt.subplots(figsize=(15, 8))
ax.scatter(df.price, df.year_model)
ax.set_xlabel("price (DH)", fontsize=14)
ax.set_ylabel("year of model", fontsize=14)
ax.set_title("Scatter plot of price and year of model", fontsize=18)
plt.show()

price distribution by mark

In [None]:
# One horizontal strip of prices per mark.
f, ax = plt.subplots(figsize=(15, 12))
sns.stripplot(data=df, x='price', y='mark', jitter=.1, ax=ax)
# plt.show was referenced without being called, so the cell output was the
# function's repr rather than a finished figure.
plt.show()

price distribution by fiscal power

In [None]:
# Price against fiscal power, one orange cross per ad.
fig, ax = plt.subplots(figsize=(15, 8))
ax.scatter(df.price, df.fiscal_power, c='orange', marker='x')
ax.set_xlabel("price (DH)", fontsize=14)
ax.set_ylabel("fiscal power(cv)", fontsize=14)
ax.set_title("Scatter plot of price and fiscal power", fontsize=18)
plt.show()

top 20 mark distribution

In [None]:
# Horizontal bars for the 20 marks with the most ads.
fig, ax = plt.subplots(figsize=(17, 8))
top_marks = df.mark.value_counts().nlargest(20)
top_marks.plot(kind='barh', ax=ax)
ax.set_xlabel('Marks Frequency')
ax.set_title("Frequency of Top 20 Marks distribution", fontsize=18)
plt.show()

price distribution by fuel type

In [None]:
# Jittered price points, one column per fuel type.
f, ax = plt.subplots(figsize=(15, 10))
sns.stripplot(x='fuel_type', y='price', data=df, jitter=.5, ax=ax)
plt.show()

In [None]:
# Price distribution per fuel type drawn as violins.
f, ax = plt.subplots(figsize=(15, 10))
sns.violinplot(x='fuel_type', y='price', data=df, ax=ax)
plt.show()

Price distribution by year model and fuel type

In [None]:
# Plot colour assigned to each fuel type label.
color_dict = dict(
    Diesel='blue',
    Essence='orange',
    Electrique='yellow',
    LPG='magenta',
)

In [None]:
from matplotlib.ticker import FuncFormatter

fig, ax = plt.subplots(figsize=(15, 10))

# Least-squares straight line of price against model year.
years = np.unique(df.year_model)
trend = np.poly1d(np.polyfit(df.year_model, df.price, 1))
ax.plot(years, trend(years), c='red', linewidth=1)

# One '+' per ad, coloured by fuel type. Fuel types without an entry in
# color_dict fall back to grey instead of aborting the plot with a KeyError.
ax.scatter(df.year_model, df.price,
           c=[color_dict.get(fuel, 'gray') for fuel in df.fuel_type], marker='+')

# Build the legend once, after all proxy patches exist (it used to be rebuilt
# on every loop iteration with a partial handle list).
fuel_type = df.fuel_type.unique()
recs = [mpatches.Rectangle((2, 2), 1, 1, fc=color_dict.get(fuel, 'gray'))
        for fuel in fuel_type]
ax.legend(recs, fuel_type, loc=1, fontsize=16)

ax.set_title('Price of cars by year model grouped by fuel type', fontsize=20)
ax.set_ylabel('Price', fontsize=16)
ax.set_xlabel('year model', fontsize=16)

# Show ticks as plain integers. A formatter is used rather than fixed label
# lists: `ax.set_yticklabel` does not exist (AttributeError), and setting tick
# labels without fixing tick positions can detach the labels from their ticks.
whole_number = FuncFormatter(lambda value, _pos: '{}'.format(int(value)))
ax.xaxis.set_major_formatter(whole_number)
ax.yaxis.set_major_formatter(whole_number)

plt.show()

correlation matrix

In [None]:
cmap = sns.diverging_palette(220, 10, as_cmap=True)
f, ax = plt.subplots(figsize=(15, 10))
# Correlate the numeric columns only: pandas >= 2.0 raises on text columns
# instead of silently skipping them.
corr = df.select_dtypes(include=[np.number]).corr()
# Hide the upper triangle (including the diagonal), which only mirrors the
# lower one. Fixes `np.zero_like` (no such function) and the removed `np.bool`
# alias.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, square=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title('Correlation matrix', fontsize=20)
plt.show()

ads distribution by city

In [None]:
# Vertical bars for the 20 cities with the most ads.
fig, ax = plt.subplots(figsize=(17, 8))
top_cities = df.city.value_counts().nlargest(20)
top_cities.plot(kind='bar', ax=ax)
ax.set_xlabel('City Frequency')
ax.set_title("Frequency of top 20 city distribution", fontsize=18)
plt.show()

# Data Modelling

knn regression

In [None]:
# Model only the ads priced below 400000; more expensive ads are left out of `data`.
data=df[df.price < 400000]

In [None]:
# NOTE(review): this previews the unfiltered `df`, not the price-filtered `data` — confirm intent.
df.head()

handle categorical features

In [None]:
# Target: the price. Features: three numeric columns plus two categorical
# columns, which get_dummies expands into one indicator column per category.
feature_columns = ['year_model', 'mileage', 'fiscal_power', 'fuel_type', 'mark']
Y = data.price
X = pd.get_dummies(data=data[feature_columns])

In [None]:
# Preview the first rows of the feature matrix X.
X.head()

Data splitting - train and test sets

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=.20, random_state =42)

In [None]:
from sklearn import neighbors
from sklearn.metrics import mean_squared_error

# Fit a 6-nearest-neighbours regressor and evaluate it on the held-out rows.
knn = neighbors.KNeighborsRegressor(n_neighbors=6)
knn.fit(X_train, Y_train)

predicted = knn.predict(X_test)
residual = Y_test - predicted

# Top panel: distribution of the residuals. Bottom panel: residuals against
# the predictions, with a reference line at zero.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(30, 30))

sns.distplot(residual, color='teal', ax=ax1)
ax1.tick_params(axis='both', which='major', labelsize=20)
ax1.set_title('Residual counts', fontsize=35)
ax1.set_xlabel('Residual', fontsize=25)
ax1.set_ylabel('Count', fontsize=25)

ax2.scatter(predicted, residual, color='teal')
ax2.tick_params(axis='both', which='major', labelsize=20)
ax2.set_xlabel('Predicted', fontsize=25)
ax2.set_ylabel('Residual', fontsize=25)
ax2.axhline(y=0)
ax2.set_title('Residual vs. Predicted', fontsize=35)

plt.show()

# Root mean squared error on the test rows.
rmse = np.sqrt(mean_squared_error(Y_test, predicted))
print('RMSE:')
print(rmse)

In [None]:
from sklearn.metrics import r2_score

# R² of the current predictions on the test targets.
variance_score = r2_score(Y_test, predicted)
print('Variance score: %.2f' % variance_score)

In [None]:
# Test RMSE for every neighbourhood size from 2 to 15.
num = list(range(2, 16))
rmse_l = []
for n in num:
    knn = neighbors.KNeighborsRegressor(n_neighbors=n)
    knn.fit(X_train, Y_train)
    predicted = knn.predict(X_test)
    squared_error = mean_squared_error(Y_test, predicted)
    rmse_l.append(np.sqrt(squared_error))

In [None]:
# One bar per neighbourhood size, bar height = test RMSE.
df_plt = pd.DataFrame({'rmse': rmse_l, 'n_neighbors': num})
# NOTE: despite its name, `ax` holds the Figure returned by plt.figure.
ax = plt.figure(figsize=(15, 7))
sns.barplot(x='n_neighbors', y='rmse', data=df_plt)
plt.show()

Decision tree regression

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# max_features=None considers every feature at each split, which is what the
# former 'auto' setting meant for regression trees ('auto' was removed in
# scikit-learn 1.3). The seed makes tie-breaking between equal splits repeatable.
dtr = DecisionTreeRegressor(max_features=None, random_state=42)
dtr.fit(X_train, Y_train)
predicted = dtr.predict(X_test)
residual = Y_test - predicted

# Top panel: distribution of the residuals. Bottom panel: residuals against
# the predictions, with a reference line at zero.
fig = plt.figure(figsize=(30, 30))
ax1 = plt.subplot(211)
sns.distplot(residual, color='orange')
plt.tick_params(axis='both', which='major', labelsize=20)   # was plt.ticktick_params
plt.title('Residual counts', fontsize=35)
plt.xlabel('Residual', fontsize=25)
plt.ylabel('Count', fontsize=25)                            # was `plt,ylabel(...)`

ax2 = plt.subplot(212)
plt.scatter(predicted, residual, color='orange')
plt.tick_params(axis='both', which='major', labelsize=20)
plt.xlabel('Predicted', fontsize=25)
plt.ylabel('Residual', fontsize=25)                         # was fintsize=
plt.axhline(y=0)
plt.title('Residual vs Predicted', fontsize=35)

plt.show()

# Root mean squared error on the test rows (the import was misspelled
# `mean_squarred_error`, which raised ImportError).
rmse = np.sqrt(mean_squared_error(Y_test, predicted))
print("RMSE: ")
print(rmse)

In [None]:
# R² (r2_score) of the current `predicted` values against the test targets.
print('Variance score: %.2f'% r2_score(Y_test,predicted))

Linear Regression

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# Ordinary least-squares fit on the training rows, evaluated on the test rows.
regr = linear_model.LinearRegression()
regr.fit(X_train, Y_train)

predicted = regr.predict(X_test)
residual = Y_test - predicted

# Top panel: distribution of the residuals. Bottom panel: residuals against
# the predictions, with a reference line at zero.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(30, 30))

sns.distplot(residual, color='teal', ax=ax1)
ax1.tick_params(axis='both', which='major', labelsize=20)
ax1.set_title('Residual counts', fontsize=35)
ax1.set_xlabel('Residual', fontsize=25)
ax1.set_ylabel('Count', fontsize=25)

ax2.scatter(predicted, residual, color='teal')
ax2.tick_params(axis='both', which='major', labelsize=20)
ax2.set_xlabel('Predicted', fontsize=25)
ax2.set_ylabel('Residual', fontsize=25)
ax2.axhline(y=0)
ax2.set_title('Residual vs. Predicted', fontsize=35)

plt.show()

# Root mean squared error on the test rows.
rmse = np.sqrt(mean_squared_error(Y_test, predicted))
print('RMSE:')
print(rmse)

In [None]:
# R² (r2_score) of the current `predicted` values against the test targets.
print('Variance score: %.2f' % r2_score(Y_test, predicted))

Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

r_sq = []
deep = []
mean_scores = []

# The loss is left at its default, squared error. That loss was spelled 'ls'
# before scikit-learn 1.0 and 'squared_error' afterwards ('ls' was removed in
# 1.2), so omitting the argument is the only spelling valid on every version.
for n in range(3, 11):
    gbr = GradientBoostingRegressor(max_depth=n)
    gbr.fit(X, Y)
    deep.append(n)
    # R² on the same rows used for fitting (optimistic by construction) ...
    r_sq.append(gbr.score(X, Y))
    # ... versus the mean score over 12 cross-validation folds.
    mean_scores.append(cross_val_score(gbr, X, Y, cv=12).mean())

In [None]:
# Collect the sweep results, then draw one bar chart per metric against tree depth.
plt_gbr = pd.DataFrame({
    'mean_scores': mean_scores,
    'depth': deep,
    'R²': r_sq,
})

for metric in ('R²', 'mean_scores'):
    f, ax = plt.subplots(figsize=(15, 5))
    sns.barplot(x='depth', y=metric, data=plt_gbr)
    plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

# Depth-6 gradient boosting with the default squared-error loss. The loss is
# not passed explicitly because its name changed from 'ls' to 'squared_error'
# in scikit-learn 1.0 and 'ls' was removed in 1.2.
gbr = GradientBoostingRegressor(max_depth=6)
gbr.fit(X_train, Y_train)
predicted = gbr.predict(X_test)
residual = Y_test - predicted

# Top panel: distribution of the residuals. Bottom panel: residuals against
# the predictions, with a reference line at zero.
fig = plt.figure(figsize=(30, 30))
ax1 = plt.subplot(211)
sns.distplot(residual, color='teal')
plt.tick_params(axis='both', which='major', labelsize=20)
plt.title('Residual counts', fontsize=35)
plt.xlabel('Residual', fontsize=25)
plt.ylabel('Count', fontsize=25)

ax2 = plt.subplot(212)
plt.scatter(predicted, residual, color='teal')
plt.tick_params(axis='both', which='major', labelsize=20)
plt.xlabel('Predicted', fontsize=25)
plt.ylabel('Residual', fontsize=25)
plt.axhline(y=0)
plt.title('Residual vs. Predicted', fontsize=35)

plt.show()

# Test RMSE of the fitted model, plus 12-fold cross-validation scores computed
# on the full data set (cross_val_score refits clones; `gbr` itself is untouched).
rmse = np.sqrt(mean_squared_error(Y_test, predicted))
scores = cross_val_score(gbr, X, Y, cv=12)

print('\nCross Validation Scores:')
print(scores)
print('\nMean Score:')
print(scores.mean())
print('\nRMSE:')
print(rmse)

In [None]:
# R² (r2_score) of the current `predicted` values against the test targets.
print('Variance score: %.2f' % r2_score(Y_test, predicted))

Prediction VS Real price histogram

In [None]:
# Column vectors of the real and the predicted prices. Y_test is a pandas
# Series, and Series.reshape no longer exists, so convert to a NumPy array first.
A = np.asarray(Y_test).reshape(-1, 1)
B = np.asarray(predicted).reshape(-1, 1)

In [None]:
# Overlay the last 100 real and predicted prices; the x axis carries no ticks.
plt.rcParams['figure.figsize'] = (16, 5)
fig, ax = plt.subplots()
ax.plot(A[-100:], label="Real")
ax.plot(B[-100:], label="Predicted")
ax.legend()
ax.set_title("Price: real vs predicted")
ax.set_ylabel("price [DH]")
ax.set_xticks(())
plt.show()

predict new

In [None]:
# Example car to price: three numeric features plus the fuel type and mark labels.
user_input = {'year_model':2006, 'mileage':82499.5, 'fiscal_power':6, 'fuel_type':'Diesel', 'mark':'Dacia'}
def input_to_one_hot(data, columns=None):
    """Encode one car description as a feature vector laid out like ``columns``.

    Parameters
    ----------
    data : dict
        Must provide ``'year_model'``, ``'mileage'``, ``'fiscal_power'``
        (numbers) and ``'mark'``, ``'fuel_type'`` (category labels).
    columns : sequence of str, optional
        Feature column names in model order. Defaults to the columns of the
        notebook's training matrix ``X``.

    Returns
    -------
    numpy.ndarray
        1-D float array with one slot per column: the numeric values in their
        own columns, 1 in the ``mark_<label>`` and ``fuel_type_<label>``
        columns, and 0 everywhere else.

    Raises
    ------
    ValueError
        If a required column (for example an unseen mark) is not in ``columns``.
    """
    if columns is None:
        columns = X.columns
    columns = list(columns)

    # Size the vector from the actual feature list instead of a hard-coded 61,
    # which silently broke as soon as the scraped categories changed.
    enc_input = np.zeros(len(columns))

    def _position(column_name):
        """Return the index of ``column_name``, failing with a readable message."""
        try:
            return columns.index(column_name)
        except ValueError:
            raise ValueError("unknown feature column: %r" % column_name)

    # Numeric features are copied as they are, located by name rather than by
    # assuming they occupy the first three positions.
    for feature in ('year_model', 'mileage', 'fiscal_power'):
        enc_input[_position(feature)] = data[feature]

    # Categorical features switch on the matching "<feature>_<label>" column.
    for feature in ('mark', 'fuel_type'):
        enc_input[_position(feature + '_' + data[feature])] = 1

    return enc_input

In [None]:
# Show the encoded feature vector for the example input.
print(input_to_one_hot(user_input))

In [None]:
# Keep the encoded example in `a`; the prediction cells below reuse it.
a = input_to_one_hot(user_input)

In [None]:
# The single encoded row is wrapped in a list so predict() receives a batch of one sample.
price_pred = gbr.predict([a])

In [None]:
# Predicted price for the single example row.
price_pred[0]

save model

In [None]:
# `sklearn.externals.joblib` was removed in scikit-learn 0.23; use the
# standalone joblib package and only fall back on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted gradient-boosting model to model.pkl.
joblib.dump(gbr, 'model.pkl')

In [None]:
# Reload the persisted model from model.pkl, replacing the in-memory `gbr`.
# joblib files are pickle-based: only load files from a source you trust.
gbr = joblib.load('model.pkl')

In [None]:
# NOTE(review): the brand in this message is hard-coded; it matches user_input['mark'] only while that is 'Dacia'.
print("the best price for this Dacia is",gbr.predict([a])[0])

Build REST API

In [None]:
import json
import requests

# POST one car description, serialised as JSON, to the API on localhost:8080
# and print the decoded JSON reply.
url = "http://127.0.0.1:8080/api"
payload = {
    'year_model': 2014,
    'mileage': 12499.5,
    'fiscal_power': 7,
    'fuel_type': 'Diesel',
    'mark': 'Mercedes-Benz',
}
data = json.dumps(payload)

r = requests.post(url, data)

print(r.json())



In [None]:
# First element of the 'results' field in the JSON reply.
r.json()['results'][0]