In [None]:
import numpy as np
import pandas as pd

# Data preprocessing. 
First, let's combine all the tables into a single one and process it.

In [None]:
# Paths of the per-brand CSV files; cars_data keeps the tables in this order.
csv_paths = (
    "../input/used-car-dataset-ford-and-mercedes/audi.csv",
    "../input/used-car-dataset-ford-and-mercedes/bmw.csv",
    "../input/used-car-dataset-ford-and-mercedes/cclass.csv",
    "../input/used-car-dataset-ford-and-mercedes/focus.csv",
    "../input/used-car-dataset-ford-and-mercedes/ford.csv",
    "../input/used-car-dataset-ford-and-mercedes/hyundi.csv",
    "../input/used-car-dataset-ford-and-mercedes/merc.csv",
    "../input/used-car-dataset-ford-and-mercedes/skoda.csv",
    "../input/used-car-dataset-ford-and-mercedes/toyota.csv",
    "../input/used-car-dataset-ford-and-mercedes/unclean cclass.csv",
    "../input/used-car-dataset-ford-and-mercedes/unclean focus.csv",
    "../input/used-car-dataset-ford-and-mercedes/vauxhall.csv",
    "../input/used-car-dataset-ford-and-mercedes/vw.csv",
)

# One DataFrame per file, read with pandas' default CSV settings.
cars_data = [pd.read_csv(csv_path) for csv_path in csv_paths]

The data has been read. **Now let's modify the loaded tables so that they can be combined into a single table.** First, **we define the helper functions we need**.

In [None]:
def select_common_columns(lst=None, last=None):
    """Return the set of column names present in every table of ``lst``.

    Parameters
    ----------
    lst : sequence of pandas.DataFrame, optional
        Tables whose column names are intersected. ``None`` or an empty
        sequence yields an empty set (unless ``last`` is given).
    last : pandas.DataFrame or iterable of column names, optional
        Extra table (or plain collection of names) that is intersected with
        the others. It no longer defaults to the last table of the global
        ``cars_data``: that default was evaluated once at definition time and
        silently tied every result to that particular table.

    Returns
    -------
    set
        Column labels shared by all inputs.
    """
    column_sets = [set(table.columns) for table in (lst if lst is not None else [])]
    if last is not None:
        # Iterating a DataFrame yields its column labels, so set() accepts
        # both a table and a plain collection of names here.
        column_sets.append(set(last))
    if not column_sets:
        return set()

    common = column_sets[0]
    for columns in column_sets[1:]:
        common &= columns
    return common

In [None]:
def modifyTables(lst=None, cols=()):
    """Restrict every table in ``lst`` to the columns ``cols``, in place.

    Each element of ``lst`` is replaced by a copy that holds only the
    requested columns. The previous recursive version recursed on
    ``lst[1:]``, which is a new list, so only the first table of the
    caller's list was ever replaced.

    Parameters
    ----------
    lst : list of pandas.DataFrame, optional
        List that is updated in place. ``None`` is treated as "nothing to do".
    cols : iterable of column labels
        Columns to keep. Any iterable is accepted (including a set); it is
        converted to a list once, so all tables get the same column order.

    Returns
    -------
    None
    """
    if lst is None:
        return
    # pandas indexers must be list-like; a set is rejected by recent versions.
    wanted = list(cols)
    for position, table in enumerate(lst):
        # .copy() makes each trimmed table independent of the original frame,
        # so later column assignments do not trigger SettingWithCopyWarning.
        lst[position] = table[wanted].copy()

In [None]:
def writeMark(lst=[pd.DataFrame()], car_marks=[]):
    """Add a constant ``"mark"`` column to each table.

    Tables and labels are paired by position: the i-th table receives the
    i-th entry of ``car_marks`` in every row. Pairing stops as soon as either
    sequence runs out, so surplus tables or labels are left untouched.
    The tables are modified in place and nothing is returned.
    """
    for table, mark in zip(lst, car_marks):
        table["mark"] = mark

In [None]:
def calcGeneralLength(lst=[pd.DataFrame()]):
    """Return the total number of rows over all tables in ``lst``.

    An empty list gives 0.
    """
    return sum(len(table) for table in lst)

In [None]:
def resultColumn(lst=[pd.DataFrame()], col=""):
    """Concatenate the values of column ``col`` from every table in ``lst``.

    The values are returned as one flat Python list, table after table in the
    order of ``lst``. An empty ``lst`` gives an empty list.
    """
    values = []
    for table in lst:
        values.extend(table[col].to_list())
    return values

**Now we modify the data tables.** First, let's find the columns they have in common.

In [None]:
# Column names returned by select_common_columns for the loaded tables;
# printed so the selection can be checked by eye.
commonColumns = select_common_columns(cars_data)
print(commonColumns)

In [None]:
# modifyTables returns nothing: it works by assigning into the list it is given.
modifyTables(cars_data, commonColumns)

Now we add a `mark` column to each table, holding the make (brand) label of the cars in that table.

In [None]:
# One label per loaded table, listed in the same order as the files read into
# cars_data; writeMark pairs labels with tables by position.
# NOTE(review): "hyundy" is spelled differently from the file name hyundi.csv.
marks = ["audi", "bmw", "cclass", "focus", "ford", "hyundy", "merc", "skoda", "toyota", "unclean cclass", "unclean focus", 
         "vauxhall", "vw"]

In [None]:
# Adds a "mark" column to the tables in place, pairing tables and labels by position.
writeMark(cars_data, marks)

In [None]:
# Preview the first rows of the first table.
cars_data[0].head()

Now we build the combined data set.

In [None]:
# Empty frame (no values yet) with the columns of the first table and one row
# for every record across all tables.
data = pd.DataFrame(columns=cars_data[0].columns, index=range(calcGeneralLength(cars_data)))
data.head()

In [None]:
# Stack all tables into a single frame with pandas' own concatenation instead
# of rebuilding every column from Python lists. Only the columns of the first
# table are kept, in that order, and the rows get a fresh 0..n-1 index.
merged_columns = list(cars_data[0].columns)
data = pd.concat([table[merged_columns] for table in cars_data], ignore_index=True)
data.head()

In [None]:
# Remove every row that has at least one missing value and report how many
# rows remain. The preview is the last expression of the cell so that the
# notebook actually renders it (a bare expression in the middle of a cell is
# evaluated and thrown away).
data = data.dropna()
print(len(data))
data.head()

# Building a prediction model. 
The combined data set is ready. Now let's do the final data manipulations and try to build a prediction model.

In [None]:
# Distinct values of the two categorical columns. A value's position in its
# list is what later replaces it as an integer code.
models        = list( data["model"].unique() )        # distinct car model names
transmissions = list( data["transmission"].unique() ) # distinct transmission types

In [None]:
# Replace every categorical value by its position in the corresponding list.
# The write goes through data.loc[mask, column] so that it is applied to
# `data` itself; the chained form data[column][mask] = ... may write to a
# temporary copy (SettingWithCopyWarning) and does nothing under pandas
# copy-on-write. enumerate() supplies the code directly instead of searching
# the list with .index() on every iteration.
for code, model in enumerate(models):
    data.loc[data["model"] == model, "model"] = code
for code, transmission in enumerate(transmissions):
    data.loc[data["transmission"] == transmission, "transmission"] = code
for code, mark in enumerate(marks):
    data.loc[data["mark"] == mark, "mark"] = code
data.head()

In [None]:
# Show the dtype of each column of the combined frame.
data.dtypes

In [None]:
# Drop every row in which price, model, mileage, transmission or mark is not
# a number (for example a leftover string).
#
# The check uses isinstance and also accepts NumPy scalars: an exact
# type(...) in [int, float] test rejects numpy.int64 / numpy.float64, which
# would discard every row as soon as one of these columns has a numeric dtype.
# It is evaluated column-wise instead of building a row Series with
# data.loc[i] five times per row.
checked_columns = ["price", "model", "mileage", "transmission", "mark"]


def _is_number(value):
    """Return True for int/float values (Python or NumPy), but not for booleans."""
    return isinstance(value, (int, float, np.integer, np.floating)) and not isinstance(value, bool)


is_numeric_row = data[checked_columns].apply(lambda column: column.map(_is_number)).all(axis=1)
indexes = data.index[~is_numeric_row].tolist()  # index labels of the rejected rows
data = data.drop(indexes)
print(len(data))

In [None]:
# astype returns a new frame and leaves the original untouched, so the result
# has to be assigned back for the conversion to take effect. The dtypes are
# shown as confirmation instead of dumping the whole frame.
data = data.astype("float")
data.dtypes

OK, all features now have a numerical representation. **Let's build a prediction model.**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
# Target: the price column. Features: every other column of the frame.
y = data["price"]
X = data.drop(columns="price")

First, we build a linear regression model and test it.

In [None]:
# Linear regression fitted on all of X and y; no rows are held out.
predModelDeg1 = LinearRegression().fit(X, y)

In [None]:
# Mean squared error of the linear model, measured on the data it was fitted on.
print(f"MSE: {metrics.mean_squared_error(y, predModelDeg1.predict(X))}")

In [None]:
# Coefficient of determination of the linear model on the data it was fitted on.
print(f"R^2: {metrics.r2_score(y, predModelDeg1.predict(X))}")

As we can see, the linear model is not very good. So let's build a quadratic regression model and test it.

In [None]:
# Linear regression on the degree-2 polynomial expansion of X, fitted on all rows.
predModelDeg2 = LinearRegression().fit(PolynomialFeatures(degree=2).fit_transform(X), y)

In [None]:
# Mean squared error of the quadratic model; X is expanded to degree-2
# features again before predicting.
print(f"MSE: {metrics.mean_squared_error(y, predModelDeg2.predict(PolynomialFeatures(degree=2).fit_transform(X)))}")

In [None]:
# Coefficient of determination of the quadratic model on the degree-2 expansion of X.
print(f"R^2: {metrics.r2_score(y, predModelDeg2.predict(PolynomialFeatures(degree=2).fit_transform(X)))}")

OK, we see an improvement in the results. Will we see a further improvement if we use a cubic regression model?

In [None]:
# Linear regression on the degree-3 polynomial expansion of X, fitted on all rows.
predModelDeg3 = LinearRegression().fit(PolynomialFeatures(degree=3).fit_transform(X), y)

In [None]:
# Mean squared error of the cubic model; X is expanded to degree-3 features
# again before predicting.
print(f"MSE: {metrics.mean_squared_error(y, predModelDeg3.predict(PolynomialFeatures(degree=3).fit_transform(X)))}")

In [None]:
# Coefficient of determination of the cubic model on the degree-3 expansion of X.
print(f"R^2: {metrics.r2_score(y, predModelDeg3.predict(PolynomialFeatures(degree=3).fit_transform(X)))}")

We see an improvement again. But is it enough? I think not. So let's experiment with neural networks.

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
# Multi-layer perceptron with two hidden layers of 25 tanh units, fitted on
# all of X and y. random_state fixes the random weight initialisation (and
# the shuffling done by the stochastic solvers), so the metrics printed for
# this model are the same on every run of the notebook.
predNNModel = MLPRegressor((25, 25), activation="tanh", max_iter=500, random_state=0).fit(X, y)

In [None]:
# Mean squared error of the neural network on X.
print(f"MSE: {metrics.mean_squared_error(y, predNNModel.predict(X))}")

In [None]:
# Coefficient of determination of the neural network on X.
print(f"R^2: {metrics.r2_score(y, predNNModel.predict(X))}")

In [None]:
# Mean squared logarithmic error of the neural network on X.
# NOTE(review): this metric is only defined for non-negative values, so the
# call fails if the network predicts a negative price - confirm this cannot
# happen for the fitted model.
print(f"MSLE: {metrics.mean_squared_log_error(y, predNNModel.predict(X))}")

As we can see, it is possible that a prediction model based on a neural network could be good enough to use with our data. So let's build a larger neural network to see whether that is true.

In [None]:
# Larger network: three hidden layers of 50 tanh units, fitted on the
# degree-2 polynomial expansion of X. This replaces the smaller predNNModel.
# random_state fixes the random weight initialisation (and the shuffling done
# by the stochastic solvers), so the metrics printed for this model are the
# same on every run of the notebook.
predNNModel = MLPRegressor((50, 50, 50), activation="tanh", max_iter=1500, random_state=0).fit(PolynomialFeatures(degree=2).fit_transform(X), y)

In [None]:
# Mean squared error of the neural network on the degree-2 expansion of X.
print(f"MSE: {metrics.mean_squared_error(y, predNNModel.predict(PolynomialFeatures(degree=2).fit_transform(X)))}")

In [None]:
# Coefficient of determination of the neural network on the degree-2 expansion of X.
print(f"R^2: {metrics.r2_score(y, predNNModel.predict(PolynomialFeatures(degree=2).fit_transform(X)))}")

In [None]:
# Mean squared logarithmic error of the neural network on the degree-2
# expansion of X.
# NOTE(review): this metric is only defined for non-negative values, so the
# call fails if the network predicts a negative price - confirm this cannot
# happen for the fitted model.
print(f"MSLE: {metrics.mean_squared_log_error(y, predNNModel.predict(PolynomialFeatures(degree=2).fit_transform(X)))}")

As we can see, a neural network can be used as a prediction model for the price, and such a model need not be much worse than a regression model.