In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing the dataset

In [None]:
# Load the CarDekho vehicle CSV from the Kaggle input directory into a DataFrame.
dataset = pd.read_csv("/kaggle/input/vehicle-dataset-from-cardekho/car data.csv")

In [None]:
# Preview the first rows to check that the file was parsed as expected.
dataset.head()

### Variable Description
* Car_Name : Car name
* Year : model year of the vehicle
* Selling_Price : sale price of the vehicle
* Present_Price : current ex-showroom price of the vehicle
* Kms_Driven : distance the vehicle has been driven, in kilometres
* Fuel_Type : fuel type of the vehicle
* Seller_Type : whether the seller is a dealer or an individual
* Transmission : transmission type of the vehicle
* Owner : number of previous owners

### Data Exploration (Veri Keşfi)

In [None]:
# (number of rows, number of columns) of the raw dataset.
dataset.shape

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
dataset.info()

In [None]:
# Column labels of the raw dataset.
dataset.columns

In [None]:
# Categorical features: the columns stored with the object dtype.
dataset.select_dtypes(include="object").columns

In [None]:
# Numerical features: the columns stored as int64 or float64.
dataset.select_dtypes(include=["int64","float64"]).columns

In [None]:
# Summary statistics of the numeric columns, transposed so that each feature is a row.
dataset.describe().T

### Missing Values

In [None]:
# True if at least one cell anywhere in the dataset is missing.
dataset.isna().values.any()

In [None]:
# Number of missing values in each column.
dataset.isna().sum()

In [None]:
# Look at the first rows again before columns are dropped or derived.
dataset.head()

In [None]:
# Remove Car_Name so it is not carried into the feature set.
# NOTE: this rebinds `dataset`; re-running the cell without reloading the CSV
# raises KeyError because the column is already gone.
dataset = dataset.drop(columns='Car_Name')

In [None]:
# Add a constant helper column holding the reference year (hard-coded to 2021);
# it is used to derive each vehicle's age from its model year.
dataset['Current_Year'] = 2021

In [None]:
# Check that the Current_Year column was added.
dataset.head()

In [None]:
# Vehicle age in years: reference year minus model year.
dataset['Years_Old'] = dataset['Current_Year'] - dataset['Year']

In [None]:
# Check the derived Years_Old column.
dataset.head()

In [None]:
# Years_Old now carries the age information, so drop the two columns it was derived from.
# NOTE: this rebinds `dataset`; re-running the cell on its own raises KeyError.
dataset = dataset.drop(columns = ['Current_Year', 'Year'])

In [None]:
# Confirm that Current_Year and Year are gone and Years_Old remains.
dataset.head()

### Encoding the categorical data

In [None]:
# Object-dtype columns still present in the dataset; these are the ones that need encoding.
dataset.select_dtypes(include='object').columns

In [None]:
# Show the distinct values found in each object-dtype column.
categorical_columns = dataset.select_dtypes(include='object').columns
for column in categorical_columns:
    print(column, ":", dataset[column].unique())

In [None]:
# Show how many distinct values each object-dtype column has.
for column in dataset.select_dtypes(include='object'):
    print("{} : {}".format(column, dataset[column].nunique()))

In [None]:
# One-hot encode the categorical columns. drop_first=True drops the first
# level of each encoded column, so k categories become k-1 indicator columns.
# NOTE: this rebinds `dataset` to the encoded frame.
dataset = pd.get_dummies(data=dataset, drop_first=True)

In [None]:
# Inspect the encoded frame and its new indicator columns.
dataset.head()

### Correlation Matrix

In [None]:
# Copy of the data without the target column, used below to correlate each
# remaining feature with Selling_Price.
dataset2 = dataset.drop(columns='Selling_Price')

In [None]:
# Correlation of every feature column with Selling_Price.
dataset2.corrwith(dataset["Selling_Price"])

In [None]:
# Bar chart of the correlation between Selling_Price and each of the other features.
# corrwith() uses the Pearson coefficient by default.
ax = dataset2.corrwith(dataset["Selling_Price"]).plot.bar(
    figsize=(16, 9),
    title='Correlation with Selling Price',
    grid=True,
)
# Label both axes so the figure can be read on its own.
ax.set_xlabel("Feature")
ax.set_ylabel("Pearson correlation")
plt.show()

In [None]:
# Pairwise correlation matrix of all columns, target included.
corr = dataset.corr()

In [None]:
# Annotated heatmap of the full correlation matrix, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(16, 9))
sns.heatmap(corr, annot=True, ax=ax)
plt.show()

### Splitting the dataset

In [None]:
# Final look at the prepared frame before separating features and target.
dataset.head()

In [None]:
# Name of the column the models learn to predict.
target_column = "Selling_Price"

# Target vector (dependent variable).
y = dataset[target_column]

# Feature matrix (independent variables): every column except the target.
X = dataset.drop(columns=[target_column])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y , 
                                                    test_size = 0.2, 
                                                    random_state = 0)

In [None]:
# Shapes of the four pieces produced by the split.
X_train.shape,X_test.shape, y_train.shape, y_test.shape

###  Building the Model

### Multiple Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Baseline model: ordinary least-squares regression fitted on the training split.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [None]:
# Linear-model predictions for the held-out test features.
y_pred = regressor.predict(X_test)

In [None]:
from sklearn.metrics import r2_score
# Coefficient of determination (R^2) of the linear model on the test set.
r2_score(y_test, y_pred)

### Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyperparameters, fitted on the training split.
# Each tree is trained on a bootstrap resample, so fitting is stochastic; the
# seed is fixed so the score reported below is the same after every
# "Restart & Run All" (0 matches the seed used for the train/test split).
random_forest_regressor = RandomForestRegressor(random_state=0)
random_forest_regressor.fit(X_train, y_train)

In [None]:
# Random-forest predictions for the test features.
# NOTE: overwrites the y_pred produced by the linear model.
y_pred  = random_forest_regressor.predict(X_test)

In [None]:
# R^2 of the default random forest on the test set.
r2_score(y_test, y_pred)

### Find the optimal parameters using RandomizedSearchCV (RandomizedSearchCV kullanarak en uygun parametreleri bulma)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Hyperparameter search space for the random forest: each key is a
# RandomForestRegressor argument, each list holds the candidate values that
# the randomized search samples from.
parameters = {
    'n_estimators': list(range(100, 1001, 100)),
    # 'mse' / 'mae' were renamed in scikit-learn 1.0 and the old spellings
    # removed in 1.2; the current names are used so every candidate can be fitted.
    'criterion': ['squared_error', 'absolute_error'],
    'max_depth': [10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 20, 50],
    'min_samples_leaf': [1, 2, 5, 10],
    # 'auto' was removed in scikit-learn 1.3. For a regressor it meant
    # "consider all features", which is what 1.0 expresses.
    'max_features': [1.0, 'sqrt', 'log2'],
}

In [None]:
# Display the search space for reference.
parameters

In [None]:
# Randomized hyperparameter search over `parameters`: 10 sampled combinations,
# each evaluated with 5-fold cross-validation and scored by negative mean
# absolute error (higher is better). n_jobs=-1 uses all available cores.
# random_state fixes which combinations are drawn, so the search explores the
# same candidates on every run.
random_cv = RandomizedSearchCV(
    estimator=random_forest_regressor,
    param_distributions=parameters,
    n_iter=10,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=0,
)

In [None]:
# Run the search on the training split only; the test split stays untouched.
random_cv.fit(X_train, y_train)

In [None]:
# Estimator configured with the best-scoring parameter combination found by the search.
random_cv.best_estimator_

In [None]:
# The best-scoring parameter combination, as a dict.
random_cv.best_params_

### Final Model (Random Forest)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Final random forest, fitted on the training split.
# NOTE(review): the hyperparameters are hard-coded, presumably copied from an
# earlier run of the randomized search -- confirm against random_cv.best_params_.
# This rebinds `regressor`, which previously held the linear model.
regressor = RandomForestRegressor(
    n_estimators=100,
    min_samples_split=2,
    min_samples_leaf=1,
    # Was 'auto', which was removed in scikit-learn 1.3; for a regressor it
    # meant "consider all features", i.e. 1.0.
    max_features=1.0,
    # Was 'mae', which was renamed in scikit-learn 1.0 and removed in 1.2.
    criterion='absolute_error',
    max_depth=30,
    # Fixed seed so the final score and predictions are reproducible.
    random_state=0,
)
regressor.fit(X_train, y_train)

In [None]:
# Final-model predictions for the test features (overwrites the earlier y_pred).
y_pred = regressor.predict(X_test)

In [None]:
# R^2 of the final random forest on the test set.
r2_score(y_test, y_pred)

### Predicting a single observation (Tek bir gözlemi tahmin etmek)

In [None]:
# Show the prepared frame as a reference for the column order of a hand-built observation.
dataset.head()

In [None]:
# One hand-built observation to score with the final model. The values are
# positional and must be listed in the column order of X.
# NOTE(review): the original comment recorded the expected Selling_Price as
# 8.25 -- confirm this against the source row.
# The model was fitted on a DataFrame, so the observation is wrapped in a
# DataFrame with the same column labels: this avoids scikit-learn's
# "X does not have valid feature names" warning and fails fast if the number
# of values does not match the number of features.
single_obs = pd.DataFrame(
    [[5.59, 27000, 0, 7, 0, 1, 0, 1]],
    columns=X.columns,
)

In [None]:
# Predicted Selling_Price for the single observation, from the final random forest.
regressor.predict(single_obs)