In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Load and Check Data

In [None]:
# Load the CarDekho vehicle data set from the Kaggle input directory.
df=pd.read_csv("/kaggle/input/vehicle-dataset-from-cardekho/car data.csv")

In [None]:
# Preview the first rows to check that the file loaded as expected.
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

* There are 301 samples and 9 features in the dataset.


In [None]:
# Print the distinct values of each column expected to be categorical.
# The list is named `categorical_cols` so that it does not shadow the builtin `list`.
categorical_cols = ["Seller_Type", "Transmission", "Owner", "Fuel_Type"]
for col in categorical_cols:
    print(df[col].unique())

* As we can see above, Seller_Type, Transmission, Owner and Fuel_Type are categorical variables.

In [None]:
# Count the missing (null/NaN) values in each column.
df.isnull().sum()

* There are no missing or null values in the dataset.

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Column names, used to pick the features kept for modelling.
df.columns

## Exploratory Data Analysis (EDA)

In [None]:
# Drop the Car_Name feature by selecting the remaining columns explicitly
# (df.drop(columns=["Car_Name"]) is an equivalent alternative).
# .copy() makes final_dataset an independent frame rather than a slice of df,
# so the column assignments and drops in later cells do not raise
# SettingWithCopyWarning.
final_dataset = df[['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
                    'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']].copy()

In [None]:
# Confirm that Car_Name is no longer among the columns.
final_dataset.head()

In [None]:
# Reference year used to convert the build year into a car age.
# assign() returns a new frame instead of writing into a possible slice of df,
# which avoids SettingWithCopyWarning.
final_dataset = final_dataset.assign(Current_Year=2021)
final_dataset.head()


* The Year feature shows the year in which each car was built. We subtract that year from 2021 to get the age of the car, and then replace the Year feature with this new age feature.

In [None]:
# Age of the car in years: reference year minus build year.
# assign() builds a new frame, avoiding SettingWithCopyWarning.
final_dataset = final_dataset.assign(
    AgeCar=final_dataset["Current_Year"] - final_dataset["Year"]
)

In [None]:
# Check the newly added AgeCar column.
final_dataset.head()

In [None]:
# Year is now redundant with AgeCar. Rebinding the name instead of using
# inplace=True keeps the step explicit and avoids mutating a possible slice of df.
final_dataset = final_dataset.drop(columns=["Year"])


In [None]:
# Confirm that the Year column has been removed.
final_dataset.head()

In [None]:
# Current_Year was only a helper for computing AgeCar, so remove it.
# Rebinding instead of inplace=True avoids mutating a possible slice of df.
final_dataset = final_dataset.drop(columns=["Current_Year"])

In [None]:
# Confirm that the Current_Year column has been removed.
final_dataset.head()

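* We call `get_dummies` with `drop_first=True`, which drops the first category of each categorical feature.
* Fuel_Type, for example, is then left with two dummy columns; when both of them are zero, the row belongs to the third (dropped) category.
* This avoids redundant dummy columns; it is a recommendation rather than a requirement.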

In [None]:
# get_dummies() converts categorical variables into dummy/indicator columns;
# drop_first=True drops the first category of each encoded column.
# dtype=int forces 0/1 integer indicators: pandas >= 2.0 produces bool columns
# by default, which some numeric plotting and statistics routines handle poorly.
final_dataset = pd.get_dummies(final_dataset, drop_first=True, dtype=int)

In [None]:
# Inspect the encoded frame with its dummy columns.
final_dataset.head()

In [None]:
# Pairwise relationships between all columns of the encoded frame.
sns.pairplot(final_dataset)

## Visualization

In [None]:
# Pearson correlation (the pandas default for .corr()) between every pair of columns.
corr = final_dataset.corr()
top_corr_features = corr.index
plt.figure(figsize=(10, 10))

# Plot the matrix computed above directly. Re-selecting the same columns and
# calling .corr() a second time would produce the identical matrix.
g = sns.heatmap(corr, annot=True, cmap="RdYlGn")

* If two features are highly correlated, we should drop one of them because they carry the same information.

### selling price - present price

In [None]:
# regplot draws a scatter plot of the two variables together with a fitted regression line.
sns.regplot(x="Selling_Price",y="Present_Price",data=final_dataset)

### selling price - fuel type

In [None]:
# Selling price by fuel type.
# x and y are passed by keyword: seaborn deprecated positional data arguments
# in 0.11 and rejects them from 0.12 onwards.
# df is used instead of final_dataset because Fuel_Type is one-hot encoded in final_dataset.
sns.barplot(x="Fuel_Type", y="Selling_Price", data=df, palette="flare")

* Diesel cars have the highest selling prices. The selling prices of Petrol and CNG cars are almost equal.

### selling price - transmission

In [None]:
# Distribution of selling price per transmission type (original df keeps the raw category labels).
sns.boxplot(x="Transmission",y="Selling_Price",data=df)

### selling price-Seller type

In [None]:
# Distribution of selling price per seller type (original df keeps the raw category labels).
sns.violinplot(x="Seller_Type",y="Selling_Price",data=df)

### selling price - age

In [None]:
# Selling price by car age.
# x and y are passed by keyword: seaborn deprecated positional data arguments
# in 0.11 and rejects them from 0.12 onwards.
sns.barplot(x="AgeCar", y="Selling_Price", data=final_dataset, palette="summer")

In [None]:
# Separate the features (x) from the target (y).
# Selecting by column name instead of by position keeps the split correct
# even if the column order of final_dataset changes.
x = final_dataset.drop(columns=["Selling_Price"])
y = final_dataset["Selling_Price"]


In [None]:
# Feature matrix preview.
x.head()

In [None]:
# Target (Selling_Price) preview.
y.head()

## Feature Importance

In [None]:
# Feature importance: fit an extremely-randomized-trees regressor on all features.
from sklearn.ensemble import ExtraTreesRegressor
# A fixed random_state makes the fitted trees, and therefore the reported
# importances, reproducible across runs.
model = ExtraTreesRegressor(random_state=42)
model.fit(x, y)

In [None]:
# Importances from the fitted ExtraTreesRegressor, in the same order as x.columns.
print(model.feature_importances_)

* The first value belongs to Present_Price, and so on.
* The most important feature is Present_Price.

In [None]:
# Visualization of feature importance.
# pd.Series pairs each importance score with the matching column name of x.
f_importances = pd.Series(data=model.feature_importances_, index=x.columns)
top_five = f_importances.nlargest(5)
top_five.plot.barh()
plt.show()

* This method is a good option when we have a lot of features.

## Modelling

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# (rows, features) in the training split.
X_train.shape

In [None]:
from sklearn.ensemble import RandomForestRegressor
# The base estimator is instantiated as `rf` further down, and `rf_random` is
# bound to the RandomizedSearchCV wrapper there. No placeholder forest is
# created here, because it would be overwritten before any use.

In [None]:
# Candidate values for the randomized hyper-parameter search.
# np.linspace returns evenly spaced numbers over a specified interval.

# The number of trees in the forest: 100, 200, ..., 1200.
n_estimators = [int(v) for v in np.linspace(start=100, stop=1200, num=12)]

# The number of features to consider when looking for the best split.
# 1.0 (use all features) replaces 'auto', which meant the same thing for
# regressors but was deprecated in scikit-learn 1.1 and removed in 1.3.
max_features = [1.0, 'sqrt']

# The maximum depth of the tree: 5, 10, ..., 30.
max_depth = [int(v) for v in np.linspace(5, 30, num=6)]

# The minimum number of samples required to split an internal node.
min_samples_split = [2, 5, 10, 15, 100]

# The minimum number of samples required to be at a leaf node.
min_samples_leaf = [1, 2, 5, 10]


In [None]:
from sklearn.model_selection import RandomizedSearchCV


In [None]:
# Search space handed to RandomizedSearchCV: parameter name -> candidate values.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)
print(random_grid)

In [None]:
# Base estimator tuned by the randomized search. A fixed random_state makes
# the fitted forests, and therefore the search results, reproducible.
rf = RandomForestRegressor(random_state=42)

In [None]:
# Randomized search: 10 sampled parameter settings, each scored with 5-fold
# cross-validation on negative mean squared error.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    scoring="neg_mean_squared_error",
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

In [None]:
# Run the hyper-parameter search on the training split only.
rf_random.fit(X_train,y_train)

In [None]:
# TODO: try handling outliers and experiment with other approaches

In [None]:
# Predict selling prices for the held-out test split using the fitted search object.
predictions=rf_random.predict(x_test)
predictions

In [None]:
# Distribution of the residuals (actual minus predicted selling price).
# histplot with a KDE overlay on a density scale replaces distplot, which
# seaborn deprecated in 0.11 and removed in later releases.
sns.histplot(y_test - predictions, kde=True, stat="density")

In [None]:
# Actual vs. predicted selling prices on the test split.
plt.scatter(x=y_test, y=predictions)
plt.xlabel(xlabel='y_test', fontsize=18)
plt.ylabel(ylabel='y_prediction', fontsize=16)

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the test split.
R2 = r2_score(y_true=y_test, y_pred=predictions)
R2