In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import plotly.graph_objects as go
import plotly.express as px

from sklearn import preprocessing


import seaborn as sns  # plotting & visualization lib
import matplotlib.pyplot as plt  #plot & visualization lib
# Render inline matplotlib figures as SVG.
%config InlineBackend.figure_format = 'svg'
import os
import warnings # to ignore warnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.simplefilter('ignore')
# NOTE(review): Basemap is not referenced anywhere else in the visible notebook — confirm it is still needed.
from mpl_toolkits.basemap import Basemap #this library is used to create maps
# List the entries of the parent input directory.
print(os.listdir("../input"))


import os

# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Craigslist cars/trucks listings into a DataFrame.
df = pd.read_csv('/kaggle/input/craigslist-carstrucks-data/vehicles.csv')
# Bare last expression so the notebook displays the frame.
df

In [None]:
# Summary statistics of the loaded listings.
df.describe()

In [None]:
# Column overview: dtype and non-null count of each column
# (missing values show up as a non-null count below the row count).
df.info()

In [None]:
# Per-column missing-value summary: absolute null count and share of rows in percent.
sum_tot = len(df)
null_values = df.isnull().sum().to_frame(name='null')
# Only the ratio is scaled to a percentage, so the 'null' column keeps the real
# counts (scaling the whole frame by 100 would inflate them a hundredfold).
null_values['percent'] = (null_values['null'] / sum_tot * 100).round(3)
null_values.sort_values('percent', ascending=False)

In [None]:
# The county column is almost entirely null and carries no usable information, so drop it.
df = df.drop(columns=['county'])

## Dropping every row that contains a null value would delete almost all of the data, so the missing values need to be imputed instead.

In [None]:
# Missing-value summary after removing 'county': null count and share of rows in percent.
sum_tot = len(df)
null_values = df.isnull().sum().to_frame(name='null')
# Only the ratio is scaled to a percentage, so the 'null' column keeps the real
# counts (scaling the whole frame by 100 would inflate them a hundredfold).
null_values['percent'] = (null_values['null'] / sum_tot * 100).round(3)
null_values.sort_values('percent', ascending=False)

In [None]:
# Remove URL, free-text and identifier columns that are not useful for the analysis.
unused_columns = ['url', 'image_url', 'region_url', 'description', 'VIN']
df = df.drop(unused_columns, axis='columns')

In [None]:
# Plot every "bus" listing on a map; hovering a point shows its paint colour and price.
bus_listings = df[df["type"] == "bus"]
figure = px.scatter_mapbox(
    bus_listings,
    lat="lat",
    lon="long",
    hover_name="paint_color",
    hover_data=["paint_color", "price"],
    zoom=4,
    height=550,
)

# Zero margins and OpenStreetMap tiles, applied in a single layout update.
figure.update_layout(
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
    mapbox_style="open-street-map",
)

figure.show()

In [None]:
# Quick look at the price column: mean, the ten most frequent values, and the null count.
price_column = df['price']

print('The average value for price columns is', price_column.mean())

print('Top 10 most used prices number are')
print(price_column.value_counts().head(10))
print("Null values inside the price columns are", price_column.isnull().sum())


##### The most common listed price is 0, which is not possible because a car must be sold at some price. We'll remove all the rows with a price of 0.

In [None]:
# Box plot of the listing prices to expose extreme outliers.
sns.boxplot(data=df['price'])

In [None]:
# Collect the row labels of the 65 highest prices; these implausible values are removed next.
prices_descending = df['price'].sort_values(ascending=False)
index_value_useless_price = prices_descending.head(65).index
print(index_value_useless_price)

In [None]:
# Drop the highest-priced listings identified in the previous cell. Using the
# computed index instead of a pasted list of row numbers keeps this cell correct
# when the dataset is updated, re-ordered or shortened.
df2 = df.drop(index=index_value_useless_price)

##### Many listings have implausible prices such as 1234567 or 11111111, and some exceed 3 billion; these are placeholder values entered just to fill the price field. We remove the 65 most extreme prices, and also drop every listing with a price of zero.

##### These will help to remove outliers from the data

In [None]:
# Rebuild a clean 0..n-1 index after the row drop. reset_index returns a new
# frame, so the result has to be assigned back; drop=True keeps the old index
# from being added as an extra column.
df2 = df2.reset_index(drop=True)
df2

In [None]:
# A price of zero is not a real sale price, so keep only the rows with a non-zero price.
has_nonzero_price = df2['price'].ne(0)
df2 = df2.loc[has_nonzero_price]
    

In [None]:
# Inspect the 30 largest odometer readings. Several exceed 1 crore (10 million),
# which is not possible, so they will be removed.
odometer_descending = df2['odometer'].sort_values(ascending=False)
odometer_descending.head(30)

In [None]:
# The odometer column contains junk readings such as 99999999, 2222222 or 1234567.
# Keep only rows whose odometer is below 1,000,000; rows with a missing odometer
# fail the comparison and are dropped as well.
plausible_odometer = df2['odometer'].lt(1000000)
df2 = df2.loc[plausible_odometer]

In [None]:
# Keep only vehicles with a model year after 1985: much older cars are of no help
# for the price model because a car loses its value within roughly 15 years.
# Rows with a missing year fail the comparison and are dropped as well.
recent_enough = df2['year'].gt(1985)
df2 = df2.loc[recent_enough]

In [None]:
# Missing-value summary of the cleaned frame: null count and share of rows in percent.
sum_tot = len(df2)
null_values = df2.isnull().sum().to_frame(name='null')
# Only the ratio is scaled to a percentage, so the 'null' column keeps the real
# counts (scaling the whole frame by 100 would inflate them a hundredfold).
null_values['percent'] = (null_values['null'] / sum_tot * 100).round(3)
null_values.sort_values('percent', ascending=False)

In [None]:
# Number of listings per state, bars ordered from most to fewest listings.
plt.figure(figsize=(10, 6))
states_by_frequency = df2['state'].value_counts().index
ax = sns.countplot(data=df2, x='state', order=states_by_frequency)
state_tick_labels = ax.get_xticklabels()
ax.set_xticklabels(state_tick_labels, fontsize=8);

In [None]:
# Number of listings per model year.
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=df2, x='year')
year_tick_labels = ax.get_xticklabels()
# Rotate the labels so the many year ticks stay readable.
ax.set_xticklabels(year_tick_labels, rotation=90, ha="right", fontsize=6);

### The graph indicates a decrease in listings of 2009 car models. This might be because of the recession.


In [None]:
# List the distinct manufacturer values found in the listings.
manufacturer_names = df['manufacturer'].unique()
print("Total manufacturers we have are:")
print(manufacturer_names)

In [None]:
# Let's see how many listings each manufacturer contributes.

plt.figure(figsize=(10, 6))
ax = sns.countplot(data=df, x='manufacturer')
manufacturer_tick_labels = ax.get_xticklabels()
ax.set_xticklabels(manufacturer_tick_labels, rotation=90, ha="right", fontsize=8);

### Ford has the largest number of cars, followed by Chevrolet and Toyota.

In [None]:
# Distinct fuel types present in the cleaned data.
df2['fuel'].unique()

In [None]:
# Compare the ten most common paint colours per fuel type, one donut chart per fuel.

from plotly.subplots import make_subplots

# (fuel value in the data, trace name shown on hover), in left-to-right order.
fuel_traces = [
    ("gas", "Gas Car"),
    ("diesel", "Diesel Car"),
    ("electric", "Electric Car"),
    ("hybrid", "hybrid Car"),
]

# One 'domain' (pie) subplot per fuel type, so no empty column is left over.
fig = make_subplots(
    rows=1,
    cols=len(fuel_traces),
    specs=[[{'type': 'domain'} for _ in fuel_traces]],
)

for column_position, (fuel, trace_name) in enumerate(fuel_traces, start=1):
    # Labels and values are taken from the SAME value_counts() result on the
    # cleaned frame, so every slice is guaranteed to carry its own colour name.
    top_colors = df2.loc[df2["fuel"] == fuel, "paint_color"].value_counts().head(10)
    fig.add_trace(
        go.Pie(labels=top_colors.index, values=top_colors.values, name=trace_name),
        1, column_position,
    )

fig.update_traces(hole=.3, hoverinfo="label+percent+name")  # hole to create donut

fig.show()

#### White is the most common color: it has the largest share for every fuel type. Black and silver come next, which suggests that either customers prefer to buy white and black cars or there is some technical reason for producing cars in these colors.

In [None]:
# Let's deal with missing values


print('the missing values in condition columns is ', df2.condition.isnull().sum())

# Impute a missing condition from the model year:
#   2019 or newer -> 'new'
#   older than 2019 -> 'like new'
# The two rules together cover every year (including 2018), so no row with a
# known year is left without a condition. Existing condition values are untouched.
condition_missing = df2['condition'].isnull()
df2.loc[condition_missing & (df2['year'] >= 2019), 'condition'] = 'new'
df2.loc[condition_missing & (df2['year'] < 2019), 'condition'] = 'like new'

In [None]:
# Calculate the correlation matrix over the numeric columns only: text columns
# cannot be correlated, and recent pandas versions raise instead of skipping them.
corr = df2.select_dtypes(include='number').corr()
# Plot the heatmap
sns.heatmap(corr, xticklabels=corr.columns,
            yticklabels=corr.columns, annot=True)


# Author's observation: no correlation exists between any of the columns.

In [None]:
# Frequency of each condition label after the imputation step.
df2['condition'].value_counts()

In [None]:
# Drop columns that either have so many unique values that they would only
# confuse the model, or have more than 40 percent nulls.
columns_to_drop = ['size', 'long', 'lat', 'model', 'region', 'posting_date', 'state']
df4 = df2.drop(columns=columns_to_drop)
df4.info()

In [None]:
# Forward-fill the remaining categorical gaps: each missing value takes the value
# of the nearest earlier row (the author assumes the rows are ordered by time).
# A gap at the very top of a column has no earlier row and therefore stays null.
# DataFrame.ffill() is the supported replacement for fillna(method='ffill'),
# which newer pandas releases deprecate.
forward_fill_columns = [
    'transmission', 'cylinders', 'title_status', 'fuel',
    'paint_color', 'drive', 'manufacturer', 'type',
]
df4[forward_fill_columns] = df4[forward_fill_columns].ffill()

## LABEL PROCESSING

In [None]:
# The listing id is an identifier, not a feature; remove it before modelling.
df4 = df4.drop(columns='id')

In [None]:
# Label-encode every string column into integer codes.
le = preprocessing.LabelEncoder()
categorical_columns = [
    'manufacturer', 'condition', 'cylinders', 'fuel', 'title_status',
    'transmission', 'drive', 'type', 'paint_color',
]
for column_name in categorical_columns:
    # Cast to str first so that a remaining NaN becomes the literal label 'nan'.
    # The single encoder is re-fitted per column, so afterwards it only holds
    # the classes of the last column in the list.
    df4[column_name] = le.fit_transform(df4[column_name].astype(str))


In [None]:
# Splitting the dataset into training and test parts

from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

# Every column except the price is a feature; the price is the target.
X = df4.drop(columns='price')
y = df4['price']

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

for features_part, target_part in ((X_train, y_train), (X_test, y_test)):
    print(features_part.shape, target_part.shape)

In [None]:
# Column dtypes and non-null counts of the training features.
X_train.info()

In [None]:
# Feature scaling

from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature minimum and maximum from the training split only, then
# apply that same scaling to both splits.
sc = MinMaxScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Save the encoded frame as a CSV in the current working directory, without the index column.
# NOTE(review): despite the file name this writes all of df4 (features and the
# price column), not the X_train split — confirm the intended content/name.
df4.to_csv('X_train_ML.csv',index=False)
# Hit commit and run at the right hand corner of the kernel.
# Wait till the kernel runs from top to bottom.
# Check the 'Output' tab under the Version tab, or open the snapshot of your kernel and check its 'Output' tab. Your csv file will be there.

In [None]:
import os
# Show the current working directory, i.e. where relative output paths are written.
os.getcwd()

In [None]:
# The author expects a random forest to work best for this kind of data,
# so that is the model used here.


from sklearn.ensemble import RandomForestRegressor

# First attempt: 20 trees with a fixed seed; fit() returns the fitted estimator.
regressor = RandomForestRegressor(n_estimators=20, random_state=0).fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [None]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error as MSE

# Error metrics of the current predictions on the test split.
mean_abs_error = metrics.mean_absolute_error(y_test, y_pred)
mean_sq_error = metrics.mean_squared_error(y_test, y_pred)

print('Mean Absolute Error:', round(mean_abs_error, 2))
print('Mean Squared Error:', round(mean_sq_error, 2))
print('Root Mean Squared Error:', round(np.sqrt(mean_sq_error), 2))

In [None]:
# Let's try a larger forest with 100 trees and report the same error metrics.
regressor = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
y_pred = regressor.predict(X_test)

mean_abs_error = metrics.mean_absolute_error(y_test, y_pred)
mean_sq_error = metrics.mean_squared_error(y_test, y_pred)

print('Mean Absolute Error:', round(mean_abs_error, 2))
print('Mean Squared Error:', round(mean_sq_error, 2))
print('Root Mean Squared Error:', round(np.sqrt(mean_sq_error), 2))

In [None]:

from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 10)]

# Number of features to consider at every split.
# 1.0 means "all features", which is what the removed 'auto' option meant for a
# regressor; 'auto' is rejected by current scikit-learn releases.
max_features = [1.0, 'sqrt']

# Maximum number of levels in tree (None = grow until the leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)




# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation,
# sampling 10 combinations (n_iter) from the grid, and using all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# Report the hyperparameters selected by the randomized search.
print('Our best parameter for the randomforest is ', rf_random.best_params_, sep='\n')

In [None]:
def evaluate(model, X_test, y_test):
    """Print and return a MAPE-based accuracy score for a fitted regressor.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_test : array-like
        Feature matrix that is passed to ``model.predict``.
    y_test : array-like
        True target values, one per row of ``X_test``.

    Returns
    -------
    float
        ``100 - MAPE``, where MAPE is the mean absolute percentage error in
        percent. The value is not finite when ``y_test`` contains zeros.
    """
    # Predict from the features; the targets are used for scoring only.
    predictions = model.predict(X_test)
    errors = abs(predictions - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print('Model Performance')
    # Mean absolute error, in the same unit as the target.
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    return accuracy


# Fit a small 10-tree forest as the reference model and score it.
base_model = RandomForestRegressor(n_estimators=10, random_state=42).fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

# Score the best estimator found by the randomized search on the same test split.
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

# Relative change of the tuned model's accuracy versus the reference model, in percent.
relative_gain = 100 * (random_accuracy - base_accuracy) / base_accuracy
print('Improvement of {:0.2f}%.'.format(relative_gain))


In [None]:
# Grid search cross validation
#
# The random search narrowed down the range of each hyperparameter. Knowing where
# to concentrate, every combination of the settings below is now listed
# explicitly for an exhaustive grid search.

from sklearn.model_selection import GridSearchCV

# Parameter grid based on the results of the random search:
# 1 * 4 * 2 * 3 * 3 * 4 = 288 combinations, each evaluated with 3-fold CV.
param_grid = dict(
    bootstrap=[True],
    max_depth=[80, 90, 100, 110],
    max_features=[2, 3],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    n_estimators=[100, 200, 300, 1000],
)

# Base model to tune
rf = RandomForestRegressor()

# Grid search over the base model, using all available cores
grid_search = GridSearchCV(rf, param_grid, cv=3, n_jobs=-1, verbose=2)

In [None]:
# Run the exhaustive grid search on the training split.
grid_search.fit(X_train, y_train)

# Print the winning combination; a bare expression in the middle of a cell
# would not be displayed.
print(grid_search.best_params_)

best_grid = grid_search.best_estimator_

# Score on the held-out split produced by train_test_split. The names
# test_features / test_labels are never defined in this notebook.
grid_accuracy = evaluate(best_grid, X_test, y_test)

print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

