In [None]:
# Importing Libraries and Data
import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# List every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Load the raw listings into the notebook-wide frame `df`.
df = pd.read_csv("../input/craigslist-carstrucks-data/vehicles.csv")

# Table of Contents
## 1) Exploratory Data Analysis
### a) Understanding data & cleaning dataset
### b) Visualizing variables and relationships
## 2) Data Modelling
## 3) Feature Importance

# 1) Exploratory Data Analysis

## a) Understanding data & cleaning dataset


In [None]:
# First look at the raw frame: its dimensions, its column labels, then the first rows.
print(df.shape, df.columns, sep='\n')
df.head(5)

In [None]:
# Summary statistics of the numerical variables (min/max in particular),
# rendered as fixed-point strings rather than scientific notation.
numeric_summary = df.describe()
numeric_summary.apply(lambda column: column.map('{:f}'.format))

In [None]:
# Number of distinct values per column, to get a feel for the categorical data.
df.nunique()

In [None]:
# Show the dtype of every column in the raw frame.
df.dtypes

In [None]:
# Remove columns with more than 40% missing values
# Step 1: count the missing entries in each column.
NA_val = df.isnull().sum()

def na_filter(na, threshold = .4, n_rows = None):
    """Return the columns whose share of missing values is below ``threshold``.

    Parameters
    ----------
    na : mapping of column name -> number of missing values
        For example the Series produced by ``df.isna().sum()``; anything
        exposing ``.items()`` works.
    threshold : float, default 0.4
        Exclusive upper bound on the missing fraction: a column is kept only
        when ``count / n_rows < threshold``.
    n_rows : int, optional
        Total number of rows the counts refer to. When omitted, the row count
        of the notebook-level ``df`` is used, so existing calls keep working.

    Returns
    -------
    list
        The column names that pass the threshold, in the order they appear
        in ``na``.
    """
    if n_rows is None:
        n_rows = df.shape[0]  # backward-compatible fallback to the global frame
    if n_rows == 0:
        return []  # no rows means no column has a usable missing fraction
    return [col for col, n_missing in na.items() if n_missing / n_rows < threshold]

# Keep only the columns that passed the missing-value filter.
kept_columns = na_filter(NA_val)
df_cleaned = df.loc[:, kept_columns]
df_cleaned.columns

In [None]:
### Getting rid of outliers for dependent variable ###
# Unrealistic prices are removed first so that the IQR computed afterwards is more accurate.
price_is_plausible = df_cleaned['price'].between(999.99, 250000)
df_cleaned = df_cleaned.loc[price_is_plausible]
df_cleaned.describe()

In [None]:
# Computing IQR
# Both quartiles of price are taken in a single quantile call.
Q1, Q3 = df_cleaned['price'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Filtering Values between Q1-1.5IQR and Q3+1.5IQR
df_filtered = df_cleaned.query('(@Q1 - 1.5 * @IQR) <= price <= (@Q3 + 1.5 * @IQR)')
df_filtered.boxplot(column='price')

In [None]:
# Checking values again
# Same fixed-point rendering of the summary statistics, now on the price-filtered frame.
filtered_summary = df_filtered.describe()
filtered_summary.apply(lambda column: column.map('{:f}'.format))

In [None]:
# Removing unrealistic outliers for independent variables

# Upper odometer bound, written out as Q3 + 1.5 * (Q3 - Q1) with the quartile
# values the original author noted; it evaluates to 271431.5.
# NOTE(review): these quartiles are hard-coded, so the bound will not follow
# the data if the dataset changes -- confirm they are still current.
ODOMETER_UPPER = 140000 + 1.5 * (140000 - 52379)

year_ok = df_filtered['year'].between(1900, 2020)  # cant be newer than 2020
odometer_ok = df_filtered['odometer'].between(0, ODOMETER_UPPER)
df_filtered = df_filtered[year_ok & odometer_ok]
print(df_filtered.shape, df_filtered.columns, sep='\n')

In [None]:
# Count of missing values per column that remain after the outlier filters.
df_filtered.isnull().sum()

In [None]:
# Dropping last few columns

# DataFrame.drop already returns a new frame, so no explicit .copy() is needed
# beforehand (it would only duplicate the whole table, text columns included).
df_final = df_filtered.drop(columns=['id','url','region_url','image_url','region','description','model','state','paint_color']) #removing region since lat/long mean same thing
df_final.shape

In [None]:
# Discard every row that still has at least one missing value.
df_final = df_final.dropna(axis='index', how='any')
df_final.shape

## b) Visualizing variables and relationships

In [None]:
import matplotlib.pylab as plt
import seaborn as sns

# calculate correlation matrix
# Restrict to numeric columns first: at this point df_final still contains
# non-numeric categorical columns (they are only dummy-encoded in a later cell),
# and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr = df_final.select_dtypes(include='number').corr()
# plot the heatmap
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, cmap=sns.diverging_palette(220, 20, as_cmap=True))

In [None]:
# sns.pairplot(df_final)

### Can also use the following if I want to narrow on specific variables ###

# histogram: df_cleaned['price'].plot(kind='hist', bins=50, figsize=(12,6), facecolor='grey',edgecolor='black')
# boxplot: df_cleaned.boxplot('odometer')
# scatterplot: df_cleaned.plot(kind='scatter', x='year', y='price')

In [None]:
# Bar chart of how many listings each manufacturer has in the cleaned data.
manufacturer_counts = df_final['manufacturer'].value_counts()
manufacturer_counts.plot.bar()

In [None]:
# Use df_final (the fully cleaned frame) so this chart describes the same rows
# as the manufacturer chart and the model, rather than the earlier df_cleaned.
df_final['type'].value_counts().plot(kind='bar')

## 2) Data Modelling

In [None]:
# One-hot encode the categorical variables; drop_first=True omits the first
# level of each variable from the generated dummy columns.
df_final = pd.get_dummies(data=df_final, drop_first=True)
print(df_final.columns)

In [None]:
# Scaling the data
from sklearn.preprocessing import StandardScaler
# Boolean mask selecting every column except the target.
is_feature = df_final.columns != 'price'

# X_head keeps the feature columns as a DataFrame so their labels stay
# available (the scaled X below is what StandardScaler returns instead).
X_head = df_final.iloc[:, is_feature]
y = df_final['price']

# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split done in a later cell, so held-out rows contribute to the scaling
# statistics -- confirm this is acceptable.
X = StandardScaler().fit_transform(df_final.loc[:, is_feature])

In [None]:
# Creating the model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae

# Hold out 25% of the rows for evaluation; the fixed seeds make both the split
# and the forest repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)
model = RandomForestRegressor(random_state=1).fit(X_train, y_train)
pred = model.predict(X_test)

In [None]:
# Mean absolute error on the held-out rows, printed next to the mean price
# so the size of the error can be judged against a typical value.
print(mae(y_test, pred), df_final['price'].mean(), sep='\n')

In [None]:
from sklearn.metrics import r2_score
# R^2 of the predictions on the held-out rows.
r2_score(y_true=y_test, y_pred=pred)

## 3) Feature Importance

In [None]:
# Pair each importance score from the fitted forest with its feature column
# name, then chart the 25 largest as horizontal bars.
feat_importances = pd.Series(data=model.feature_importances_, index=X_head.columns)
top_features = feat_importances.nlargest(25)
top_features.plot.barh(figsize=(10, 10))