In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import make_column_transformer, ColumnTransformer
import statsmodels.api as sm
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
sns.set()

In [2]:
data = pd.read_csv('/Users/priti16/Downloads/practical_application_II_starter-2/data/vehicles.csv')
data.head()

Unnamed: 0,id,region,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,VIN,drive,size,type,paint_color,state
0,7222695916,prescott,6000,,,,,,,,,,,,,,,az
1,7218891961,fayetteville,11900,,,,,,,,,,,,,,,ar
2,7221797935,florida keys,21000,,,,,,,,,,,,,,,fl
3,7222270760,worcester / central MA,1500,,,,,,,,,,,,,,,ma
4,7210384030,greensboro,4900,,,,,,,,,,,,,,,nc


In [3]:
df = data.copy()

In [4]:
df.shape

(426880, 18)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426880 entries, 0 to 426879
Data columns (total 18 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            426880 non-null  int64  
 1   region        426880 non-null  object 
 2   price         426880 non-null  int64  
 3   year          425675 non-null  float64
 4   manufacturer  409234 non-null  object 
 5   model         421603 non-null  object 
 6   condition     252776 non-null  object 
 7   cylinders     249202 non-null  object 
 8   fuel          423867 non-null  object 
 9   odometer      422480 non-null  float64
 10  title_status  418638 non-null  object 
 11  transmission  424324 non-null  object 
 12  VIN           265838 non-null  object 
 13  drive         296313 non-null  object 
 14  size          120519 non-null  object 
 15  type          334022 non-null  object 
 16  paint_color   296677 non-null  object 
 17  state         426880 non-null  object 
dtypes: f

In [6]:
df.describe().round(2)

Unnamed: 0,id,price,year,odometer
count,426880.0,426880.0,425675.0,422480.0
mean,7311487000.0,75199.03,2011.24,98043.33
std,4473170.0,12182280.0,9.45,213881.5
min,7207408000.0,0.0,1900.0,0.0
25%,7308143000.0,5900.0,2008.0,37704.0
50%,7312621000.0,13950.0,2013.0,85548.0
75%,7315254000.0,26485.75,2017.0,133542.5
max,7317101000.0,3736929000.0,2022.0,10000000.0


In [7]:
df.describe(include='object')

Unnamed: 0,region,manufacturer,model,condition,cylinders,fuel,title_status,transmission,VIN,drive,size,type,paint_color,state
count,426880,409234,421603,252776,249202,423867,418638,424324,265838,296313,120519,334022,296677,426880
unique,404,42,29649,6,8,5,6,3,118246,3,4,13,12,51
top,columbus,ford,f-150,good,6 cylinders,gas,clean,automatic,1FMJU1JT1HEA52352,4wd,full-size,sedan,white,ca
freq,3608,70985,8009,121456,94169,356209,405117,336524,261,131904,63465,87056,79285,50614


In [8]:
df['price'].unique()

array([ 6000, 11900, 21000, ...,  6328, 19853, 17873])

In [9]:
df['price'] = [0 if i=='-' else int(i) for i in df['price']]

In [10]:
df['price'].dtype

dtype('int64')

In [11]:
df['manufacturer'].unique()

array([nan, 'gmc', 'chevrolet', 'toyota', 'ford', 'jeep', 'nissan', 'ram',
       'mazda', 'cadillac', 'honda', 'dodge', 'lexus', 'jaguar', 'buick',
       'chrysler', 'volvo', 'audi', 'infiniti', 'lincoln', 'alfa-romeo',
       'subaru', 'acura', 'hyundai', 'mercedes-benz', 'bmw', 'mitsubishi',
       'volkswagen', 'porsche', 'kia', 'rover', 'ferrari', 'mini',
       'pontiac', 'fiat', 'tesla', 'saturn', 'mercury', 'harley-davidson',
       'datsun', 'aston-martin', 'land rover', 'morgan'], dtype=object)

In [12]:
df['manufacturer'] = [float(i.split()[0]) for i in df['manufacturer']]

AttributeError: 'float' object has no attribute 'split'

In [None]:
df['manufacturer'].dtype

In [None]:
df['odometer'].unique()

In [None]:
df['odometer'] = [float(i.split()[0]) for i in df['odometer']]

In [None]:
df['odometer'].dtype

In [None]:
df.head()

In [None]:
df.drop('title_status',axis=1, inplace=True)

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df.dropna()

In [None]:
df.isna().sum()

In [None]:
df.duplicated().sum()

In [None]:
df.dropna(inplace=True)

In [None]:
df.isna().sum()

In [None]:
df.drop('id',axis=1,inplace=True) # Here ID column was dropped because there is no importance of the column

In [None]:
df = df.reset_index(drop=True)

In [None]:
df

In [None]:
numerical_data = df.select_dtypes(include='number')
numerical_data.head()

In [None]:
categorical_data = df.select_dtypes(include='object')
categorical_data.head()

In [None]:
fig, axes = plt.subplots(3,2, figsize=(10, 10))

axes[0,0].hist(numerical_data['price'])
axes[0,1].hist(numerical_data['year'])
axes[1,0].hist(numerical_data['odometer'])


axes[0, 0].set_title('price')
axes[0, 1].set_title('year')
axes[1, 0].set_title('odometer')


plt.suptitle('Histograms of Numerical Features', fontsize = 16)
plt.tight_layout()
plt.show()

In [None]:
fig, axes = plt.subplots(3,2, figsize=(10, 10))

sns.kdeplot(numerical_data['price'], fill=True, ax=axes[0,0])
sns.kdeplot(numerical_data['year'], fill=True, ax=axes[0,1])
sns.kdeplot(numerical_data['odometer'], fill=True, ax=axes[1,0])

fig.suptitle('Distribution of Numerical Features', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
fig, axes = plt.subplots(3,2, figsize=(10, 10))

axes[0,0].boxplot(numerical_data['price'])
axes[0,1].boxplot(numerical_data['year'])
axes[1,0].boxplot(numerical_data['odometer'])


axes[0, 0].set_title('price')
axes[0, 1].set_title('year')
axes[1, 0].set_title('odometer')

plt.suptitle('Boxplots of Numerical Features', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
price_selected = numerical_data[numerical_data['price']<100000]

In [None]:
len(price_selected)

In [None]:
price_selected = price_selected.reset_index()

In [None]:
price_selected

In [None]:
plt.hist(price_selected['price'])
plt.show()

In [None]:
plt.boxplot(price_selected['price'])
plt.show()

In [None]:
sns.kdeplot(np.sqrt(price_selected['price']), fill=True)
plt.show()

In [None]:
class IQR:
    def __init__(self, feature, data):
        self.feature = feature
        self.data = data

    def calculate_iqr(self):
        q1 = np.percentile(self.data[self.feature], 25)
        q3 = np.percentile(self.data[self.feature], 75)
        iqr = q3 - q1
        lower_limit = q1 - 1.5 * iqr
        upper_limit = q3 + 1.5 * iqr
        self.outliers = self.data[((self.data[self.feature] < lower_limit) | (self.data[self.feature] > upper_limit))]
        return  {
            'q1': q1,
            'q3': q3,
            'iqr': iqr,
            'lower_limit': lower_limit,
            'upper_limit': upper_limit
        }

In [None]:
numerical_data.columns

In [None]:
year_iqr = IQR('year', price_selected)

In [None]:
year_iqr.calculate_iqr()

In [None]:
year_iqr.outliers

In [None]:
year_iqr.outliers['price'].values

In [None]:
year_iqr.outliers.index

In [None]:
year_selected = price_selected.iloc[~price_selected.index.isin(year_iqr.outliers.index)]

In [None]:
year_selected

In [None]:
len(year_selected)

In [None]:
plt.hist(year_selected['year'])
plt.show()

In [None]:
plt.boxplot(year_selected['year'])
plt.show()

In [None]:
numerical_data.columns

In [None]:
odometer_iqr = IQR('odometer', year_selected)

In [None]:
odometer_iqr.calculate_iqr()

In [None]:
odometer_iqr.outliers

In [None]:
odometer_iqr.outliers.describe()['odometer']

In [None]:
odometer_selected = year_selected[year_selected['odometer']<600000]

In [None]:
odometer_selected

In [None]:
plt.hist(odometer_selected['odometer'])
plt.show()

In [None]:
plt.boxplot(odometer_selected['odometer'])
plt.show()

In [None]:
fig, axes = plt.subplots(3,2, figsize=(10, 10))

sns.kdeplot(odometer_selected['price'], fill=True, ax=axes[0,0])
sns.kdeplot(odometer_selected['year'], fill=True, ax=axes[0,1])
sns.kdeplot(odometer_selected['odometer'], fill=True, ax=axes[1,0])


fig.suptitle('Distribution of Numerical Features in Final Dataset', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
fig, axes = plt.subplots(3,2, figsize=(10, 10))

axes[0,0].boxplot(odometer_selected['price'])
axes[0,1].boxplot(odometer_selected['year'])
axes[1,0].boxplot(odometer_selected['odometer'])

axes[0, 0].set_title('price')
axes[0, 1].set_title('year')
axes[1, 0].set_title('odometer')

plt.suptitle('Boxplots of Numerical Features in Final Dataset', fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
numerical_selected = odometer_selected.reset_index(drop=True)

In [None]:
numerical_selected

In [None]:
numerical_selected['index'].values

In [None]:
categorical_data = categorical_data.iloc[numerical_selected['index'].values]

In [None]:
categorical_data = categorical_data.reset_index(drop=True)

In [None]:
categorical_data

In [None]:
df2 = pd.concat([numerical_selected, categorical_data], axis=1)
df2.head()

In [None]:
df2.shape

In [None]:
categorical_data

In [None]:
fig = plt.figure(figsize=(10,20))

ax1 = plt.subplot(5,2,1)
categorical_data['manufacturer'].value_counts().to_frame().plot(kind = 'bar', ax=ax1)

ax2 = plt.subplot(5,2,2)
categorical_data['model'].value_counts().to_frame().plot(kind = 'bar', ax=ax2)

ax3 = plt.subplot(5,2,3)
categorical_data['condition'].value_counts().to_frame().plot(kind = 'bar', ax=ax3)

plt.tight_layout()
plt.show()

In [None]:
df2_categorical = list(df2.columns[df2.dtypes=='object'])
df2_categorical

In [None]:
fig = plt.figure(figsize=(10,20))

for i in df2_categorical:
    ax = plt.subplot(5,2,df2_categorical.index(i)+10)
    df2.pivot_table(values='price', index=i, aggfunc='mean').sort_values(by='price').plot(kind='bar', ax=ax)
    plt.title('Average Price per {}'.format(i))
plt.tight_layout()
plt.show()

In [None]:
df2_numerical = list(df2.columns[df2.dtypes!='object'])
df2_numerical

In [None]:
fig = plt.figure(figsize=(10,20))

for i in df2_numerical:
    ax = plt.subplot(4,2,df2_numerical.index(i)+8)
    sns.scatterplot(x=df2[i], y=df2['price'], ax=ax)
    
plt.tight_layout()
plt.show()

In [None]:
df2.head()

In [None]:
X = df2.iloc[:,1:]
y = df2['price']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)

In [None]:
sns.pairplot(X_train)
plt.show()

In [None]:
corr_matrix = df2.corr(numeric_only=True)
mask = np.triu(np.ones_like(corr_matrix, dtype=bool),k=1)
plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix,annot=True,cmap='RdBu',vmin=-1,vmax=1,mask=mask)
plt.show()

In [None]:
x_numeric_train = X_train.select_dtypes('number')
x = sm.add_constant(x_numeric_train)
results = sm.OLS(y_train,x).fit()
results.summary()

In [None]:
x_numeric_train

In [None]:
mutual_info = mutual_info_regression(x_numeric_train, y_train)
mutual_info = pd.Series(mutual_info)
mutual_info.index = x_numeric_train.columns
mutual_info.sort_values(ascending=False)

In [None]:
mutual_info.sort_values(ascending=False).plot(kind='bar')
plt.show()

In [None]:
X_train.drop('odometer', axis=1, inplace=True)
X_test.drop('odometer', axis=1, inplace=True)

In [None]:
x_numeric_train = X_train.select_dtypes('number')
x = sm.add_constant(x_numeric_train)
results = sm.OLS(y_train,x).fit()
results.summary()

In [None]:
categorical_preprocessor = Pipeline(
    steps = [('ohe', OneHotEncoder(drop='first'))]
)

In [None]:
numerical_preprocessor = Pipeline(
    steps = [('minmaxscaler' , MinMaxScaler())]
)

In [None]:
preprocessor = ColumnTransformer(
    [('odometer', categorical_preprocessor, list(X_train.select_dtypes('object').columns)),
    ('numerical', numerical_preprocessor, list(X_train.select_dtypes('number').columns))]   
)

In [None]:
preprocessor

In [None]:
OneHotEncoder(handle_unknown='ignore')

In [None]:
OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

In [None]:
X_train_preproccessed = preprocessor.fit_transform(X_train)
X_test_preproccessed = preprocessor.transform(X_test)

In [None]:
OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

In [None]:
X_train_preproccessed.shape

In [None]:
cval = KFold(n_splits=5, shuffle=True, random_state=99)

In [None]:
models = []
avg_errors = []

In [None]:
lr = LinearRegression()

In [None]:
errors_lr = -cross_val_score(estimator=lr,                  
                X=X_train_preproccessed,
                y=y_train,
                cv=cval,
                scoring='neg_root_mean_squared_error')

print('Errors {}'.format(errors_lr))
print()
print('Avg_error {}'.format(np.mean(errors_lr)))

avg_errors.append(np.mean(errors_lr))
models.append('Linear Regression')

In [None]:
neighbors = []
errors = []

for i in range(1, 20):
    knn = KNeighborsRegressor(n_neighbors=i)
    model = knn.fit(X_train_preproccessed, y_train)
    error = np.mean(-cross_val_score(estimator=model,                  
                        X=X_train_preproccessed,
                        y=y_train,
                        cv=cval,
                        scoring='neg_root_mean_squared_error'))
    neighbors.append(i)
    errors.append(error)

In [None]:
plt.plot(neighbors, errors)
plt.show()

In [None]:
np.argmin(errors)+1

In [None]:
knn = KNeighborsRegressor(n_neighbors=4)

In [None]:
errors_knn = -cross_val_score(estimator=knn,                  
                 X=X_train_preproccessed,
                 y=y_train,
                 cv=5,
                 scoring='neg_root_mean_squared_error')

print('Errors {}'.format(errors_knn))
print()
print('Avg_error {}'.format(np.mean(errors_knn)))

avg_errors.append(np.mean(errors_knn))
models.append('KNeighbors') 

In [None]:
dt = DecisionTreeRegressor(random_state=99)

In [None]:
errors_dt = -cross_val_score(estimator=dt,                  
                 X=X_train_preproccessed,
                 y=y_train,
                 cv=cval,
                 scoring='neg_root_mean_squared_error')

print('Errors {}'.format(errors_dt))
print()
print('Avg_error {}'.format(np.mean(errors_dt)))

avg_errors.append(np.mean(errors_dt))
models.append('Decision Tree')  

In [None]:
rf = RandomForestRegressor(random_state=99)

In [None]:
errors_rf = -cross_val_score(estimator=rf,                  
                 X=X_train_preproccessed,
                 y=y_train,
                 cv=cval,
                 scoring='neg_root_mean_squared_error')

print('Errors {}'.format(errors_rf))
print()
print('Avg_error {}'.format(np.mean(errors_rf)))

avg_errors.append(np.mean(errors_rf))
models.append('Random Forest')  

In [None]:
svr = SVR()

In [None]:
errors_svr = -cross_val_score(estimator=svr,                  
                 X=X_train_preproccessed,
                 y=y_train,
                 cv=cval,
                 scoring='neg_root_mean_squared_error')

print('Errors {}'.format(errors_svr))
print()
print('Avg_error {}'.format(np.mean(errors_svr)))

avg_errors.append(np.mean(errors_svr))
models.append('SVR')  

In [None]:
compare_models = pd.DataFrame({'Model':models, 'MSE':avg_errors}).sort_values(by='MSE', ascending=True)
compare_models

In [None]:
compare_models.plot(kind = 'bar')
plt.xlabel('Model')
plt.ylabel('Mean Squared Error (MSE)')
plt.title('Mean Squared Error for Different Models')
plt.xticks(range(len(compare_models)) ,compare_models['Model'] ,rotation=45, ha='right')  # Rotate x-axis labels for better visibility
plt.tight_layout()
plt.show()