In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
%matplotlib inline
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR

import statsmodels.api as sm

import warnings
# NOTE: this silences every warning for the rest of the kernel session,
# including deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

# Show all columns when a DataFrame is displayed. The canonical option key is
# 'display.max_columns'; the previous spelling 'display.max.columns' only
# resolved because pandas treats option names as regex patterns.
pd.set_option('display.max_columns', None)
import os
# Print the path of every file found under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load merc.csv into a DataFrame; the path is relative to the notebook's
# working directory.
df = pd.read_csv('../input/used-car-dataset-ford-and-mercedes/merc.csv')

In [None]:
# Preview the first five rows.
df.head(5)

In [None]:
# (rows, columns) of the raw frame, before duplicates are removed.
df.shape

In [None]:
# Drop exact duplicate rows (keeping the first occurrence) and rebuild a
# contiguous 0..n-1 index. This rebinds `df`, so re-running earlier cells
# afterwards shows the de-duplicated frame.
df = df.drop_duplicates(keep='first').reset_index(drop=True)

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Summary statistics as produced by DataFrame.describe() with its defaults.
df.describe()

In [None]:
# Count of rows per transmission value. The column is passed as `x=` because
# seaborn >= 0.12 accepts only `data` positionally; a bare positional Series
# is no longer interpreted as the x variable.
sns.countplot(x=df["transmission"])

In [None]:
# Determine columns by data types.
# `np.object` was a deprecated alias of the builtin `object` and was removed
# in NumPy 1.24, so compare against `object` directly.
cat_mask = (df.dtypes == object)
num_mask = (df.dtypes == np.float64) | (df.dtypes == np.int64)

# Column names selected by each boolean mask.
cat_cols = df.columns[cat_mask].tolist()
num_cols = df.columns[num_mask].tolist()

print(f'Categorical columns: {cat_cols}')
print(f'Numerical columns: {num_cols}')

In [None]:
print(df["model"].value_counts() / len(df))
sns.countplot(y = df["model"])

In [None]:
sns.countplot(df["fuelType"])

In [None]:
sns.countplot(y = df["year"])

In [None]:
# Remove leading spaces
# (str.strip(' ') removes space characters from both ends of each value.)
df['model'] = df['model'].str.strip(' ')

In [None]:
# Boxplot of model and price
fig, ax = plt.subplots(figsize=(15,8))
# Sort the model names so the x-axis order is deterministic.
order = sorted(list(df['model'].unique()))
sns.boxplot(x='model', y='price', data=df, order=order, ax=ax)
# Rotate the tick labels of the current axes by 45 degrees.
plt.xticks(rotation=45)
plt.title('Price by Model', fontsize=16)
plt.show()

In [None]:
plt.figure(figsize=(15,10),facecolor='w') 
sns.scatterplot(df["mileage"], df["price"], hue = df["year"])

In [None]:
plt.figure(figsize=(15,5),facecolor='w') 
sns.scatterplot(df["mileage"], df["price"], hue = df["fuelType"])

In [None]:
sns.pairplot(df)

In [None]:
#df = df[df['year'] <= 2020]
#df['age'] = 2020 - df['year']
#df = df.drop(['year'], axis=1)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the split is seeded for
# reproducibility.
train_and_val, test = train_test_split(df, test_size=0.2, random_state=0)
print(f'Training and validation set size: {train_and_val.shape}')
print(f'Test set size: {test.shape}')

In [None]:
# Split the remaining 80% again: 25% of it becomes the validation set, i.e.
# roughly 60% train / 20% validation / 20% test of the original rows.
train, val = train_test_split(train_and_val, test_size=0.25, random_state=0)
print(f'Training set size: {train.shape}')
print(f'Validation set size: {val.shape}')

In [None]:
# Recompute the categorical / numerical column lists from the dtypes of `df`.
# `np.object` was a deprecated alias of the builtin `object` and was removed
# in NumPy 1.24, so compare against `object` directly.
cat_mask = (df.dtypes == object)
num_mask = (df.dtypes == np.float64) | (df.dtypes == np.int64)

cat_cols = df.columns[cat_mask].tolist()
num_cols = df.columns[num_mask].tolist()

print(f'Categorical columns: {cat_cols}')
print(f'Numerical columns: {num_cols}')

In [None]:
def skew_df(data: pd.DataFrame, skew_limit: float) -> pd.DataFrame:
    """Return the columns of ``data`` whose absolute skewness exceeds a limit.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose per-column skewness is computed with ``DataFrame.skew()``.
    skew_limit : float
        Threshold; a column is kept when ``abs(skew) > skew_limit``.

    Returns
    -------
    pd.DataFrame
        One row per retained column (indexed by column name) with a single
        ``'Skew'`` column, ordered from the largest to the smallest skew value.
    """
    # Rank every column by its skewness, highest first.
    ranked = data.skew().sort_values(ascending=False).to_frame('Skew')

    # Keep only the rows whose magnitude is strictly above the limit.
    exceeds_limit = ranked['Skew'].abs() > skew_limit
    return ranked[exceeds_limit]
# Numerical columns of the training split whose |skew| exceeds 0.75; the bare
# name on the last line displays the result as the cell output.
skew_cols = skew_df(train[num_cols], 0.75)
skew_cols

In [None]:
# Apply square root transformation on predictors only.
# The frame is built from `train` (not the full `df`) so that it matches the
# rows `skew_cols` was computed on and keeps validation/test rows out of the
# skew re-check and anything derived from `train_sqrt`.
train_sqrt = train[num_cols].drop('price', axis=1).copy()
for col in skew_cols.index:
    # `price` is the target and was dropped above, so skip it if it is listed.
    if col != 'price':
        train_sqrt[col] = np.sqrt(train_sqrt[col])

# Check again
skew_df(train_sqrt, 0.75)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):
    """Compute the variance inflation factor (VIF) for every column of ``X``.

    Parameters
    ----------
    X : pd.DataFrame
        Predictor columns; its ``.values`` array is handed to statsmodels'
        ``variance_inflation_factor`` as-is.

    Returns
    -------
    pd.DataFrame
        Two columns: ``'variables'`` (the column names of ``X``) and ``'VIF'``
        (the factor for the column at the same position).
    """
    # NOTE(review): X is used as the design matrix unchanged, so no intercept
    # column is added here - confirm that is intended for the VIF reading.
    design = X.values
    scores = [
        variance_inflation_factor(design, position)
        for position in range(X.shape[1])
    ]

    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = scores
    return vif

In [None]:
# VIF of each square-root-transformed numerical predictor.
calc_vif(train_sqrt)

In [None]:
# Strip spaces from both ends of the model names. Note that `df` is rebound
# to a copy of `train_and_val` in the next cell.
df['model'] = df['model'].str.strip(' ')

In [None]:
# From here on `df` is a copy of the train+validation split, so the
# held-out `test` rows are not part of the modelling frame.
df = train_and_val.copy()

In [None]:
from sklearn.preprocessing import LabelEncoder
# One-hot encode `model`; drop_first=True omits the first category's column.
Model=pd.get_dummies(df['model'],drop_first=True)

In [None]:
# Display the dummy-variable frame.
Model

In [None]:
# Replace the original `model` column ...
df=df.drop(['model'],axis=1)

In [None]:
# ... with its dummy columns, aligned on the row index.
df=pd.concat([df,Model],axis=1)

In [None]:
# Display the combined frame.
df

In [None]:
# `year` is removed from the feature set.
df=df.drop(['year'],axis=1)

In [None]:
# Integer-encode `transmission`. The same encoder object is re-fitted for
# `fuelType` below, so after that cell `le` only holds the fuelType classes.
# NOTE(review): integer codes impose an ordering on these categories -
# confirm that is acceptable for the linear model fitted later.
le=LabelEncoder()
df['transmission']=le.fit_transform(df['transmission'])

In [None]:
# Integer-encode `fuelType` (re-fits `le`).
df['fuelType']=le.fit_transform(df['fuelType'])

In [None]:
# Feature matrix: every column except the target.
x=df.drop(['price'],axis=1)

In [None]:
# Target vector.
y=df['price']

In [None]:
from sklearn.model_selection import train_test_split
# Seeded 67/33 split of x and y, which were derived from the train+validation
# copy of the data.
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=1,test_size=0.33)


In [None]:
from sklearn.linear_model import LinearRegression
# Standardise the features, then fit an ordinary least-squares regression.
pipe = Pipeline([('scaler', StandardScaler()), ('LinReg', LinearRegression())])

In [None]:
# Fit the scaler and the regression on the training part only.
pipe.fit(x_train, y_train)

In [None]:
# Score of the fitted pipeline on the training data (R^2 for scikit-learn
# regressors).
pipe.score(x_train, y_train)

In [None]:
# Predictions for the held-out part of the split.
y_pred = pipe.predict(x_test)

In [None]:
from sklearn.metrics import r2_score
# R^2 of the pipeline on the held-out part, kept in `r2_lin`.
r2_lin = r2_score(y_test, y_pred)

In [None]:
def plot_learning_curves(model, x, y, random_state=None):
    """Plot training and validation RMSE as the training subset grows.

    The data is split 70/30 into a training and a validation part. ``model``
    is then re-fitted on the first ``m`` training rows for ``m = 1 .. n_train``
    and the RMSE on those ``m`` rows and on the whole validation part is
    recorded. Both curves are drawn once, after all fits are done, on the
    current matplotlib axes.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is re-fitted in
        place on every iteration.
    x, y : sliceable features and targets
        Passed to ``train_test_split`` and then sliced with ``[:m]``.
    random_state : optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original unseeded behaviour; pass an int for a reproducible curve.

    Returns
    -------
    None
    """
    X_train, X_val, y_train, y_val = train_test_split(
        x, y, test_size=0.3, random_state=random_state
    )
    train_errors, val_errors = [], []
    # Upper bound is inclusive of the full training part so the last point
    # of the curve uses every training row.
    for m in range(1, len(X_train) + 1):
        model.fit(X_train[:m], y_train[:m])
        y_train_pred = model.predict(X_train[:m])
        y_val_pred = model.predict(X_val)
        # mean_squared_error expects (y_true, y_pred).
        train_errors.append(mean_squared_error(y_train[:m], y_train_pred))
        val_errors.append(mean_squared_error(y_val, y_val_pred))

    # Draw once, outside the loop: plotting inside it re-drew both growing
    # curves on every iteration and stacked hundreds of line artists.
    sizes = np.arange(1, len(train_errors) + 1)
    plt.plot(sizes, np.sqrt(train_errors), 'r--', linewidth=2, label='train')
    plt.plot(sizes, np.sqrt(val_errors), 'b--', linewidth=2, label='val')
    plt.ylabel('RMSE')
    plt.xlabel('Number of samples')
    # The curves carry labels, so show them.
    plt.legend()

# Learning curves for a plain (unscaled) linear regression, using only the
# first 200 rows of x and y.
linReg = LinearRegression()
plot_learning_curves(linReg, x[:200], y[:200])