In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Load the Audi listings CSV and show its (rows, columns) shape.
audi_df = pd.read_csv('../input/used-car-dataset-ford-and-mercedes/audi.csv')
audi_df.shape

In [None]:
# Column labels of the loaded frame.
audi_df.columns

In [None]:
# Number of missing values in each column.
audi_df.isna().sum()

In [None]:
# First five rows.
audi_df.head()

In [None]:
# Number of rows for each distinct value of 'model'.
audi_df['model'].value_counts()

## **Grouping models with fewer than 100 rows into 'Others'**

In [None]:
# Tally the rows per model, then keep the labels of the models that occur
# fewer than 100 times.
audi_models = audi_df['model'].value_counts()
others = audi_models.loc[audi_models.lt(100)].keys()

others

In [None]:
# Collapse every rare model label into a single 'Others' bucket.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: the in-place form acts on
# an intermediate object, pandas warns about it, and under Copy-on-Write it
# leaves audi_df unchanged.
audi_df['model'] = audi_df['model'].replace(list(others), 'Others')

In [None]:
# Select the style before the figure is created so the figure itself is built
# with it, then draw on an explicit Axes.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(10, 10))
audi_df['model'].value_counts().plot(kind='barh', ax=ax)
ax.set_title('Audi Models')
ax.set_xlabel('Number of rows')
# value_counts() is sorted descending; inverting the y axis puts the most
# frequent model at the top.
ax.invert_yaxis()
plt.show()

In [None]:
# Select the style before the figure is created so the figure itself is built
# with it, then draw on an explicit Axes.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(10, 10))
audi_df['transmission'].value_counts().plot(kind='barh', ax=ax)
ax.set_title('Transmission Types')
ax.set_xlabel('Number of rows')
# value_counts() is sorted descending; inverting the y axis puts the most
# frequent transmission type at the top.
ax.invert_yaxis()
plt.show()

## Fuel Type 

In [None]:
# Number of rows for each fuel type.
audi_df['fuelType'].value_counts()

### Dropping the Hybrid fuel type because it has too few rows

In [None]:
# Keep every row whose fuel type is not 'Hybrid'. The explicit .copy() makes
# audi_df an independent frame, so the column assignments in later cells do not
# operate on a slice of the original (the cause of SettingWithCopyWarning).
audi_df = audi_df[audi_df['fuelType'] != 'Hybrid'].copy()

In [None]:
# Shape of the frame at this point in the notebook.
audi_df.shape

In [None]:
# Ten randomly drawn rows (no seed is passed, so the rows differ between runs).
audi_df.sample(10)

In [None]:
# Summary statistics for the listed columns.
audi_df[['price', 'mileage', 'tax', 'mpg', 'engineSize']].describe()

In [None]:
# Distinct values of 'year'.
audi_df['year'].unique()

In [None]:
# Number of rows for each year.
audi_df['year'].value_counts()

In [None]:
# Rows per year, stored for reuse by later cells.
year_count = audi_df['year'].value_counts()

# Total number of rows belonging to years that have fewer than 100 rows each.
year_count[year_count<100].sum()

## Replacing the years 1997-2012 with 'before 2013'

In [None]:
# Labels of the years that occur in fewer than 100 rows.
year_keys = year_count.loc[year_count.lt(100)].keys()
year_keys

In [None]:
# Pool every rare year into the single label 'before 2013'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: the in-place form acts on
# an intermediate object, pandas warns about it, and under Copy-on-Write it
# leaves audi_df unchanged.
audi_df['year'] = audi_df['year'].replace(list(year_keys), 'before 2013')

In [None]:
# Select the style before the figure is created so the figure itself is built
# with it, then draw on an explicit Axes.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(10, 10))
audi_df['year'].value_counts().plot(kind='barh', ax=ax)
ax.set_title('Year Frequency')
ax.set_xlabel('Number of rows')
# value_counts() is sorted descending; inverting the y axis puts the most
# frequent year at the top.
ax.invert_yaxis()
plt.show()

## Ranking the years 

In [None]:
years = audi_df['year'].value_counts().keys()

In [None]:
year_ranks = [i/10 for i in range(1,len(years)+1)]

In [None]:
audi_df['year'].replace(years, year_ranks, inplace=True)
audi_df['year'].value_counts()

In [None]:
audi_df.head()

## One Hot encoding 

In [None]:
# Categorical columns to expand into indicator columns. drop_first=True omits
# the first level of each column.
cols_to_encode = ['model', 'fuelType', 'transmission']
categorical_part = audi_df.loc[:, cols_to_encode]
df_dummies = pd.get_dummies(categorical_part, drop_first=True)

## Feature Scaling  

In [None]:
cols_to_scale = ['mileage', 'tax', 'mpg', 'engineSize']

scale = MinMaxScaler()
scale.fit(audi_df[cols_to_scale])

In [None]:
scaled = scale.fit_transform(audi_df[cols_to_scale])
scaled

In [None]:

i = 0
for col in cols_to_scale:
    audi_df[col] = scaled[:,i]
    i += 1

In [None]:
scaled_cols = audi_df[cols_to_scale]

In [None]:
df = pd.concat([df_dummies, scaled_cols, audi_df['year'],audi_df['price']], axis=1)

In [None]:
new_df = df[df['year']!=0.9]

In [None]:
new_df.shape

## Splitting and Training

In [None]:
x, y = new_df.drop('price', axis=1), new_df['price']

In [None]:
x.shape, y.shape

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.85,)

In [None]:
x_train.shape, x_test.shape

In [None]:
y_train.shape, y_test.shape

In [None]:
model = LinearRegression()
model.fit(x_train, y_train)

In [None]:
model.score(x_test, y_test)

In [None]:
model.score(x_train, y_train)

In [None]:
min_mil = new_df['mileage'].quantile(0.05)
max_mil = new_df['mileage'].quantile(0.95)

In [None]:
min_tax = new_df['tax'].quantile(0.05)
max_tax = new_df['tax'].quantile(0.95)

In [None]:
new_df2 = new_df[(new_df['mileage']>min_mil) & (new_df['mileage']<max_mil)]

In [None]:
new_df2.shape

In [None]:
new_df2 = new_df[(new_df['tax']>min_tax) & (new_df['tax']<max_tax)]
new_df2.shape

In [None]:
new_df2.head()

In [None]:
x2, y2 = new_df2.drop(['price'], axis=1), new_df2['price']

In [None]:
x2.shape, y2.shape

In [None]:
x_train2, x_test2, y_train2, y_test2 = train_test_split(x2, y2, train_size=0.85,)

In [None]:
x_train2.shape, x_test2.shape

In [None]:
y_train2.shape, y_test2.shape

In [None]:
# Second linear regression, fitted on the training split taken from new_df2.
model2 = LinearRegression()
model2.fit(x_train2, y_train2)

In [None]:
# model2's predictions for the held-out features and for the training features.
y_pred_test = model2.predict(x_test2)
y_pred_train = model2.predict(x_train2)

In [None]:
# Mean squared error on the held-out split.
mean_squared_error(y_test2, y_pred_test)

In [None]:
# Mean squared error on the training split.
mean_squared_error(y_train2, y_pred_train)

In [None]:
# model2's score on the held-out split.
model2.score(x_test2, y_test2)

In [None]:
# model2's score on the training split.
model2.score(x_train2, y_train2)

# **Cross-validation scores**

In [None]:
# Each cell below cross-validates one estimator on the full x2 / y2 with cv=10
# and displays the resulting array of scores.
reg = cross_val_score(LinearRegression(), x2, y2, cv=10)
reg

In [None]:
# Ridge with its default settings.
rid = cross_val_score(Ridge(), x2, y2, cv=10)
rid

In [None]:
# Lasso with its default settings.
las = cross_val_score(Lasso(), x2, y2, cv=10)
las

In [None]:
# Mean score of the three linear models, side by side.
reg.mean(), rid.mean(), las.mean()

In [None]:
# Support vector regression with a linear kernel.
svr = cross_val_score(SVR(kernel='linear'), x2, y2, cv=10)
svr

In [None]:
# k-nearest-neighbours regression with its default settings.
knr = cross_val_score(KNeighborsRegressor(), x2, y2, cv=10)
knr

In [None]:
# Same estimator with explicit settings; this overwrites the previous `knr`.
knr = cross_val_score(KNeighborsRegressor(n_neighbors=11,weights='distance',metric='euclidean'), x2, y2, cv=10)
knr

## As we can see, KNeighborsRegressor gives the highest score

In [None]:
# Fit a k-nearest-neighbours regressor with the same settings as the last
# cross-validation cell, using the training split.
kmodel = KNeighborsRegressor(n_neighbors=11,weights='distance',metric='euclidean')
kmodel.fit(x_train2, y_train2)

In [None]:
# kmodel's score on the held-out split.
kmodel.score(x_test2, y_test2)

In [None]:
# kmodel's score on the training split.
kmodel.score(x_train2, y_train2)

In [None]:
y_pred_test = model.predict(x_test2)
y_pred_train = model.predict(x_train2)

In [None]:
test_y = pd.DataFrame({'Y test':y_test2,'Y Pred':y_pred_test})
train_y = pd.DataFrame({'Y test':y_train2,'Y Pred':y_pred_train})


In [None]:
train_y.head(10)

## **Hyperparameter tuning**

In [None]:
# Search space for the k-nearest-neighbours regressor: 5 neighbour counts
# x 2 weighting schemes x 3 distance metrics.
grid = {
     'n_neighbors':[5, 11, 19, 35, 57],
     'weights':['distance','uniform'],
     'metric':['manhattan','euclidean','minkowski']
}
# Exhaustive search over the grid with cv=5.
clf = GridSearchCV(KNeighborsRegressor(),grid, cv=5)

In [None]:
# Run the search on the training split only.
clf.fit(x_train2, y_train2)

In [None]:
# Full table of per-combination results.
cv_result = pd.DataFrame(clf.cv_results_)
cv_result

In [None]:
# Compact view of the search results: all three tuned parameters plus the mean
# and standard deviation of the test score. The previous selection listed
# 'mean_test_score' twice and left out the tuned 'weights' parameter.
cv_result[['param_metric', 'param_n_neighbors', 'param_weights', 'params', 'mean_test_score', 'std_test_score']]

In [None]:
clf.best_estimator_

In [None]:
kmodel = KNeighborsRegressor(n_neighbors=7, weights='distance',metric='manhattan')
kmodel.fit(x_train2, y_train2)

In [None]:
x_train2, x_test2, y_train2, y_test2 = train_test_split(x2, y2, train_size=0.50,)

In [None]:
kmodel.score(x_test2, y_test2)

In [None]:
kmodel.score(x_train2, y_train2)

In [None]:
x_train2.shape

In [None]:
x_test2.shape

## The score reached 98% when the data was split 50/50

In [None]:
# Predict with kmodel, the k-nearest-neighbours model this section evaluates.
# `model` is the first LinearRegression from earlier in the notebook, so using
# it here would tabulate the wrong model's predictions.
y_pred_test = kmodel.predict(x_test2)
y_pred_train = kmodel.predict(x_train2)
# Actual vs. predicted price, one table per split. The actual-value column of
# the training table is labelled 'Y train' because it holds y_train2.
test_y = pd.DataFrame({'Y test':y_test2,'Y Pred':y_pred_test})
train_y = pd.DataFrame({'Y train':y_train2,'Y Pred':y_pred_train})

In [None]:
# Ten random rows of the test-split table (no seed is passed, so the rows
# differ between runs).
test_y.sample(10)

In [None]:
# Pairwise correlation between the columns of the test-split table.
test_y.corr()

In [None]:
# Ten random rows of the training-split table.
train_y.sample(10)

In [None]:
# Pairwise correlation between the columns of the training-split table.
train_y.corr()