In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import category_encoders as ce
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from scipy import stats

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Mercedes listings CSV from the Kaggle input directory.
merc_csv_path = '../input/used-car-dataset-ford-and-mercedes/merc.csv'
df = pd.read_csv(merc_csv_path)

In [None]:
# Preview the first rows to see the available columns and sample values.
df.head()

In [None]:
# (number of rows, number of columns) of the raw data.
df.shape

In [None]:
# Data type pandas inferred for each column when reading the CSV.
df.dtypes

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics; by default describe() reports on the numeric columns.
df.describe()

### Exploratory Data Analysis

In [None]:
# Number of listings per transmission type.
# The column is passed by keyword: seaborn 0.12+ interprets the first positional
# argument as `data` rather than `x`, so the positional form no longer produces
# a per-category count plot (0.11 already warned about it).
sns.countplot(x=df['transmission'])
plt.title('Number of cars per transmission type')
plt.show()

Most of the cars in the dataset have automatic or semi-automatic transmission; comparatively few have any other transmission type.

In [None]:
# Share of listings per model (count divided by the total number of rows),
# followed by a horizontal count plot of the same column.
model_share = df['model'].value_counts() / len(df)
print(model_share)

plt.figure(figsize=(8, 8))
sns.countplot(y=df['model'])
plt.show()

The three most common models are C Class, A Class, and E Class.

In [None]:
# Number of listings per fuel type.
# The column is passed by keyword: seaborn 0.12+ interprets the first positional
# argument as `data` rather than `x`, so the positional form no longer produces
# a per-category count plot (0.11 already warned about it).
sns.countplot(x=df['fuelType'])
plt.title('Number of cars per fuel type')
plt.show()

In [None]:
# Bar plot of price against registration year on a wide, white-background figure
# (seaborn's barplot aggregates y per x category).
plt.figure(figsize=(15, 5), facecolor='w')
sns.barplot(data=df, x='year', y='price')
plt.show()

In [None]:
# Bar plot of price against transmission type (seaborn's barplot aggregates y per x category).
sns.barplot(data=df, x='transmission', y='price')
plt.show()

In [None]:
# Pairwise relationships between the columns of the raw frame.
sns.pairplot(data=df)
plt.show()

### Feature Engineering 

In [None]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched.
df_new = df.copy(deep=True)

In [None]:
# Preview the working frame.
df_new.head()

In [None]:
# One-hot encode the categorical columns of the raw frame.
# drop_first=True removes one level per categorical column: keeping every level
# makes the dummy columns of each feature sum to 1, which is perfectly collinear
# with the intercept of the linear model fitted later (the dummy-variable trap).
df_new = pd.get_dummies(df, drop_first=True)

In [None]:
# Preview the frame after one-hot encoding.
df_new.head()

In [None]:
# (rows, columns) after encoding; the column count grows with the number of dummy columns.
df_new.shape

In [None]:
# Distribution of the raw price.
# sns.distplot is deprecated; histplot with a KDE overlay and density
# normalisation is its replacement (norm_hist=True corresponds to stat='density').
sns.histplot(df_new['price'], kde=True, stat='density')
plt.title('Histogram Before Transformation of data')
plt.show()
# Skewness and kurtosis quantify how asymmetric / heavy-tailed the raw price is.
print("Skewness: " + str(df_new['price'].skew()))
print("Kurtosis: " + str(df_new['price'].kurt()))

In [None]:
# Independent (deep) copy of the encoded frame that will hold the transformed target.
log_df = df_new.copy(deep=True)

In [None]:
# Replace the price column with log(1 + price), computed from the untransformed frame.
log_price = np.log1p(df_new['price'])
log_df['price'] = log_price

In [None]:
# Distribution of the log1p-transformed price.
# sns.distplot is deprecated; histplot with a KDE overlay and density
# normalisation is its replacement (norm_hist=True corresponds to stat='density').
sns.histplot(log_df['price'], kde=True, stat='density')
plt.title('Histogram After transformation of data')
plt.show()
# Skewness and kurtosis of the transformed price, printed in the same format as for the raw price.
print("Skewness: " + str(log_df['price'].skew()))
print("Kurtosis: " + str(log_df['price'].kurt()))

In [None]:
# Preview the frame whose price column now holds log1p(price).
log_df.head()

In [None]:
# Separate the target (log1p of price) from the feature columns.
target_column = 'price'
y = log_df[target_column]
X = log_df.drop(columns=[target_column])

### Model Training

In [None]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
lr = LinearRegression()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
TEST_FRACTION = 0.2
SPLIT_SEED = 5
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# Fit the regression on the training split (the target is log1p(price)).
lr.fit(X=x_train, y=y_train)

In [None]:
# R^2 on the held-out split; measured on the log1p(price) target, not on raw prices.
lr.score(x_test,y_test)

In [None]:
# Predictions for the held-out rows, on the log1p(price) scale.
y_pred = lr.predict(x_test)

In [None]:
# Mean squared error between true and predicted log1p(price).
mse = mean_squared_error(y_test,y_pred)

In [None]:
# Coefficient of determination on the held-out split (log1p(price) scale).
r2 = r2_score(y_test,y_pred)

In [None]:
# Root mean squared error; in log1p(price) units because the target was transformed.
rmse = np.sqrt(mean_squared_error(y_test,y_pred))

In [None]:
# Report the evaluation metrics, one per line. All of them are measured on the
# log1p(price) scale because the model was trained on the transformed target.
metric_lines = (
    "MSE:- {}".format(mse),
    "R2 Score:- {}".format(r2),
    "RMSE:- {}".format(rmse),
)
print("\n".join(metric_lines))

In [None]:
# Side-by-side table of predicted and actual prices for the held-out rows,
# converted back from the log1p scale with expm1.
# The two-column frame is built directly instead of copying every feature column
# and then discarding them. Both columns are rounded to 2 decimals, because
# `actual` has gone through a log1p -> expm1 round trip and would otherwise show
# floating-point noise.
results = pd.DataFrame(
    {
        'predicted': np.expm1(lr.predict(x_test)),
        'actual': np.expm1(y_test),
    },
    index=x_test.index,
).round(2)

In [None]:
# Display the predicted-versus-actual price table.
results