<a href="https://colab.research.google.com/github/sjcorp/notebooks/blob/master/ml_projects/ml_project_carprediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Data

In [30]:
# From Google Drive
gd_car_train = 'https://drive.google.com/file/d/1NI7dTxmekoaLR-4GHbmhSEw5_BDgq1S3/view?usp=sharing'
gd_car_test = 'https://drive.google.com/file/d/1lxwa3eK5T4XnU1A4qY730_q8m5-miV-e/view?usp=sharing'

In [31]:
cartrain = 'https://drive.google.com/uc?export=download&id='+gd_car_train.split('/')[-2]
cartest = 'https://drive.google.com/uc?export=download&id='+gd_car_test.split('/')[-2]

# Import Libraries

In [32]:
# Import Primary Libraries
import sys
import scipy
import sklearn
import numpy as np
import pandas as pd
import seaborn as sns
import re
import matplotlib.pyplot as plt
%matplotlib inline

# Import ML Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Load Visualization Libraries
from pandas.plotting import scatter_matrix

# Load Data Preprocessing Libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# Load Data Evaluation Libraries
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [33]:
df1 = pd.read_csv(cartrain)
df2 = pd.read_csv(cartest)

# Exploratory Data Analysis

### Number of Columns

In [37]:
print(df1.columns)
print(df2.columns)

Index(['Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
       'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats',
       'New_Price', 'Price'],
      dtype='object')
Index(['Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
       'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats',
       'New_Price'],
      dtype='object')


### Data Type of Features

In [45]:
print(df1.dtypes)
print(df2.dtypes)

Name                  object
Location              object
Year                   int64
Kilometers_Driven      int64
Fuel_Type             object
Transmission          object
Owner_Type            object
Mileage               object
Engine                object
Power                 object
Seats                float64
New_Price             object
Price                float64
dtype: object
Name                  object
Location              object
Year                   int64
Kilometers_Driven      int64
Fuel_Type             object
Transmission          object
Owner_Type            object
Mileage               object
Engine                object
Power                 object
Seats                float64
New_Price             object
dtype: object


### Number of Observations

In [48]:
print(df1.shape)
print(df2.shape)

(6019, 13)
(1234, 12)


### Check for Missing Values

In [49]:
print(df1.isnull().sum())
print(df2.isnull().sum())

Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 0
Engine                  0
Power                   0
Seats                  43
New_Price            5195
Price                   0
dtype: int64
Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 0
Engine                 10
Power                  10
Seats                  11
New_Price            1052
dtype: int64


### Explore Categorical Variables

In [56]:
#combining training set and test set data
all_brands = list(df1.Name) + list(df2.Name)
all_locations = list(df1.Location) + list(df2.Location)
all_fuel_types = list(df1.Fuel_Type) + list(df2.Fuel_Type)
all_transmissions = list(df1.Transmission) + list(df2.Transmission)
all_owner_types = list(df1.Owner_Type) + list(df2.Owner_Type)

print("\nNumber Of Unique Values In Name : \n ", len(set(all_brands)))
# print("\nThe Unique Values In Name : \n ", set(all_brands))

print("\nNumber Of Unique Values In Location : \n ", len(set(all_locations)))
print("\nThe Unique Values In Location : \n ", set(all_locations) )

print("\nNumber Of Unique Values In Fuel_Type : \n ", len(set(all_fuel_types)))
print("\nThe Unique Values In Fuel_Type : \n ", set(all_fuel_types) )

print("\nNumber Of Unique Values In Transmission : \n ", len(set(all_transmissions)))
print("\nThe Unique Values In Transmission : \n ", set(all_transmissions) )

print("\nNumber Of Unique Values In Owner_Type : \n ", len(set(all_owner_types)))
print("\nThe Unique Values In Owner_Type : \n" ,set(all_owner_types))


Number Of Unique Values In Name : 
  2041

Number Of Unique Values In Location : 
  11

The Unique Values In Location : 
  {'Ahmedabad', 'Mumbai', 'Chennai', 'Pune', 'Jaipur', 'Kolkata', 'Bangalore', 'Delhi', 'Hyderabad', 'Coimbatore', 'Kochi'}

Number Of Unique Values In Fuel_Type : 
  5

The Unique Values In Fuel_Type : 
  {'Diesel', 'LPG', 'Electric', 'CNG', 'Petrol'}

Number Of Unique Values In Transmission : 
  2

The Unique Values In Transmission : 
  {'Manual', 'Automatic'}

Number Of Unique Values In Owner_Type : 
  4

The Unique Values In Owner_Type : 
 {'First', 'Third', 'Fourth & Above', 'Second'}


# Data Cleaning

In [57]:
def restructure(data):
    
    # Splitting name into 2 features, brand and model
    
    names = list(data.Name)
    brand = []
    model = []

    for i in range(len(names)):
        try:
            brand.append(names[i].split(" ")[0])
            try:
                model.append(" ".join(names[i].split(" ")[1:]).strip())
            except:
                pass
        except:
            print("ERR ! - ", names[i], "@" , i)

    # Removing the  texts and converting to integer
    
    mileage = list(data.Mileage)

    for i in range(len(mileage)):
        try :
            mileage[i] = float(mileage[i].split(" ")[0].strip())
        except:
            mileage[i] = np.nan

    # Removing the  texts and converting to integer
    
    engine = list(data.Engine)
    for i in range(len(engine)):
        try :
            engine[i] = int(engine[i].split(" ")[0].strip())
        except:
            engine[i] = np.nan

    # Removing the texts and converting to integer
    
    power = list(data.Power)
    for i in range(len(power)):
        try :
            power[i] = float(power[i].split(" ")[0].strip())
        except:
            power[i] = np.nan

    # Filling New Price NaN values with 0 and converting everything to Lakhs
    
    data['New_Price'].fillna(0, inplace = True)

    newp = list(data['New_Price'])

    for i in range(len(newp)):
        if newp[i] == 0:
            newp[i] = float(newp[i])
            continue
        elif 'Cr' in newp[i]:
            newp[i] = float(newp[i].split()[0].strip()) * 100
        elif 'Lakh' in newp[i]:
            newp[i] = float(newp[i].split()[0].strip())

    #Re-ordering the columns

    restructured = pd.DataFrame({'Brand': brand,
    'Model':model,
    'Location': data['Location'],
    'Year':data['Year'] ,
    'Kilometers_Driven':data['Kilometers_Driven'],
    'Fuel_Type':data['Fuel_Type'],
    'Transmission':data['Transmission'],
    'Owner_Type':data['Owner_Type'],
    'Mileage':mileage,
    'Engine':engine,
    'Power':power,
    'Seats':data['Seats'],
    'New_Price':newp
    })

    if 'Price' in data.columns:
        restructured['Price'] = data['Price']
        return restructured

    else:
        return restructured

In [58]:
training_set = restructure(df1)
test_set = restructure(df1)

In [59]:
training_set.head()

Unnamed: 0,Brand,Model,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti,Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,0.0,1.75
1,Hyundai,Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,0.0,12.5
2,Honda,Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,8.61,4.5
3,Maruti,Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,0.0,6.0
4,Audi,A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,0.0,17.74


In [60]:
test_set.head()

Unnamed: 0,Brand,Model,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti,Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,0.0,1.75
1,Hyundai,Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,0.0,12.5
2,Honda,Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,8.61,4.5
3,Maruti,Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,0.0,6.0
4,Audi,A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,0.0,17.74


# Encoding Categorical Variables

### Finding Unique Categories

In [61]:
#'Brand', 'Model', 'Location','Fuel_Type', 'Transmission', 'Owner_Type'

all_brands = list(set(list(training_set.Brand) + list(test_set.Brand)))
all_models = list(set(list(training_set.Model) + list(test_set.Model)))
all_locations = list(set(list(training_set.Location) + list(test_set.Location)))
all_fuel_types = list(set(list(training_set.Fuel_Type) + list(test_set.Fuel_Type)))
all_transmissions = list(set(list(training_set.Transmission) + list(test_set.Transmission)))
all_owner_types = list(set(list(training_set.Owner_Type) + list(test_set.Owner_Type)))

### Initializing Label Encoders & Fitting Categories

In [62]:
#Initializing label encoders
from sklearn.preprocessing import LabelEncoder
le_brands = LabelEncoder()
le_models = LabelEncoder()
le_locations = LabelEncoder()
le_fuel_types = LabelEncoder()
le_transmissions = LabelEncoder()
le_owner_types = LabelEncoder()

#Fitting the categories
le_brands.fit(all_brands)
le_models.fit(all_models)
le_locations.fit(all_locations)
le_fuel_types.fit(all_fuel_types)
le_transmissions.fit(all_transmissions)
le_owner_types.fit(all_owner_types)

LabelEncoder()

# Transforming the Data in Training & test Set

In [63]:
#Applying encoding to Training_set data
training_set['Brand'] = le_brands.transform(training_set['Brand'])
training_set['Model'] = le_models.transform(training_set['Model'])
training_set['Location'] = le_locations.transform(training_set['Location'])
training_set['Fuel_Type'] = le_fuel_types.transform(training_set['Fuel_Type'])
training_set['Transmission'] = le_transmissions.transform(training_set['Transmission'])
training_set['Owner_Type'] = le_owner_types.transform(training_set['Owner_Type'])

#Applying encoding to Test_set data
test_set['Brand'] = le_brands.transform(test_set['Brand'])
test_set['Model'] = le_models.transform(test_set['Model'])
test_set['Location'] = le_locations.transform(test_set['Location'])
test_set['Fuel_Type'] = le_fuel_types.transform(test_set['Fuel_Type'])
test_set['Transmission'] = le_transmissions.transform(test_set['Transmission'])
test_set['Owner_Type'] = le_owner_types.transform(test_set['Owner_Type'])

In [64]:
training_set.head()

Unnamed: 0,Brand,Model,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,18,1643,9,2010,72000,0,1,0,26.6,998.0,58.16,5.0,0.0,1.75
1,10,460,10,2015,41000,1,1,0,19.67,1582.0,126.2,5.0,0.0,12.5
2,9,911,2,2011,46000,4,1,0,18.2,1199.0,88.7,5.0,8.61,4.5
3,18,620,2,2012,87000,1,1,0,20.77,1248.0,88.76,7.0,0.0,6.0
4,1,96,3,2013,40670,1,0,2,15.2,1968.0,140.8,5.0,0.0,17.74


In [65]:
test_set.head()

Unnamed: 0,Brand,Model,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,18,1643,9,2010,72000,0,1,0,26.6,998.0,58.16,5.0,0.0,1.75
1,10,460,10,2015,41000,1,1,0,19.67,1582.0,126.2,5.0,0.0,12.5
2,9,911,2,2011,46000,4,1,0,18.2,1199.0,88.7,5.0,8.61,4.5
3,18,620,2,2012,87000,1,1,0,20.77,1248.0,88.76,7.0,0.0,6.0
4,1,96,3,2013,40670,1,0,2,15.2,1968.0,140.8,5.0,0.0,17.74


# Imputing Missing Values

### Classifying Predictors and Target (Define X and Y)

In [66]:
# Dependent Variable
Y_train_data = training_set.iloc[:, -1].values

# Independent Variables
X_train_data = training_set.iloc[:,0 : -1].values

# Independent Variables for test Set
X_test = test_set.iloc[:,:].values

### Initializing & Fitting Imputer

In [67]:
from sklearn.impute import SimpleImputer

#Training Set Imputation
imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
imputer = imputer.fit(X_train_data[:,8:12])
X_train_data[:,8:12] = imputer.transform(X_train_data[:,8:12])

#Test_set Imputation
imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
imputer = imputer.fit(X_test[:,8:12])
X_test[:,8:12] = imputer.transform(X_test[:,8:12])

# Train Test Split

In [68]:
from sklearn.model_selection import train_test_split

#Splitting the training set into Training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(X_train_data, Y_train_data, test_size = 0.2, random_state = 1)

# Feature Scaling

In [69]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

#Scaling Original Training Data
X_train_data = sc.fit_transform(X_train_data)

#Reshaping vector to array for transforming
Y_train_data = Y_train_data.reshape((len(Y_train_data), 1))
Y_train_data = sc.fit_transform(Y_train_data)
#converting back to vector
Y_train_data = Y_train_data.ravel()

X_test = sc.transform(X_test)

# Scaling Splitted training and val sets
X_train = sc.fit_transform(X_train)
X_val = sc.fit_transform(X_val)

#Reshaping vector to array for transforming
Y_train = Y_train.reshape((len(Y_train), 1))
Y_train = sc.fit_transform(Y_train)
#converting back to vector
Y_train = Y_train.ravel()

# Modeling & Predicting

### Calculating Accuracy with RMLSE

In [70]:
# Score Calculation
def score(y_pred, y_true):
   error = np.square(np.log10(y_pred +1) - np.log10(y_true +1)).mean() ** 0.5
   score = 1 - error
   return score

#The actual recordings to be tested against
y_true = Y_val

### Testing the Model on Validation Sets

In [71]:
#Initializing Linear regressor
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

#Fitting the regressor with training data
lr.fit(X_train,Y_train)

#Predicting the target(Price) for predictors in validation set X_val
Y_pred = sc.inverse_transform(lr.predict(X_val))

#Eliminating negative values in prediction for score calculation
for i in range(len(Y_pred)):
   if Y_pred[i] < 0:
       Y_pred[i] = 0

#Printing the score for validation sets
print("\n\n Linear Regression SCORE : ", score(Y_pred, y_true))



 Linear Regression SCORE :  0.7709543691220707


### Predicting the Price for Test Set

In [75]:
#Initializing a new regressor
lr2 = LinearRegression()

#Fitting the regressor with complete training data(X_train_data,Y_train_data)
lr2.fit(X_train_data,Y_train_data)

#Predicting the target(Price) for predictors in the test data
Y_pred2 = sc.inverse_transform(lr2.predict(X_test))

#Eliminating negative values in prediction for score calculation
# for i in range(len(Y_pred2)):
#    if Y_pred2[i] < 0:
#        Y_pred2[i] = 0

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 13 is different from 14)