In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from scipy import stats
from scipy.stats import norm, skew #for some statistics
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score 
import seaborn as sns  #advanced visualization library
import requests, zipfile, io
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Folder holding the competition CSV files; kept in one place so the location
# only has to be changed once.
# NOTE(review): no cell in this notebook mounts Google Drive; presumably that is
# done before running it - confirm.
DATA_DIR = '/content/drive/My Drive/DATACEPT COMP4'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Data exploration and cleaning

In [None]:
# (rows, columns) of the training frame as loaded from train.csv.
train.shape

(4815, 13)

In [None]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,ID,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,1579,Hyundai Grand i10 Asta Option,Bangalore,2015,50000,Petrol,Manual,First,18.9 kmpl,1197 CC,82 bhp,5.0,5.75
1,4632,Hyundai Verna SX CRDi AT,Pune,2012,83000,Diesel,Automatic,First,19.08 kmpl,1582 CC,126.32 bhp,5.0,6.0
2,4455,Fiat Avventura MULTIJET Emotion,Hyderabad,2015,57266,Diesel,Manual,First,20.5 kmpl,1248 CC,91.72 bhp,5.0,6.5
3,2617,Maruti Swift Dzire ZXi,Ahmedabad,2012,55002,Petrol,Manual,First,17.5 kmpl,1298 CC,85.8 bhp,5.0,4.25
4,1714,Ford Figo Diesel EXI,Ahmedabad,2012,55005,Diesel,Manual,First,20.0 kmpl,1399 CC,68 bhp,5.0,2.75


In [None]:
# Share of missing values per training column, largest first.
# Despite the name, the values are fractions in [0, 1], not percentages.
null_counts = train.isna().sum()
missing_percentage = (null_counts / len(train)).sort_values(ascending=False)
missing_percentage

Seats                0.007892
Power                0.006854
Engine               0.006854
Mileage              0.000415
Price                0.000000
Owner_Type           0.000000
Transmission         0.000000
Fuel_Type            0.000000
Kilometers_Driven    0.000000
Year                 0.000000
Location             0.000000
Name                 0.000000
ID                   0.000000
dtype: float64

In [None]:
# Keep the test IDs aside; they label the rows of the submission files later on.
ids = test["ID"]

In [None]:
# Discard every training row that has at least one missing value,
# then take the target column from the rows that remain.
train = train.dropna(axis=0, how="any")
Y = train["Price"]

In [None]:
# Stack train on top of test with a fresh 0..n-1 index so that shared
# feature engineering can be applied to both at once.
all_data = pd.concat([train, test], ignore_index=True)


In [None]:
# Extract the brand (the first word) from the full model name

def f(x):
    """Return the first whitespace-separated word of ``x``.

    Applied to the ``Name`` column, this keeps only the leading word of the
    full model name (e.g. "Hyundai Grand i10 Asta Option" -> "Hyundai").
    Raises IndexError when ``x`` contains no non-whitespace characters.
    """
    first_word = x.split(maxsplit=1)[0]
    return first_word
# Replace each full model name by its first word, for train and test rows alike.
all_data["Name"] = all_data["Name"].map(f)



In [None]:
# Mean training price per Location, attached to every row of all_data as the
# feature `price_per_Location`.
#
# The previous spelling, .agg({'price_per_Location': 'mean'}), relied on dict
# renaming in SeriesGroupBy.agg, which raises SpecificationError on
# pandas >= 1.0. .mean().to_frame(name) gives the same one-column frame
# indexed by Location.
#
# NOTE(review): the mean is taken over all training rows, including those later
# held out for validation, so the validation score can be optimistic.
new = train.groupby('Location')['Price'].mean().to_frame('price_per_Location')

# Left join keeps every row of all_data, in its original order.
all_data = pd.merge(all_data, new, how='left', left_on='Location', right_index=True)

In [None]:
# Split the combined frame back into its two parts: all_data was built as
# concat(train, test), so the first len(Y) rows are the training rows.
# .copy() makes both frames independent of all_data; the cells below assign
# columns on them, which on a plain slice triggers SettingWithCopyWarning
# (hidden here because warnings are globally ignored).
train = all_data.iloc[:len(Y)].copy()
test = all_data.iloc[len(Y):].copy()


In [None]:
# Pull the first run of digits out of the Engine strings (e.g. "1197 CC" -> "1197").
engine_digits = train['Engine'].str.extract(r'(?P<digit>(\d+))')
L = engine_digits["digit"]


In [None]:
# Overwrite Engine with the extracted digits converted to numbers.
L = L.tolist()
train["Engine"] = pd.to_numeric(L)

In [None]:
# Turn Mileage strings such as "18.9 kmpl" into numbers.
# The previous pattern (\d+.\d+) used an unescaped dot, which matches any
# character, and required digits on both sides of it, so a whole-number value
# such as "18 kmpl" was not captured. The dot is now literal and the
# fractional part optional; values written with a decimal point are parsed
# exactly as before.
L = train['Mileage'].str.extract(r'(?P<digit>\d+(?:\.\d+)?)')["digit"]
L = L.tolist()
train["Mileage"] = L
train["Mileage"] = pd.to_numeric(train["Mileage"])

In [None]:
# Convert Power strings to numbers using the first run of digits.
# NOTE(review): the pattern stops at the decimal point, so "126.32 bhp"
# becomes 126. If this is changed, the identical extraction applied to the
# test set must change with it.
L = train['Power'].str.extract(r'(?P<digit>(\d+))')["digit"].tolist()
train["Power"] = pd.to_numeric(L)

In [None]:
# Display the target series (Price of the training rows kept after dropna).
Y

0        5.75
1        6.00
2        6.50
3        4.25
4        2.75
        ...  
4810     2.85
4811     3.41
4812     2.10
4813    19.64
4814     2.75
Name: Price, Length: 4775, dtype: float64

In [None]:
# Columns removed from the training features: the row identifier and the target.
to_drop = [
    "ID",
    "Price",
]

In [None]:
# Remove the identifier and target columns from the training features.
train = train.drop(columns=to_drop)

In [None]:
# One-hot encode the remaining string columns; each becomes a set of
# "<column>_<value>" indicator columns (e.g. Name -> Name_<brand>).
train=pd.get_dummies(train)

In [None]:
# Find the brands that occur in the training set but not in the test set
# (set difference: train brands - test brands); their dummy columns are
# removed from train further down.
#
# train has already been one-hot encoded at this point, so it no longer has a
# `Name` column and `train.Name` raises AttributeError on a fresh run. The
# brands are therefore recovered from the "Name_<brand>" dummy columns; the raw
# column is still used when it is present.
if "Name" in train.columns:
    train_brands = set(train["Name"].unique())
else:
    prefix = "Name_"
    train_brands = {col[len(prefix):] for col in train.columns if col.startswith(prefix)}

s2 = set(test["Name"].unique())
s = train_brands - s2
print(s)


In [None]:
# Dummy columns of the brands to remove from the training features
# (hard-coded list of "Name_<brand>" column names).
s = [
    'Name_Lamborghini',
    'Name_Force',
    'Name_Bentley',
    'Name_Smart',
    'Name_Isuzu',
    'Name_Ambassador',
]

In [None]:
# Remove the listed brand dummy columns from the training features.
# Raises KeyError if any of them is absent (e.g. when this cell is run twice).
train = train.drop(columns=s)

# Modelling

In [None]:
# Hold out 20% of the training rows for validation; the fixed random_state
# makes the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(
    train,
    Y,
    test_size=0.20,
    random_state=4,
)

In [None]:
# (rows, columns) of the encoded training frame that was passed to train_test_split.
train.shape

(4775, 52)

In [None]:
import xgboost as xgb

# XGBoost regressor fitted on the training split and scored on the hold-out split.
# NOTE(review): tree_method='gpu_hist' selects the GPU histogram algorithm, so
# this cell presumably needs a GPU runtime; newer XGBoost releases are
# understood to prefer tree_method='hist' with device='cuda' - confirm against
# the installed version.
xgb_model = xgb.XGBRegressor(
    gamma=0,
    learning_rate=0.1,
    max_depth=8,
    n_estimators=500,
    tree_method='gpu_hist',
)
xgb_model = xgb_model.fit(train_X, train_y)
val_pred_xgb = xgb_model.predict(val_X)

# sqrt(MSE) is the root-mean-squared error, not the MAE the old variable name
# suggested. Arguments are passed in scikit-learn's (y_true, y_pred) order.
val_rmse_xgb = np.sqrt(mean_squared_error(val_y, val_pred_xgb))
val_mae_xgb = val_rmse_xgb  # legacy name kept for backward compatibility; holds RMSE
print(val_rmse_xgb)


2.6289716642060705


In [None]:
import lightgbm as lgb

# LightGBM regressor fitted on the training split and scored on the hold-out split.
# NOTE(review): according to the LightGBM parameter docs, `subsample` only takes
# effect when `subsample_freq` is set to a non-zero value - confirm whether row
# subsampling is actually intended here.
lgb_model = lgb.LGBMRegressor(
    colsample_bytree=0.4,
    learning_rate=0.1,
    max_depth=8,
    min_child_weight=1.5,
    n_estimators=500,
    reg_alpha=0.45,
    reg_lambda=0.75,
    subsample=0.6,
    seed=42,
)
lgb_model.fit(train_X, train_y)
val_pred_lgb = lgb_model.predict(val_X)

# sqrt(MSE) is the root-mean-squared error, not the MAE the old variable name
# suggested. Arguments are passed in scikit-learn's (y_true, y_pred) order.
val_rmse_lgb = np.sqrt(mean_squared_error(val_y, val_pred_lgb))
val_mae_lgb = val_rmse_lgb  # legacy name kept for backward compatibility; holds RMSE
print(val_rmse_lgb)

3.0097721130456074


In [None]:
#Execute only once
pip install catboost

In [None]:
from catboost import CatBoostRegressor

# CatBoost regressor fitted on the training split and scored on the hold-out split.
# verbose=100 limits the training log to one line per 100 iterations instead
# of one line per iteration (500 lines of output).
model = CatBoostRegressor(
    n_estimators=500,
    learning_rate=0.1,
    depth=8,
    verbose=100,
)
model.fit(train_X, train_y)
val_pred = model.predict(val_X)

# sqrt(MSE) is the root-mean-squared error, not the MAE the old variable name
# suggested. Arguments are passed in scikit-learn's (y_true, y_pred) order.
val_rmse = np.sqrt(mean_squared_error(val_y, val_pred))
val_mae = val_rmse  # legacy name kept for backward compatibility; holds RMSE
print(val_rmse)

0:	learn: 10.5284302	total: 4.6ms	remaining: 2.29s
1:	learn: 9.8448724	total: 8.7ms	remaining: 2.17s
2:	learn: 9.2375161	total: 12.6ms	remaining: 2.09s
3:	learn: 8.6912379	total: 16.5ms	remaining: 2.04s
4:	learn: 8.2116652	total: 20.3ms	remaining: 2.01s
5:	learn: 7.7657699	total: 24ms	remaining: 1.98s
6:	learn: 7.3946130	total: 27.9ms	remaining: 1.96s
7:	learn: 7.0478327	total: 31.8ms	remaining: 1.96s
8:	learn: 6.6761544	total: 35.6ms	remaining: 1.94s
9:	learn: 6.3629844	total: 39.4ms	remaining: 1.93s
10:	learn: 6.0797727	total: 43.4ms	remaining: 1.93s
11:	learn: 5.8347311	total: 47.4ms	remaining: 1.93s
12:	learn: 5.5969326	total: 51.3ms	remaining: 1.92s
13:	learn: 5.4075417	total: 55.1ms	remaining: 1.91s
14:	learn: 5.1946666	total: 59ms	remaining: 1.91s
15:	learn: 5.0215398	total: 63ms	remaining: 1.91s
16:	learn: 4.8636627	total: 66.8ms	remaining: 1.9s
17:	learn: 4.7252373	total: 70.5ms	remaining: 1.89s
18:	learn: 4.5878788	total: 74.3ms	remaining: 1.88s
19:	learn: 4.4747174	total: 77

### Dealing with the test set

In [None]:
# Share of missing values per test column, largest first.
# Despite the name, the values are fractions in [0, 1], not percentages.
test_null_counts = test.isna().sum()
missing_percentage = (test_null_counts / len(test)).sort_values(ascending=False)
missing_percentage

Price                 1.000000
Seats                 0.003322
Power                 0.002492
Engine                0.002492
price_per_Location    0.000000
Year                  0.000000
Transmission          0.000000
Owner_Type            0.000000
Name                  0.000000
Mileage               0.000000
Location              0.000000
Kilometers_Driven     0.000000
ID                    0.000000
Fuel_Type             0.000000
dtype: float64

In [None]:
# List the distinct car brands in the test set. This listing is how it was
# noticed that the test set has fewer car company names than the training set
# (the original note quotes 53 versus 59).
test["Name"].unique()

array(['Mercedes-Benz', 'Maruti', 'Ford', 'Honda', 'Volkswagen',
       'Renault', 'Hyundai', 'Chevrolet', 'Audi', 'Mahindra', 'Toyota',
       'Skoda', 'BMW', 'Land', 'Mini', 'Mitsubishi', 'Tata', 'Jeep',
       'Jaguar', 'ISUZU', 'Porsche', 'Nissan', 'Datsun', 'Fiat', 'Volvo'],
      dtype=object)

In [None]:
# Turn Mileage strings such as "18.9 kmpl" into numbers.
# The previous pattern (\d+.\d+) used an unescaped dot, which matches any
# character, and required digits on both sides of it, so a whole-number value
# such as "18 kmpl" was not captured. The dot is now literal and the
# fractional part optional; values written with a decimal point are parsed
# exactly as before.
L = test['Mileage'].str.extract(r'(?P<digit>\d+(?:\.\d+)?)')["digit"]
L = L.tolist()
test["Mileage"] = L
test["Mileage"] = pd.to_numeric(test["Mileage"])

In [None]:
# Convert Engine strings (e.g. "1197 CC") to numbers using the first run of digits.
L = test['Engine'].str.extract(r'(?P<digit>(\d+))')["digit"].tolist()
test["Engine"] = pd.to_numeric(L)

In [None]:
# Convert Power strings to numbers using the first run of digits.
# NOTE(review): the pattern stops at the decimal point, so "126.32 bhp"
# becomes 126. If this is changed, the identical extraction applied to the
# training set must change with it.
L = test['Power'].str.extract(r'(?P<digit>(\d+))')["digit"].tolist()
test["Power"] = pd.to_numeric(L)

In [None]:
# Columns removed from the test features: the row identifier and the
# (entirely missing) target column.
to_drop2 = [
    "ID",
    "Price",
]

In [None]:
# Remove the identifier and target columns from the test features.
test = test.drop(columns=to_drop2)

In [None]:
# One-hot encode the test features, then align them with the encoded training
# frame: same columns, same order. A dummy column that exists only in train is
# added as all zeros, and one that exists only in test is dropped, so the
# fitted models always receive the feature layout they were trained on.
test = pd.get_dummies(test)
test = test.reindex(columns=train.columns, fill_value=0)

In [None]:
# (rows, columns) of the prepared test frame.
test.shape

(1204, 52)

# Testing

In [None]:
# Predict the test-set prices with the fitted XGBoost model.
test_preds_xgb = xgb_model.predict(test)

# Save the predictions, one row per test ID, as a CSV submission file.
xgb_submission = {'ID': ids, 'Price': test_preds_xgb}
output = pd.DataFrame(xgb_submission)
output.to_csv('/content/drive/My Drive/DATACEPT COMP4/sub_xgb.csv', index=False)

In [None]:
# Predict the test-set prices with the fitted LightGBM model.
test_preds_lgb = lgb_model.predict(test)

# Save the predictions, one row per test ID, as a CSV submission file.
# NOTE(review): this file uses a lowercase 'price' header while other
# submission cells write 'Price' - confirm which one the scorer expects.
lgb_submission = {'ID': ids, 'price': test_preds_lgb}
output = pd.DataFrame(lgb_submission)
output.to_csv('/content/drive/My Drive/DATACEPT COMP4/sub_lgb.csv', index=False)

### Best model

In [None]:
# Blend of the LightGBM and CatBoost predictions (weights 0.3 / 0.7).
# The CatBoost predictions are computed here because `test_preds` was otherwise
# only defined by a cell further down, which made this cell fail with a
# NameError on a fresh top-to-bottom run.
test_preds = model.predict(test)
test_preds_m5alet = 0.3*test_preds_lgb + 0.7*test_preds

# Save the blended predictions, one row per test ID, as a CSV submission file.
# NOTE(review): this file uses a lowercase 'price' header while other
# submission cells write 'Price' - confirm which one the scorer expects.
output = pd.DataFrame({'ID': ids, 'price': test_preds_m5alet})
output.to_csv('/content/drive/My Drive/DATACEPT COMP4/m5alet.csv', index=False)

In [None]:
# Predict the test-set prices with the fitted CatBoost model.
test_preds = model.predict(test)

# Save the predictions, one row per test ID, as a CSV submission file.
cat_submission = {'ID': ids, 'Price': test_preds}
output = pd.DataFrame(cat_submission)
output.to_csv('/content/drive/My Drive/DATACEPT COMP4/sub_cat.csv', index=False)