<a href="https://colab.research.google.com/github/shirart/data_science_proj_1/blob/main/data_science_project_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns  #boxplot
import matplotlib.pyplot as plt
from sklearn import preprocessing

data = pd.read_csv('sample_data/train-data.csv')
filtered_data = data.drop(['New_Price','Unnamed: 0'] ,axis=1) # unnamed represents the index - unnecessary

# remove units from data
filtered_data['Mileage'] = filtered_data['Mileage'].str.strip('km/kgpl ')
filtered_data['Engine'] = filtered_data['Engine'].str.strip('CC ')
filtered_data['Power'] = filtered_data['Power'].str.strip('bhp ')

# removed high kilometers_driven values as it exceeded the standard range (over 1 milion)
filtered_data['Kilometers_Driven']=filtered_data['Kilometers_Driven'].where(filtered_data['Kilometers_Driven']<1*10**6)

data

In [None]:
data.info()

In [None]:
filtered_data

In [None]:
filtered_data.describe()

In [None]:
filtered_data.describe(include=['O'])

In [None]:
filtered_data["Name"].value_counts()


In [None]:
import collections
collections.Counter(list(filtered_data["Name"].value_counts().to_numpy()))

In [None]:
filtered_data["Location"].value_counts()

In [None]:
filtered_data["Fuel_Type"].value_counts()

In [None]:
# adjust the information for creating models (strings to values)
filtered_data= filtered_data.replace("null", np.nan)
filtered_data= filtered_data.replace("First", 1)
filtered_data= filtered_data.replace("Second", 2)
filtered_data= filtered_data.replace("Third", 3)
filtered_data= filtered_data.replace("Fourth & Above", 4)
filtered_data['Transmission'] = (filtered_data['Transmission'] == 'Manual').astype(float)

filtered_data['Name'] = filtered_data['Name'].apply(lambda x: x.split(" ")[0]) 
#print(filtered_data)

# label (simple) encoding:
le = preprocessing.LabelEncoder()
filtered_data['Fuel_Type']= le.fit_transform(filtered_data['Fuel_Type'])
filtered_data['Location']= le.fit_transform(filtered_data['Location'])
filtered_data['Name']= le.fit_transform(filtered_data['Name'])


filtered_data = filtered_data.astype(float)
filtered_data.describe()

In [None]:
filtered_data.isnull().sum() #* 100 / len(filtered_data)

In [None]:
####################################################################################################################
# graphic presentation for each feature
####################################################################################################################

items=['Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats', 'Transmission','Fuel_Type','Name','Owner_Type','Location','Price']
fig, axes = plt.subplots(nrows=len(items), figsize=(15,50))
ranges = [None, [filtered_data['Kilometers_Driven'].min(), filtered_data['Kilometers_Driven'].max()], None, [0,5000], None, [0, 9], None,None,None,None,None,[0,80]]
for i in range(len(items)):
  _ = axes[i].hist(filtered_data[items[i]], bins=50,range=ranges[i])
  axes[i].set_title(items[i], fontsize=16)
  
plt.show()

In [None]:
items=['Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats', 'Transmission','Fuel_Type','Name','Owner_Type','Location']
fig, axes = plt.subplots(nrows=len(items), figsize=(10,30))
ranges = [None, [filtered_data['Kilometers_Driven'].min(), filtered_data['Kilometers_Driven'].max()], None, None, None, [0, 9], None,None,None,None,None,None]
for i in range(len(items)):
  filtered_data.iloc[:,:-1].boxplot(column=[items[i]], ax=axes[i], fontsize = 16)

plt.show()

In [None]:
corr_matrix = filtered_data.corr()
fig, ax = plt.subplots(figsize=(9,8))
sns.heatmap(corr_matrix, annot=True,ax=ax)
plt.show()

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.tree import DecisionTreeRegressor, export_graphviz, plot_tree
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from tqdm import tqdm
from scipy import stats

####################################################################################################################
#preparing data:
####################################################################################################################

# filter out NaN values - option 1
filtered_data = filtered_data.dropna()

In [5]:
#fill nan's with SimpleImputer - option 2
imp=SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(filtered_data.iloc[:,:-1])
filtered_data = pd.DataFrame(imp.transform(filtered_data.iloc[:,:-1]))

In [None]:
# fill nan's regarding to correlation - option 3 
# filter out NaN values
filtered_data = filtered_data.dropna(subset=['Kilometers_Driven','Mileage',"Engine", "Seats"])
print(filtered_data.isnull().sum())
nan_indices = pd.isnull(filtered_data).to_numpy().any(1).nonzero()[0]

engine_values = []
engine_to_power = {}

# indices of power's nan values
for i in nan_indices:
  # finding engine values matching power nans
  engine_values.append(filtered_data.iloc[i,:]["Engine"])

for engine in set(engine_values):
  # finding power values which aren't nans matching values in engine_values
  engine_to_power[engine] = []
  for j in range(len(filtered_data["Engine"])):
      if filtered_data.iloc[j,:]["Engine"] == engine and not np.isnan(filtered_data.iloc[j,:]["Power"]):
        engine_to_power[engine].append(filtered_data.iloc[j,:]["Power"])
  if not engine_to_power[engine]:
    engine_to_power[engine] = np.nan
  else:
    engine_to_power[engine] = np.mean(engine_to_power[engine])

# fill power with mean of known power values matching engine key (noticed that there are few values for the same key)
for l in nan_indices:
  engine = filtered_data.iloc[l,:]["Engine"]
  # indicea of dataframe do not correspond to indices of Series (single column)
  filtered_data.iloc[l,:]["Power"] = engine_to_power[engine]

filtered_data = filtered_data.dropna()
print(filtered_data.isnull().sum())

In [14]:
####################################################################################################################
# split data:
#################################################################################################################### 
X = filtered_data.iloc[:,:-1]  #iloc - get part of the columes (or rows)
y = filtered_data["Price"]

In [15]:
##########################################################################################
def k_fold_validator(regressor, X_train_val, y_train_val, k):
  return np.mean(cross_val_score(regressor, X_train_val, y_train_val, cv=k, n_jobs=-1, scoring='neg_mean_squared_error'))

def ten_fold_validator(regressor, X_train_val, y_train_val):
  return k_fold_validator(regressor, X_train_val, y_train_val, 10)

def llo_validator(regressor, X_train_val, y_train_val):
  return k_fold_validator(regressor, X_train_val, y_train_val, X_train_val.shape[0])

def holdout_validator(regressor, X_train_val, y_train_val):
  X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=1)
  regressor.fit(X_train, y_train)
  return regressor.score(X_val, y_val)
##########################################################################################

def get_optimal_tree(X_train_val, X_test, y_train_val, y_test, validator_func):
  best_hyp = None
  best_score = None
  optimal_tree = None
  for max_depth in tqdm(range(5, 15)):
   for min_samples_split in range(2, 10):
    for min_samples_leaf in range(1, 5):
     for criterion in ('mse', 'friedman_mse', 'mae'):
       params = {'criterion': criterion, 'max_depth': max_depth, 'min_samples_leaf': min_samples_leaf, 'min_samples_split': min_samples_split}
       tree = DecisionTreeRegressor(**params)
       score = validator_func(tree, X_train_val, y_train_val)
       if best_score is None or score > best_score:
          best_hyp = params
          best_score = score
          optimal_tree = tree
  optimal_tree.fit(X_train_val, y_train_val)
  train_best_score =  optimal_tree.score(X_train_val, y_train_val)
  print(f'Best score wrt. training: { train_best_score}')
  best_score =  optimal_tree.score(X_test, y_test)
  print(f'Best score wrt. test: {best_score}')
  return optimal_tree, best_score, train_best_score, best_hyp
  


In [16]:
####################################################################################################################
#model 1 - DecisionTreeRegressor:
####################################################################################################################

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

validator_names = ["10 Fold", "Holdout"]
validators = [ten_fold_validator, holdout_validator]
optimal_trees = []
best_scores = []
train_best_scores = []
best_hyps = []

for i in range(len(validators)):
  print(f'Validating using {validator_names[i]}')
  optimal_tree, best_score, train_best_score, best_hyp = get_optimal_tree(X_train_val, X_test, y_train_val, y_test, validators[i])
  optimal_trees.append(optimal_tree)
  best_scores.append(best_score)
  train_best_scores.append(train_best_score)
  best_hyps.append(best_hyp)

best_score_index = np.argmax(best_scores)
optimal_tree = optimal_trees[best_score_index]
best_hyp = best_hyps[best_score_index]
print(f'Best params: {best_hyp}')

llo_score = llo_validator(DecisionTreeRegressor(**best_hyp), X_train_val, y_train_val)
print(f'LLO score: {llo_score}')


  0%|          | 0/10 [00:00<?, ?it/s]

Validating using 10 Fold


100%|██████████| 10/10 [33:43<00:00, 202.30s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

Best score wrt. training: 0.9092472296139016
Best score wrt. test: 0.7873657138133241
Validating using Holdout


100%|██████████| 10/10 [05:07<00:00, 30.70s/it]

Best score wrt. training: 0.9602598368844366
Best score wrt. test: 0.876637067205146
Best params: {'criterion': 'friedman_mse', 'max_depth': 12, 'min_samples_leaf': 2, 'min_samples_split': 8}





LLO score: -23.59125112793383


In [None]:
import graphviz
from IPython.display import SVG, display

# first node corresponds to feature with highest correlation to price (the feature is: power)
graph = graphviz.Source(export_graphviz(optimal_tree))
display(SVG(graph.pipe(format='svg')))


In [17]:
####################################################################################################################
def get_optimal_ridge(X, y, validator_func):
  X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1, random_state=1)
  best_hyp = None
  best_score = None
  optimal_rid = None
  for alpha in tqdm(range(1, 1000, 100)):
       rid = Ridge(alpha=alpha/1000.0)
       score = validator_func(rid, X_train_val, y_train_val)
       if best_score is None or score > best_score:
          best_hyp = alpha/1000.0
          best_score = score
          optimal_rid = rid
  optimal_rid.fit(X_train_val, y_train_val)
  train_best_score =  optimal_rid.score(X_train_val, y_train_val)
  print(f'Best score wrt. training: {train_best_score}')
  best_score =  optimal_rid.score(X_test, y_test)
  print(f'Best score wrt. test: {best_score}')
  return optimal_rid, best_score, train_best_score, best_hyp, y_test, X_test

In [18]:
####################################################################################################################
#model 2 - Ridge:
####################################################################################################################

validator_names = ["10 Fold", "LLO", "Holdout"]
validators = [ten_fold_validator, llo_validator, holdout_validator]
optimal_ridges = []
best_scores_r = []
train_best_scores_r = []
best_hyps = []
for i in range(len(validators)):
  print(f'Validating using {validator_names[i]}')
  optimal_ridge, best_score, train_best_score, best_hyp_r, y_test_r, X_test_r = get_optimal_ridge(X, y, validators[i])
  optimal_ridges.append(optimal_ridge)
  best_scores_r.append(best_score)
  train_best_scores_r.append(train_best_score)
  best_hyps.append(best_hyp_r)

best_score_index = np.argmax(best_scores_r)
optimal_ridge = optimal_ridges[best_score_index]
best_hyp = best_hyps[best_score_index]
print(f'Best alpha: {best_hyp}')


 10%|█         | 1/10 [00:00<00:00,  9.69it/s]

Validating using 10 Fold


100%|██████████| 10/10 [00:00<00:00, 12.42it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

Best score wrt. training: 0.7060644607543024
Best score wrt. test: 0.694945249787507
Validating using LLO


100%|██████████| 10/10 [02:55<00:00, 17.54s/it]
100%|██████████| 10/10 [00:00<00:00, 159.15it/s]

Best score wrt. training: 0.7060644607543024
Best score wrt. test: 0.694945249787507
Validating using Holdout
Best score wrt. training: 0.7060644607543024
Best score wrt. test: 0.694945249787507
Best alpha: 0.901





In [19]:
####################################################################################################################
# mse - models:
####################################################################################################################

from sklearn.metrics import mean_squared_error

y_true = y_test
y_pred = optimal_tree.predict(X_test)

# compute mse manually
errors = y_pred-y_test
errors = [x**2 for x in errors]
mse = np.mean(errors)
print(mse)

# compute mse automatically (with sklearn)
mse = mean_squared_error(y_true, y_pred)
print(mse)

15.120717305255404
15.120717305255404


In [20]:
from sklearn.metrics import mean_squared_error
y_true_r = y_test_r
y_pred_r = optimal_ridge.predict(X_test_r)

mse_r = mean_squared_error(y_true_r , y_pred_r)
print(mse_r)

37.39086398228707


In [21]:
####################################################################################################################
# comparing models:
####################################################################################################################
from IPython.display import display, HTML

print("Train results:")
df = pd.DataFrame({"10-Fold": [train_best_scores[0], train_best_scores_r[0]], "LLO": [None, train_best_scores_r[1]], "Holdout": [train_best_scores[1], train_best_scores_r[2]]}, index=['Decision Tree Regressor', 'Ridge'])
display(HTML(df.to_html()))

# The MSE is computed with respect to the optimal hyper-parameters on the test set
print("Test results:")
df = pd.DataFrame({"10-Fold": [best_scores[0], best_scores_r[0]], "LLO": [None, best_scores_r[1]], "Holdout": [best_scores[1], best_scores_r[2]], "MSE": [mse, mse_r]}, index=['Decision Tree Regressor', 'Ridge'])
display(HTML(df.to_html()))


Train results:


Unnamed: 0,10-Fold,LLO,Holdout
Decision Tree Regressor,0.909247,,0.96026
Ridge,0.706064,0.706064,0.706064


Test results:


Unnamed: 0,10-Fold,LLO,Holdout,MSE
Decision Tree Regressor,0.787366,,0.876637,15.120717
Ridge,0.694945,0.694945,0.694945,37.390864
