In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [2]:
# df = pd.read_csv("training_cleaned.csv")
df = pd.read_csv("D:/DM_Project_Dataset/training_cleaned.csv")
# df.drop(columns=['AuctionAVG','RetailAVG','Trend'], inplace=True)
df.head()

Unnamed: 0,RefId,IsBadBuy,PurchDate,Auction,VehicleAge,Make,Model,SubModel,Color,Transmission,...,TopThreeAmericanName,BYRNO,VehBCost,IsOnlineSale,WarrantyCost,IsBase,Region,AuctionAVG,RetailAVG,Trend
0,326,1,2010-10-25,ADESA,2,KIA,SPECTRA,MEDIUM,BLUE,AUTO,...,OTHER,5546,6100.0,0,533,NO,SAT,7128.5,10300.0,-0.012594
1,42991,0,2009-05-27,MANHEIM,3,DODGE,STRATUS V6 2.7L V6 M,MEDIUM,SILVER,AUTO,...,CHRYSLER,99750,4000.0,0,1630,NO,SAT,5970.5,6948.0,0.0
2,55273,0,2010-03-18,OTHER,2,DODGE,CALIBER,MEDIUM,BLACK,AUTO,...,CHRYSLER,99761,7500.0,0,693,NO,WSC,8061.5,11199.0,-0.022092
3,29058,0,2009-06-08,OTHER,4,FORD,FREESTAR FWD V6 3.9L,VAN,BROWN,AUTO,...,FORD,99761,4725.0,0,1633,NO,MNT,5737.5,6696.5,0.0
4,34991,0,2009-03-04,MANHEIM,4,CHRYSLER,TOWN & COUNTRY FWD V,VAN,BLUE,AUTO,...,CHRYSLER,20833,5670.0,0,1623,YES,WSC,6061.0,7046.0,0.004692


## Normalisation and Encoding

In [3]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Identify the string columns
string_cols = df.select_dtypes(include='object').columns.tolist()

# Loop over each string column and encode it using a LabelEncoder
for col in string_cols:
    if col in df.columns:
        label_encoder = LabelEncoder()
        df[col] = label_encoder.fit_transform(df[col])
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_norm = scaler.fit_transform(df)

# K-means

In [5]:
kmeans = KMeans(n_clusters=3)
kmeans.fit(df_norm)

# Get the cluster labels for each data point
labels = kmeans.labels_

# Get the importance of each feature based on the k-means clustering
importance = kmeans.cluster_centers_.std(axis=0)

In [6]:
# Get the indices of the most important features
top_n_indices = importance.argsort()[::-1][:6]

# Get the column names for the most important features
top_n_features = df.columns[top_n_indices].tolist()

df_selected = df[top_n_features]
df_selected.columns.tolist()

['Nationality',
 'TopThreeAmericanName',
 'Make',
 'RetailAVG',
 'AuctionAVG',
 'VehBCost']

Testing accuracy with decision tree

In [7]:
X = df_selected
y = df['IsBadBuy']

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = {
    'criterion' : ['gini', 'entropy'],
    'max_depth': [2, 5, 10, 15, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 5, 10, 20],
}

#scores = ['precision', 'recall', 'f1']
scores = ['recall', 'f1']

for score in scores:
    print("# Tuning hyper-parameters for ----> %s" % score)
    print()
    
    obj = tree.DecisionTreeClassifier()
    
    if (score == "recall"):
      clf = GridSearchCV(obj, tuned_parameters, cv=5, scoring=score)
    else:
      clf = GridSearchCV(obj, tuned_parameters, cv=5, scoring='%s_macro' % score)
                         
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for ----> recall

Best parameters set found on development set:

{'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}

Grid scores on development set:

0.000 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.000 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 5}
0.000 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 10}
0.000 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 20}
0.000 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 2}
0.000 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 5}
0.000 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 10}
0.000 (+/-0.001) for {'criterion': 'gini',

Best parameters set found on development set:

{'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 5, 'min_samples_split': 5}

Grid scores on development set:

0.468 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.468 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 5}
0.468 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 10}
0.468 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 20}
0.468 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 2}
0.468 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 5}
0.468 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 10}
0.468 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min

# Density based clustering

In [8]:
from sklearn.cluster import DBSCAN

# Perform DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan.fit(df_norm)

# Get the cluster labels for each data point
labels = dbscan.labels_

# Compute the importance of each feature based on the DBSCAN clustering
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
importance = np.zeros(df_norm.shape[1])
for i in range(df_norm.shape[1]):
    feature_values = df_norm[:,i]
    feature_importance = np.zeros(n_clusters)
    for j in range(n_clusters):
        cluster_mask = (labels == j)
        n_points_in_cluster = np.sum(cluster_mask)
        if n_points_in_cluster > 0:
            cluster_mean = np.mean(feature_values[cluster_mask])
            cluster_std = np.std(feature_values[cluster_mask])
            feature_importance[j] = cluster_std / cluster_mean
    importance[i] = np.mean(feature_importance)

# Get the indices of the most important features
top_n_indices = importance.argsort()[::-1][:6]

# Get the column names for the most important features
top_n_features = df.columns[top_n_indices].tolist()

df_selected = df[top_n_features]

df_selected.columns.tolist()

['AuctionAVG', 'Color', 'RetailAVG', 'VehBCost', 'PurchDate', 'RefId']

Testing accuracy with decision tree

In [10]:
X = df_selected
y = df['IsBadBuy']

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = {
    'criterion' : ['gini', 'entropy'],
    'max_depth': [2, 5, 10, 15, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 5, 10, 20],
}

#scores = ['precision', 'recall', 'f1']
scores = ['recall', 'f1']

for score in scores:
    print("# Tuning hyper-parameters for ----> %s" % score)
    print()
    
    obj = tree.DecisionTreeClassifier()
    
    if (score == "recall"):
      clf = GridSearchCV(obj, tuned_parameters, cv=5, scoring=score)
    else:
      clf = GridSearchCV(obj, tuned_parameters, cv=5, scoring='%s_macro' % score)
                         
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for ----> recall

Best parameters set found on development set:

{'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}

Grid scores on development set:

0.000 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.000 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 5}
0.000 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 10}
0.000 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 20}
0.000 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 2}
0.000 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 5}
0.000 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 10}
0.000 (+/-0.001) for {'criterion': 'gini',

Best parameters set found on development set:

{'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10}

Grid scores on development set:

0.468 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.468 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 5}
0.468 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 10}
0.468 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 20}
0.468 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 2}
0.468 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 5}
0.468 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 10}
0.468 (+/-0.001) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 

# Based on charts and correlation

In [12]:
# df = pd.read_csv("D:/DM_Project_Dataset/training_cleaned.csv")
df_clas = df.drop(columns=['RefId',
        'PurchDate',
        'IsBadBuy',
        'Auction',
        'Make',
        'Model',
        'WheelType',
        'Nationality',
        'Size',
        'Color',
        'Transmission',
        'BYRNO',
        'TopThreeAmericanName',
        'SubModel',
        'VehicleAge',
        'IsOnlineSale',
        'Region',
        'IsBase',])
df_clas.dropna(inplace=True)
len(df_clas)
df_clas.head()


Unnamed: 0,VehOdo,VehBCost,WarrantyCost,AuctionAVG,RetailAVG,Trend
0,61184,6100.0,533,7128.5,10300.0,-0.012594
1,92383,4000.0,1630,5970.5,6948.0,0.0
2,45965,7500.0,693,8061.5,11199.0,-0.022092
3,80169,4725.0,1633,5737.5,6696.5,0.0
4,77372,5670.0,1623,6061.0,7046.0,0.004692


In [13]:
X = df_clas
y = df['IsBadBuy']

In [14]:
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

# Set the parameters by cross-validation
tuned_parameters = {
    'criterion' : ['gini', 'entropy'],
    'max_depth': [2, 5, 10, 15, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 5, 10, 20],
}

#scores = ['precision', 'recall', 'f1']
scores = ['recall', 'f1']

for score in scores:
    print("# Tuning hyper-parameters for ----> %s" % score)
    print()
    
    obj = tree.DecisionTreeClassifier()
    
    if (score == "recall"):
      clf = GridSearchCV(obj, tuned_parameters, cv=5, scoring=score)
    else:
      clf = GridSearchCV(obj, tuned_parameters, cv=5, scoring='%s_macro' % score)
                         
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for ----> recall

Best parameters set found on development set:

{'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}

Grid scores on development set:

0.000 (+/-0.000) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.000 (+/-0.000) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 5}
0.000 (+/-0.000) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 10}
0.000 (+/-0.000) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 20}
0.000 (+/-0.000) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 2}
0.000 (+/-0.000) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 5}
0.000 (+/-0.000) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 10}
0.000 (+/-0.000) for {'criterion': 'gini',

Best parameters set found on development set:

{'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 5, 'min_samples_split': 5}

Grid scores on development set:

0.467 (+/-0.000) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.467 (+/-0.000) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 5}
0.467 (+/-0.000) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 10}
0.467 (+/-0.000) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 20}
0.467 (+/-0.000) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 2}
0.467 (+/-0.000) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 5}
0.467 (+/-0.000) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 10}
0.467 (+/-0.000) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, '

# Hierarchial Clustering

In [15]:
# from sklearn.cluster import AgglomerativeClustering
# from sklearn.utils import shuffle

# # Select a random sample of the data
# n_samples = 10000
# df_norm_sample = shuffle(df_norm, n_samples=n_samples, random_state=42)

# # Compute the pairwise distance matrix
# distance_matrix = np.zeros((n_samples, n_samples))
# for i in range(n_samples):
#     for j in range(i+1, n_samples):
#         distance_matrix[i, j] = np.linalg.norm(df_norm_sample[i,:] - df_norm_sample[j,:])
#         distance_matrix[j, i] = distance_matrix[i, j]

# # Perform agglomerative clustering using the distance matrix
# agglo = AgglomerativeClustering(n_clusters=3, affinity='precomputed', linkage='complete')
# agglo.fit(distance_matrix)

# # Get the cluster labels for each data point
# labels = agglo.labels_

# # Compute the importance of each feature based on the agglomerative clustering
# importance = np.zeros(df_norm.shape[1])
# for i in range(n_samples):
#     importance += (labels[i] == agglo.labels_) * df_norm_sample[i,:]
# importance /= np.sum(labels == agglo.labels_, axis=0)

# # Get the indices of the most important features
# top_n_indices = importance.argsort()[::-1][:5]

# # Get the column names for the most important features
# top_n_features = df.columns[top_n_indices].tolist()

# df_selected = df[top_n_features]
