In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.cluster import DBSCAN
from numpy import unique, where
import matplotlib.pyplot as plt

import os
os.chdir('../../')

from modules import preproc
from modules.join_data import join_y
from modules import feature_eng
from modules import cluster_intelligence
from modules.evaluate_model import get_eval_scores
from modules.dbscan_grid_search import run_dbscan_gs, get_best_models, run_dbscan_gs_noeval, get_best_noevals2
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows',150)
pd.set_option('display.max_columns',1000)

## Reading in data and preprocessing

In [2]:
def load_category(json_path, sales_path, require_reviews=False):
    """Load one product category and run the shared preprocessing pipeline.

    Parameters
    ----------
    json_path : str
        Path of the product JSON file, read with ``pd.read_json``.
    sales_path : str
        Path of the sales CSV handed to ``join_y``.
    require_reviews : bool, default False
        When True, rows whose ``no_reviews`` is missing are dropped right
        after the join (only the tablets data needs this).

    Returns
    -------
    The frame returned by ``feature_eng.do_PCA2``.
    """
    frame = pd.read_json(json_path)
    frame = join_y(frame, sales_path)
    if require_reviews:
        frame = frame[frame['no_reviews'].notna()]
    frame = preproc.clean_cols(frame)
    frame = preproc.fill_empty_lists(frame)
    # Disabled steps, kept for reference:
    # frame = preproc.preprocess_reviews(frame)
    frame = feature_eng.generate_features2(frame)
    frame = feature_eng.do_PCA2(frame)
    # frame['TOTAL_SALES_QBINNED'] = pd.qcut(frame['TOTAL_SALES'], 3, labels=[0,1,2])
    return frame


# One call per category; every category goes through exactly the same steps.
laptops = load_category('full_data/laptops.json', 'raw_data/laptops_sales.csv')
phones = load_category('full_data/smartphones.json', 'raw_data/phone_sales.csv')
desktops = load_category('full_data/desktops.json', 'raw_data/desktops_sales.csv')
# Tablets are the only category filtered on a non-missing review count.
tablets = load_category('full_data/tablets.json', 'raw_data/tablet_sales.csv', require_reviews=True)

In [3]:
# Row counts per category, one per line: laptops, desktops, tablets, phones.
print(len(laptops), len(desktops), len(tablets), len(phones), sep='\n')

132
32
84
255


In [4]:
# print(laptops['y_true'].value_counts())
# print(phones['y_true'].value_counts())
# print(tablets['y_true'].value_counts())
# print(desktops['y_true'].value_counts())

In [5]:
# Ratios from hard-coded counts, in the order laptops, phones, tablets, desktops.
# Each pair sums to the dataset size printed above (124+8=132, 237+18=255, 73+11=84, 27+5=32);
# presumably non-best-sellers : best-sellers — confirm against y_true.value_counts().
print(124/8)
print(237/18)
print(73/11)
print(27/5)  # division, not print(27, 5), which only echoed the two counts

15.5
13.166666666666666
6.636363636363637
5.4


## Scaling

In [6]:
# Columns kept for scaling and clustering (the raw sales target is last).
col_list = [
    'price',
    'no_reviews',
    'recommendation_percent',
    'Rvol/price',
    'Rvol/%rec',
    'c1',
    'c2',
    'TOTAL_SALES',
]

In [7]:
# Keep only the modelling columns. .copy() gives an independent frame, so the
# cleaning below never writes into a slice of the loaded data.
laptops = laptops[col_list].copy()

# Zero out missing and infinite values of either sign before standardising;
# a single +/-inf would make the column mean and std non-finite.
laptops = laptops.replace([np.inf, -np.inf], np.nan).fillna(0)
# Column-wise z-score (mean 0, sample standard deviation 1).
laptops = (laptops - laptops.mean()) / laptops.std()





In [8]:
# Keep only the modelling columns. .copy() gives an independent frame, so the
# cleaning below never writes into a slice of the loaded data.
phones = phones[col_list].copy()

# Zero out missing and infinite values of either sign before standardising;
# a single +/-inf would make the column mean and std non-finite.
phones = phones.replace([np.inf, -np.inf], np.nan).fillna(0)
# Column-wise z-score (mean 0, sample standard deviation 1).
phones = (phones - phones.mean()) / phones.std()




In [9]:
# Keep only the modelling columns. .copy() gives an independent frame, so the
# cleaning below never writes into a slice of the loaded data.
tablets = tablets[col_list].copy()
# Zero out missing and infinite values of either sign before standardising;
# a single +/-inf would make the column mean and std non-finite.
tablets = tablets.replace([np.inf, -np.inf], np.nan).fillna(0)
# Column-wise z-score (mean 0, sample standard deviation 1).
tablets = (tablets - tablets.mean()) / tablets.std()



In [10]:
# Keep only the modelling columns. .copy() gives an independent frame, so the
# cleaning below never writes into a slice of the loaded data.
desktops = desktops[col_list].copy()

# Zero out missing and infinite values of either sign before standardising;
# a single +/-inf would make the column mean and std non-finite.
desktops = desktops.replace([np.inf, -np.inf], np.nan).fillna(0)
# Column-wise z-score (mean 0, sample standard deviation 1).
desktops = (desktops - desktops.mean()) / desktops.std()




In [11]:
# Pass every scaled frame through feature_eng.get_y_true, in the same order as before
# (presumably this adds the y_true label column used for scoring — confirm in the module).
laptops, phones, tablets, desktops = [
    feature_eng.get_y_true(category_frame)
    for category_frame in (laptops, phones, tablets, desktops)
]

In [12]:
# Tag each frame with its category label via a plain `.name` attribute.
for category_label, category_frame in (
    ('laptops', laptops),
    ('desktops', desktops),
    ('phones', phones),
    ('tablets', tablets),
):
    category_frame.name = category_label


In [13]:
# DBSCAN grid search on three features: eps from 0.20 to 1.95 in steps of 0.05,
# min-sample settings 2..7. The first 10 rows of get_best_models are displayed.
results = run_dbscan_gs(
    dataframes=[laptops, phones, desktops, tablets],
    features=['no_reviews', 'Rvol/%rec', 'price'],
    epsilon_range=[hundredths / 100.0 for hundredths in range(20, 200, 5)],
    min_sample_range=list(range(2, 8)),
    iterations=5000,
)
get_best_models(results).head(10)

Unnamed: 0,precision_mean,recall_mean,epsilon,m_samples_divisor,f1_mean
0,0.430556,0.730619,1.1,4,0.538114
0,0.430556,0.730619,1.15,3,0.538114
0,0.42619,0.730619,1.0,5,0.533749
0,0.42359,0.730619,1.05,4,0.531063
0,0.421305,0.730619,1.35,2,0.525452
0,0.416966,0.730619,1.0,4,0.523912
0,0.419192,0.668119,1.25,3,0.510602
0,0.419192,0.668119,1.2,3,0.510602
0,0.398148,0.730619,0.95,5,0.510016
0,0.412698,0.680619,1.1,5,0.507811


## Two-feature grid searches (no_reviews, price, Rvol/%rec; then c1, c2)

In [14]:
# DBSCAN grid search on (no_reviews, price): eps from 0.20 to 4.95 in steps of 0.05,
# min-sample settings 2..7. The first 10 rows of get_best_models are displayed.
results = run_dbscan_gs(
    dataframes=[laptops, phones, desktops, tablets],
    features=['no_reviews', 'price'],
    epsilon_range=[hundredths / 100.0 for hundredths in range(20, 500, 5)],
    min_sample_range=list(range(2, 8)),
    iterations=5000,
)
get_best_models(results).head(10)


Unnamed: 0,precision_mean,recall_mean,epsilon,m_samples_divisor,f1_mean
0,0.37506,0.730619,0.8,5,0.493872
0,0.395641,0.668119,0.85,5,0.492052
0,0.386785,0.668119,0.85,4,0.482504
0,0.363523,0.730619,0.75,5,0.481268
0,0.34249,0.730619,0.8,4,0.462446
0,0.378968,0.59173,1.0,3,0.454957
0,0.329808,0.730619,0.75,4,0.448989
0,0.377778,0.57298,0.9,4,0.447529
0,0.345878,0.62298,0.9,3,0.437036
0,0.352725,0.59173,0.95,3,0.433194


In [16]:
# DBSCAN grid search on (no_reviews, Rvol/%rec): eps from 0.10 to 4.95 in steps of 0.05,
# min-sample settings 2..7. The first 10 rows of get_best_models are displayed.
results = run_dbscan_gs(
    dataframes=[laptops, phones, desktops, tablets],
    features=['no_reviews', 'Rvol/%rec'],
    epsilon_range=[hundredths / 100.0 for hundredths in range(10, 500, 5)],
    min_sample_range=list(range(2, 8)),
    iterations=5000,
)
get_best_models(results).head(10)

Unnamed: 0,precision_mean,recall_mean,epsilon,m_samples_divisor,f1_mean
0,0.599567,0.680619,0.95,5,0.620455
0,0.587907,0.680619,0.85,7,0.615091
0,0.587907,0.680619,1.15,2,0.615091
0,0.58631,0.680619,0.95,4,0.613538
0,0.58631,0.680619,1.05,2,0.613538
0,0.58631,0.680619,1.1,2,0.613538
0,0.58631,0.680619,0.95,3,0.613538
0,0.580616,0.680619,0.9,4,0.607808
0,0.580616,0.680619,0.9,5,0.607808
0,0.580616,0.680619,0.9,6,0.607808


In [17]:
# Per-category rows for the top-ranked setting of the previous search (eps=0.95, divisor 5).
results.loc[results['eps'].eq(0.95) & results['m_samples_divisor'].eq(5)]

Unnamed: 0,iteration,category,precision,recall,f1,eps,m_samples_divisor,features,no_clusters
0,120,laptops,1.0,0.875,0.933333,0.95,5,"[no_reviews, Rvol/%rec]",2
0,120,phones,0.261905,0.611111,0.366667,0.95,5,"[no_reviews, Rvol/%rec]",2
0,120,desktops,0.5,0.6,0.545455,0.95,5,"[no_reviews, Rvol/%rec]",2
0,120,tablets,0.636364,0.636364,0.636364,0.95,5,"[no_reviews, Rvol/%rec]",2


In [18]:
# DBSCAN grid search on (Rvol/%rec, price): eps from 0.20 to 1.95 in steps of 0.05,
# min-sample settings 2..7. The first 10 rows of get_best_models are displayed.
results = run_dbscan_gs(
    dataframes=[laptops, phones, desktops, tablets],
    features=['Rvol/%rec', 'price'],
    epsilon_range=[hundredths / 100.0 for hundredths in range(20, 200, 5)],
    min_sample_range=list(range(2, 8)),
    iterations=5000,
)
get_best_models(results).head(10)

Unnamed: 0,precision_mean,recall_mean,epsilon,m_samples_divisor,f1_mean
0,0.424862,0.699369,0.85,5,0.523294
0,0.393832,0.699369,0.85,4,0.495964
0,0.440086,0.599937,0.9,4,0.494573
0,0.440086,0.599937,0.9,5,0.494573
0,0.376984,0.730619,0.9,3,0.490539
0,0.369199,0.730619,0.8,5,0.487644
0,0.406811,0.568687,1.0,3,0.46318
0,0.369636,0.636869,0.95,3,0.458015
0,0.33411,0.730619,0.8,4,0.454153
0,0.413953,0.518687,1.0,4,0.446513


In [19]:
# DBSCAN grid search on the c1/c2 columns: eps from 0.20 to 1.95 in steps of 0.05,
# min-sample settings 2..7. The first 10 rows of get_best_models are displayed.
results = run_dbscan_gs(
    dataframes=[laptops, phones, desktops, tablets],
    features=['c1', 'c2'],
    epsilon_range=[hundredths / 100.0 for hundredths in range(20, 200, 5)],
    min_sample_range=list(range(2, 8)),
    iterations=5000,
)
get_best_models(results).head(10)

Unnamed: 0,precision_mean,recall_mean,epsilon,m_samples_divisor,f1_mean
0,0.700758,0.413131,1.1,7,0.517619
0,0.700758,0.413131,1.15,7,0.517619
0,0.676282,0.413131,1.15,6,0.509833
0,0.75,0.335354,1.2,7,0.452054
0,0.678571,0.335354,1.2,6,0.443358
0,0.705357,0.317992,1.5,2,0.434288
0,0.75,0.304104,1.25,6,0.421751
0,0.443939,0.503409,1.1,4,0.421711
0,0.439493,0.503409,1.05,4,0.418967
0,0.439493,0.503409,1.15,3,0.418967


In [22]:
# Per-category rows for the top-ranked setting of the c1/c2 search (eps=1.10, divisor 7).
results.loc[results['eps'].eq(1.10) & results['m_samples_divisor'].eq(7)]

Unnamed: 0,iteration,category,precision,recall,f1,eps,m_samples_divisor,features,no_clusters
0,60,laptops,1.0,0.5,0.666667,1.1,7,"[c1, c2]",2
0,60,phones,0.636364,0.388889,0.482759,1.1,7,"[c1, c2]",2
0,60,desktops,0.666667,0.4,0.5,1.1,7,"[c1, c2]",2
0,60,tablets,0.5,0.363636,0.421053,1.1,7,"[c1, c2]",2


# Without using Y data

In [23]:
# Label-free search on three features: eps from 0.20 to 1.95 in steps of 0.05, min-sample settings 2..7.
# outlier_prop_target=15 is presumably the desired outlier share in percent — confirm in the module.
results = run_dbscan_gs_noeval(
    dataframes=[laptops, phones, desktops, tablets],
    features=['no_reviews', 'Rvol/%rec', 'price'],
    epsilon_range=[hundredths / 100.0 for hundredths in range(20, 200, 5)],
    min_sample_range=list(range(2, 8)),
    iterations=5000,
    outlier_prop_target=15,
)
get_best_noevals2(results)

Unnamed: 0,iteration,category,eps,m_samples_divisor,features,no_clusters,outlier_prop,precision,recall,f1
0,27,laptops,0.75,7,"[no_reviews, Rvol/%rec, price]",2,15,0.333333,0.875,0.482759
0,52,phones,1.15,7,"[no_reviews, Rvol/%rec, price]",2,16,0.261905,0.611111,0.366667
0,116,tablets,1.2,5,"[no_reviews, Rvol/%rec, price]",2,11,0.4,0.363636,0.380952
0,52,desktops,1.15,7,"[no_reviews, Rvol/%rec, price]",2,15,0.4,0.4,0.4


In [24]:
# Label-free search on (no_reviews, price): eps from 0.20 to 1.95 in steps of 0.05, min-sample settings 2..7.
# outlier_prop_target=15 is presumably the desired outlier share in percent — confirm in the module.
results = run_dbscan_gs_noeval(
    dataframes=[laptops, phones, desktops, tablets],
    features=['no_reviews', 'price'],
    epsilon_range=[hundredths / 100.0 for hundredths in range(20, 200, 5)],
    min_sample_range=list(range(2, 8)),
    iterations=5000,
    outlier_prop_target=15,
)
get_best_noevals2(results)

Unnamed: 0,iteration,category,eps,m_samples_divisor,features,no_clusters,outlier_prop,precision,recall,f1
0,23,laptops,0.75,7,"[no_reviews, price]",2,11,0.466667,0.875,0.608696
0,23,phones,0.75,7,"[no_reviews, price]",2,19,0.22449,0.611111,0.328358
0,53,tablets,0.7,7,"[no_reviews, price]",2,14,0.166667,0.181818,0.173913
0,23,desktops,0.75,7,"[no_reviews, price]",2,18,0.5,0.6,0.545455


In [25]:
# Label-free search on (no_reviews, Rvol/%rec): eps from 0.20 to 1.95 in steps of 0.05, min-sample settings 2..7.
# outlier_prop_target=15 is presumably the desired outlier share in percent — confirm in the module.
results = run_dbscan_gs_noeval(
    dataframes=[laptops, phones, desktops, tablets],
    features=['no_reviews', 'Rvol/%rec'],
    epsilon_range=[hundredths / 100.0 for hundredths in range(20, 200, 5)],
    min_sample_range=list(range(2, 8)),
    iterations=5000,
    outlier_prop_target=15,
)
get_best_noevals2(results)

Unnamed: 0,iteration,category,eps,m_samples_divisor,features,no_clusters,outlier_prop,precision,recall,f1
0,23,laptops,0.4,7,"[no_reviews, Rvol/%rec]",2,10,0.5,0.875,0.636364
0,2,phones,1.0,7,"[no_reviews, Rvol/%rec]",2,12,0.242424,0.444444,0.313725
0,64,tablets,0.8,7,"[no_reviews, Rvol/%rec]",2,14,0.583333,0.636364,0.608696
0,35,desktops,0.65,7,"[no_reviews, Rvol/%rec]",2,18,0.5,0.6,0.545455


In [26]:
# Repeat of the preceding (no_reviews, Rvol/%rec) label-free search with identical arguments.
# NOTE(review): the recorded output differs from the preceding run (e.g. the chosen eps values),
# which suggests the search is not deterministic — confirm, and seed it if so.
results = run_dbscan_gs_noeval(
    dataframes=[laptops, phones, desktops, tablets],
    features=['no_reviews', 'Rvol/%rec'],
    epsilon_range=[hundredths / 100.0 for hundredths in range(20, 200, 5)],
    min_sample_range=list(range(2, 8)),
    iterations=5000,
    outlier_prop_target=15,
)
get_best_noevals2(results)

Unnamed: 0,iteration,category,eps,m_samples_divisor,features,no_clusters,outlier_prop,precision,recall,f1
0,40,laptops,0.35,7,"[no_reviews, Rvol/%rec]",2,10,0.5,0.875,0.636364
0,50,phones,0.8,7,"[no_reviews, Rvol/%rec]",2,18,0.234043,0.611111,0.338462
0,50,tablets,0.8,7,"[no_reviews, Rvol/%rec]",2,14,0.583333,0.636364,0.608696
0,24,desktops,0.55,7,"[no_reviews, Rvol/%rec]",2,18,0.5,0.6,0.545455


In [27]:
# Label-free search on (price, Rvol/%rec): eps from 0.20 to 1.95 in steps of 0.05, min-sample settings 2..7.
# outlier_prop_target=15 is presumably the desired outlier share in percent — confirm in the module.
results = run_dbscan_gs_noeval(
    dataframes=[laptops, phones, desktops, tablets],
    features=['price', 'Rvol/%rec'],
    epsilon_range=[hundredths / 100.0 for hundredths in range(20, 200, 5)],
    min_sample_range=list(range(2, 8)),
    iterations=5000,
    outlier_prop_target=15,
)
get_best_noevals2(results)

Unnamed: 0,iteration,category,eps,m_samples_divisor,features,no_clusters,outlier_prop,precision,recall,f1
0,57,laptops,0.6,7,"[price, Rvol/%rec]",2,15,0.333333,0.875,0.482759
0,17,phones,0.9,7,"[price, Rvol/%rec]",2,16,0.268293,0.611111,0.372881
0,100,tablets,0.75,7,"[price, Rvol/%rec]",2,11,0.2,0.181818,0.190476
0,17,desktops,0.9,7,"[price, Rvol/%rec]",2,15,0.6,0.6,0.6


In [28]:
# Label-free search on the c1/c2 columns: eps from 0.20 to 1.95 in steps of 0.05, min-sample settings 2..7.
# NOTE(review): the recorded output below has no laptops row — confirm whether that is expected.
results = run_dbscan_gs_noeval(
    dataframes=[laptops, phones, desktops, tablets],
    features=['c1', 'c2'],
    epsilon_range=[hundredths / 100.0 for hundredths in range(20, 200, 5)],
    min_sample_range=list(range(2, 8)),
    iterations=5000,
    outlier_prop_target=15,
)
get_best_noevals2(results)

Unnamed: 0,iteration,category,eps,m_samples_divisor,features,no_clusters,outlier_prop,precision,recall,f1
0,274,phones,0.9,7,"[c1, c2]",2,12,0.28125,0.5,0.36
0,124,tablets,0.8,7,"[c1, c2]",2,13,0.363636,0.363636,0.363636
0,73,desktops,0.95,6,"[c1, c2]",2,12,0.5,0.4,0.444444


In [None]:
def evaluate_clusters(
    predicted_y,
    real_y,
):
    """Draw a shaded two-variable KDE of cluster labels against integer-cast sales.

    predicted_y : cluster labels, passed to ``sns.kdeplot`` as the first variable.
    real_y      : values cast with ``.astype(int)`` and passed as the second variable.

    Returns nothing; the plot is titled 'Cluster labels with total sales'.

    NOTE(review): positional data arguments and ``shade``/``shade_lowest`` are
    deprecated in recent seaborn releases — confirm against the installed version.
    """
    sales_as_int = real_y.astype(int)
    sns.kdeplot(
        predicted_y,
        sales_as_int,
        shade=True,
        shade_lowest=True,
        cbar=True,
    )
    plt.title('Cluster labels with total sales')

## laptops

In [None]:
# Single-feature DBSCAN on laptops (DBSCAN marks noise points with the label -1).
# min_samples must be an integer. Rounding len/6 up keeps the original rule, because a
# whole-number neighbour count >= len/6 is the same as a count >= ceil(len/6).
dbscan_model = DBSCAN(eps=0.6, min_samples=int(np.ceil(len(laptops) / 6)))
db_clust = dbscan_model.fit_predict(laptops[['no_reviews']])
laptops['db_clust'] = pd.Series(db_clust, index=laptops.index)
get_eval_scores(laptops['y_true'], laptops['db_clust'])
#cluster_intelligence.evaluate_clusters(laptops['db_clust'] ,laptops['TOTAL_SALES'])

#get_eval_scores(laptops['y_true'], laptops['km'])

In [None]:
# Two-cluster KMeans on (no_reviews, Rvol/%rec); the labels are stored in laptops['km'].
# NOTE(review): random_state=None leaves the run unseeded, and algorithm='full' is the
# older name for the classic algorithm in scikit-learn — confirm both against the installed version.
kmw2 = KMeans(n_clusters=2, init='random', n_init=50, max_iter=500,
              tol=1e-06, random_state=None, algorithm='full')
km = kmw2.fit_predict(laptops[['no_reviews', 'Rvol/%rec']])
laptops['km'] = km

In [None]:
from sklearn.cluster import AgglomerativeClustering
# Two-cluster agglomerative clustering on (no_reviews, Rvol/%rec); labels go to laptops['ap'].
AP = AgglomerativeClustering(n_clusters=2)
Apm = AP.fit_predict(laptops[['no_reviews', 'Rvol/%rec']])
laptops['ap'] = Apm

In [None]:
# Size of each agglomerative cluster (labels stored in laptops['ap'] by the previous cell).
laptops['ap'].value_counts()

In [None]:
def get_eval_scores(
    y_true,
    y_pred,
    print_results = False,
):
    """Return precision, recall and F1, treating the label -1 as the positive class.

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_pred : array-like
        Predicted labels (e.g. DBSCAN output, where -1 marks noise points).
    print_results : bool, default False
        When True, also print the three scores.

    Returns
    -------
    tuple
        ``(precision, recall, f1)``.
    """
    # Imported locally so this definition works on a fresh kernel; the notebook's
    # own sklearn.metrics import sits in a later cell.
    from sklearn.metrics import f1_score, precision_score, recall_score

    pscore = precision_score(y_true, y_pred, pos_label=-1)
    rscore = recall_score(y_true, y_pred, pos_label=-1)
    f1score = f1_score(y_true, y_pred, pos_label=-1)
    if print_results:
        print(f"The precision score is: {pscore} and the recall is {rscore} and the f1score is {f1score}")

    return pscore, rscore, f1score

In [None]:
# Two-feature DBSCAN on laptops with the top grid-search setting (eps=0.95, divisor 5).
# The feature column is 'Rvol/%rec' (as listed in col_list); '%rec/Rvol' does not exist.
# min_samples must be an integer; rounding len/5 up keeps the original core-point rule.
dbscan_model = DBSCAN(eps=0.95, min_samples=int(np.ceil(len(laptops) / 5)))
db_clust = dbscan_model.fit_predict(laptops[['no_reviews', 'Rvol/%rec']])
laptops['db_clust'] = pd.Series(db_clust, index=laptops.index)
get_eval_scores(laptops['y_true'], laptops['db_clust'])

In [None]:
# Report on the current db_clust labels using the 'no_reviews' column only.
# NOTE(review): db_clust here comes from the two-feature model fitted above, while only
# one feature is passed to the report — confirm this is intended.
cluster_intelligence.cluster_report(laptops[['no_reviews']],db_clust)

In [None]:
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, f1_score
def get_eval_scores(
    y_true,
    y_pred,
    print_results = False,
):
    """Score predictions against references with -1 as the positive label.

    Returns the tuple ``(precision, recall, f1)``; when ``print_results`` is
    true, the three values are also printed.
    """
    scorers = (precision_score, recall_score, f1_score)
    # Evaluated in order: precision, recall, F1.
    pscore, rscore, f1score = (
        scorer(y_true, y_pred, pos_label=-1) for scorer in scorers
    )
    if not print_results:
        return pscore, rscore, f1score

    print(f"The precision score is: {pscore} and the recall is {rscore} and the f1score is {f1score}")
    return pscore, rscore, f1score

In [None]:
# Mean TOTAL_SALES of the noise points (label -1) and of cluster 0.
# TOTAL_SALES was standardised in the scaling cell, so these are z-score means.
print(laptops.loc[laptops['db_clust'] == -1, 'TOTAL_SALES'].mean())
print(laptops.loc[laptops['db_clust'] == 0, 'TOTAL_SALES'].mean())

In [None]:
# Re-attach the labels of the most recently fitted DBSCAN model held in db_clust.
laptops['db_clust'] = pd.Series(db_clust, index=laptops.index)
# Three side-by-side scatter plots, coloured by cluster label.
# NOTE(review): 'pos_reviews' is not in col_list and laptops was reduced to col_list in the
# scaling cell, so these lookups presumably raise KeyError unless get_y_true adds it — confirm.
f, axes = plt.subplots(1,3, figsize=(15,15))
sns.scatterplot(x=laptops['no_reviews'], y=laptops['pos_reviews'], hue=laptops['db_clust'],ax=axes[0])
sns.scatterplot(x=laptops['no_reviews'], y=laptops['Rvol/%rec'],hue=laptops['db_clust'], ax=axes[1])
sns.scatterplot(x=laptops['pos_reviews'], y=laptops['Rvol/%rec'],hue=laptops['db_clust'], ax=axes[2])

## phones

In [None]:

# Single-feature DBSCAN on phones (DBSCAN marks noise points with the label -1).
# min_samples must be an integer; rounding len/6 up keeps the original core-point rule
# (a whole-number neighbour count >= len/6 is a count >= ceil(len/6)).
dbscan_model = DBSCAN(eps=0.6, min_samples=int(np.ceil(len(phones) / 6)))
db_clust = dbscan_model.fit_predict(phones[['no_reviews']])
phones['db_clust'] = pd.Series(db_clust, index=phones.index)
get_eval_scores(phones['y_true'], phones['db_clust'])
#evaluate_clusters(phones['db_clust'] ,phones['TOTAL_SALES'])

In [None]:
# Two-feature DBSCAN on phones with the top grid-search setting (eps=0.95, divisor 5).
# The feature column is 'Rvol/%rec' (as listed in col_list); '%rec/Rvol' does not exist.
# min_samples must be an integer; rounding len/5 up keeps the original core-point rule.
dbscan_model = DBSCAN(eps=0.95, min_samples=int(np.ceil(len(phones) / 5)))
db_clust = dbscan_model.fit_predict(phones[['no_reviews', 'Rvol/%rec']])
phones['db_clust'] = pd.Series(db_clust, index=phones.index)
get_eval_scores(phones['y_true'], phones['db_clust'])

In [None]:
# Mean TOTAL_SALES of the noise points (label -1) and of cluster 0.
# TOTAL_SALES was standardised in the scaling cell, so these are z-score means.
print(phones.loc[phones['db_clust'] == -1, 'TOTAL_SALES'].mean())
print(phones.loc[phones['db_clust'] == 0, 'TOTAL_SALES'].mean())

In [None]:
# Rows whose reference label y_true is -1 (the positive class used by get_eval_scores).
phones.loc[phones['y_true'].eq(-1)]

Row 10 is a Nokia brick phone with a fairly low number of reviews (who reviews these?). All of the others are Samsungs with no reviews (median filler).


In [None]:

# Single-feature DBSCAN on desktops (DBSCAN marks noise points with the label -1).
# min_samples must be an integer; rounding len/6 up keeps the original core-point rule
# (a whole-number neighbour count >= len/6 is a count >= ceil(len/6)).
dbscan_model = DBSCAN(eps=0.6, min_samples=int(np.ceil(len(desktops) / 6)))
db_clust = dbscan_model.fit_predict(desktops[['no_reviews']])
desktops['db_clust'] = pd.Series(db_clust, index=desktops.index)
get_eval_scores(desktops['y_true'], desktops['db_clust'])

In [None]:
# Two-feature DBSCAN on desktops with the top grid-search setting (eps=0.95, divisor 5).
# min_samples must be an integer; rounding len/5 up keeps the original core-point rule
# (a whole-number neighbour count >= len/5 is a count >= ceil(len/5)).
dbscan_model = DBSCAN(eps=0.95, min_samples=int(np.ceil(len(desktops) / 5)))
db_clust = dbscan_model.fit_predict(desktops[['no_reviews', 'Rvol/%rec']])
desktops['db_clust'] = pd.Series(db_clust, index=desktops.index)
get_eval_scores(desktops['y_true'], desktops['db_clust'])

In [None]:
# Re-attach the labels of the most recently fitted DBSCAN model held in db_clust.
desktops['db_clust'] = pd.Series(db_clust, index=desktops.index)
# Three side-by-side scatter plots, coloured by cluster label.
# NOTE(review): 'pos_reviews' is not in col_list and desktops was reduced to col_list in the
# scaling cell, so these lookups presumably raise KeyError unless get_y_true adds it — confirm.
f, axes = plt.subplots(1,3, figsize=(15,15))
sns.scatterplot(x=desktops['no_reviews'], y=desktops['pos_reviews'], hue=desktops['db_clust'],ax=axes[0])
sns.scatterplot(x=desktops['no_reviews'], y=desktops['Rvol/%rec'],hue=desktops['db_clust'], ax=axes[1])
sns.scatterplot(x=desktops['pos_reviews'], y=desktops['Rvol/%rec'],hue=desktops['db_clust'], ax=axes[2])

## tablets

In [None]:

# Single-feature DBSCAN on tablets (DBSCAN marks noise points with the label -1).
# min_samples must be an integer; rounding len/5 up keeps the original core-point rule
# (a whole-number neighbour count >= len/5 is a count >= ceil(len/5)).
dbscan_model = DBSCAN(eps=1.2, min_samples=int(np.ceil(len(tablets) / 5)))
db_clust = dbscan_model.fit_predict(tablets[['no_reviews']])
tablets['db_clust'] = pd.Series(db_clust, index=tablets.index)
get_eval_scores(tablets['y_true'], tablets['db_clust'])

In [None]:
# Two-feature DBSCAN on tablets with the top grid-search setting (eps=0.95, divisor 5).
# min_samples must be an integer; rounding len/5 up keeps the original core-point rule
# (a whole-number neighbour count >= len/5 is a count >= ceil(len/5)).
dbscan_model = DBSCAN(eps=0.95, min_samples=int(np.ceil(len(tablets) / 5)))
db_clust = dbscan_model.fit_predict(tablets[['no_reviews', 'Rvol/%rec']])
tablets['db_clust'] = pd.Series(db_clust, index=tablets.index)
get_eval_scores(tablets['y_true'], tablets['db_clust'])

In [None]:
# Mean TOTAL_SALES of the noise points (label -1) and of cluster 0.
# TOTAL_SALES was standardised in the scaling cell, so these are z-score means.
print(tablets.loc[tablets['db_clust'] == -1, 'TOTAL_SALES'].mean())
print(tablets.loc[tablets['db_clust'] == 0, 'TOTAL_SALES'].mean())

In [None]:
# Re-attach the labels of the most recently fitted DBSCAN model held in db_clust.
tablets['db_clust'] = pd.Series(db_clust, index=tablets.index)
# Three side-by-side scatter plots, coloured by cluster label.
# NOTE(review): 'pos_reviews' is not in col_list and tablets was reduced to col_list in the
# scaling cell, so these lookups presumably raise KeyError unless get_y_true adds it — confirm.
f, axes = plt.subplots(1,3, figsize=(15,15))
sns.scatterplot(x=tablets['no_reviews'], y=tablets['pos_reviews'], hue=tablets['db_clust'],ax=axes[0])
sns.scatterplot(x=tablets['no_reviews'], y=tablets['Rvol/%rec'],hue=tablets['db_clust'], ax=axes[1])
sns.scatterplot(x=tablets['pos_reviews'], y=tablets['Rvol/%rec'],hue=tablets['db_clust'], ax=axes[2])