In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.cluster import DBSCAN
from numpy import unique, where
import matplotlib.pyplot as plt

import os
# The `modules` package imported below and the relative data paths used by later cells
# are resolved against the current working directory, which this cell moves two levels
# up (presumably to the project root — confirm).
# Only change directory while `modules` is not yet reachable: an unconditional
# os.chdir('../../') climbs two more levels every time the cell is re-run.
if not os.path.isdir('modules'):
    os.chdir('../../')

from modules import preproc
from modules.join_data import join_y
from modules import feature_eng
from modules import cluster_intelligence
from modules.evaluate_model import get_eval_scores
from modules.dbscan_grid_search import run_dbscan_gs
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn warnings that may be informative.
warnings.filterwarnings("ignore")
# Raise the pandas display limits to 150 rows and 1000 columns.
pd.set_option('display.max_rows',150)
pd.set_option('display.max_columns',1000)

ModuleNotFoundError: No module named 'evaluate_model'

In [None]:
# Sanity check: show which directory the relative paths below will be resolved against.
working_dir = os.getcwd()
print(working_dir)

## Reading in data and preprocessing

In [None]:
def load_category(products_path, sales_path, drop_missing_reviews=False):
    """Load one product category and run it through the shared preparation pipeline.

    The steps are applied in this order: read the product JSON, join_y with the
    sales CSV, optionally drop rows whose `no_reviews` is missing, then
    preproc.clean_cols -> preproc.fill_empty_lists -> preproc.preprocess_reviews ->
    feature_eng.generate_features -> feature_eng.do_PCA, and finally add
    `TOTAL_SALES_QBINNED`.

    Parameters
    ----------
    products_path : str
        Path to the category's product JSON file.
    sales_path : str
        Path to the category's sales CSV file, passed to `join_y`.
    drop_missing_reviews : bool, default False
        When True, rows with a missing `no_reviews` value are removed right
        after the join (before any cleaning step).

    Returns
    -------
    pandas.DataFrame
        The prepared frame, including `TOTAL_SALES_QBINNED`: `TOTAL_SALES`
        cut into three quantile bins labelled 0, 1 and 2.
    """
    frame = pd.read_json(products_path)
    frame = join_y(frame, sales_path)
    if drop_missing_reviews:
        frame = frame[frame['no_reviews'].notna()]
    frame = preproc.clean_cols(frame)
    frame = preproc.fill_empty_lists(frame)
    frame = preproc.preprocess_reviews(frame)
    frame = feature_eng.generate_features(frame)
    frame = feature_eng.do_PCA(frame)
    frame['TOTAL_SALES_QBINNED'] = pd.qcut(frame['TOTAL_SALES'], 3, labels=[0,1,2])
    return frame


# laptops
laptops = load_category('full_data/laptops.json', 'raw_data/laptops_sales.csv')

# phones
phones = load_category('full_data/smartphones.json', 'raw_data/phone_sales.csv')

# desktops
desktops = load_category('full_data/desktops.json', 'raw_data/desktops_sales.csv')

# tablets (the only category filtered on `no_reviews` before cleaning)
tablets = load_category(
    'full_data/tablets.json',
    'raw_data/tablet_sales.csv',
    drop_missing_reviews=True,
)

## Scaling

In [None]:
# Keep only the float/int columns listed below, as an explicit copy so the column
# assignment that follows acts on an independent frame.
# NOTE(review): columns of any other dtype are dropped here. pd.qcut(..., labels=[...])
# yields a categorical column, so TOTAL_SALES_QBINNED is presumably lost at this point
# even though later cells read it — confirm whether get_y_true restores it.
laptops = laptops.select_dtypes(include=['float', 'int64','int32','float64']).copy()

# Replace infinite ratios with the column median. Both signs are handled: a leftover
# -inf would make the column mean infinite and the whole scaled column NaN.
laptops['price/Rvol'] = laptops['price/Rvol'].replace(
    [np.inf, -np.inf], laptops['price/Rvol'].median()
)

# Median-impute the remaining missing values, one median per column.
laptops = laptops.fillna(laptops.median())

# Z-score every column: subtract the column mean, divide by the column standard deviation.
laptops = (laptops-laptops.mean())/laptops.std()

laptops = feature_eng.get_y_true(laptops)



In [None]:
# Keep only the float/int columns listed below, as an explicit copy so the column
# assignment that follows acts on an independent frame.
# NOTE(review): columns of any other dtype are dropped here. pd.qcut(..., labels=[...])
# yields a categorical column, so TOTAL_SALES_QBINNED is presumably lost at this point
# even though later cells read it — confirm whether get_y_true restores it.
phones = phones.select_dtypes(include=['float', 'int64','int32','float64']).copy()

# Replace infinite ratios with the column median. Both signs are handled: a leftover
# -inf would make the column mean infinite and the whole scaled column NaN.
phones['price/Rvol'] = phones['price/Rvol'].replace(
    [np.inf, -np.inf], phones['price/Rvol'].median()
)

# Median-impute the remaining missing values, one median per column.
phones = phones.fillna(phones.median())

# Z-score every column: subtract the column mean, divide by the column standard deviation.
phones = (phones-phones.mean())/phones.std()

phones = feature_eng.get_y_true(phones)


In [None]:
# Keep only the float/int columns listed below, as an explicit copy so the column
# assignment that follows acts on an independent frame.
# NOTE(review): columns of any other dtype are dropped here. pd.qcut(..., labels=[...])
# yields a categorical column, so TOTAL_SALES_QBINNED is presumably lost at this point
# even though later cells read it — confirm whether get_y_true restores it.
tablets = tablets.select_dtypes(include=['float', 'int64','int32','float64']).copy()

# Replace infinite ratios with the column median. Both signs are handled: a leftover
# -inf would make the column mean infinite and the whole scaled column NaN.
tablets['price/Rvol'] = tablets['price/Rvol'].replace(
    [np.inf, -np.inf], tablets['price/Rvol'].median()
)

# Median-impute the remaining missing values, one median per column.
tablets = tablets.fillna(tablets.median())

# Z-score every column: subtract the column mean, divide by the column standard deviation.
tablets = (tablets-tablets.mean())/tablets.std()

tablets = feature_eng.get_y_true(tablets)


In [None]:
# Keep only the float/int columns listed below, as an explicit copy so the column
# assignment that follows acts on an independent frame.
# NOTE(review): columns of any other dtype are dropped here. pd.qcut(..., labels=[...])
# yields a categorical column, so TOTAL_SALES_QBINNED is presumably lost at this point
# even though later cells read it — confirm whether get_y_true restores it.
desktops = desktops.select_dtypes(include=['float', 'int64','int32','float64']).copy()

# Replace infinite ratios with the column median. Both signs are handled: a leftover
# -inf would make the column mean infinite and the whole scaled column NaN.
desktops['price/Rvol'] = desktops['price/Rvol'].replace(
    [np.inf, -np.inf], desktops['price/Rvol'].median()
)

# Median-impute the remaining missing values, one median per column.
desktops = desktops.fillna(desktops.median())

# Z-score every column: subtract the column mean, divide by the column standard deviation.
desktops = (desktops-desktops.mean())/desktops.std()

desktops = feature_eng.get_y_true(desktops)


## TOP 3 features

In [None]:
# Tag every category frame with a `name` attribute holding its label.
# NOTE(review): presumably run_dbscan_gs reads `.name` to label its output — confirm.
for category_frame, category_label in (
    (laptops, 'laptops'),
    (desktops, 'desktops'),
    (phones, 'phones'),
    (tablets, 'tablets'),
):
    category_frame.name = category_label

In [None]:
# Inputs for the DBSCAN grid search: the four category frames, the three feature
# columns, eps values 0.50 .. 0.95 in steps of 0.05 and min_samples values 2 .. 5.
gs_frames = [laptops, phones, desktops, tablets]
gs_features = ['no_reviews', 'Rvol/%rec', 'pos_reviews']
gs_epsilons = [hundredths / 100.0 for hundredths in range(50, 100, 5)]
gs_min_samples = list(range(2, 6, 1))

results = run_dbscan_gs(
    dataframes=gs_frames,
    features=gs_features,
    epsilon_range=gs_epsilons,
    min_sample_range=gs_min_samples,
    iterations=10,
)

In [None]:
# Display the object returned by run_dbscan_gs in the grid-search cell.
results

In [None]:
# Count how many phone rows carry each distinct `y_true` value.
# NOTE(review): `y_true` is presumably the column added by feature_eng.get_y_true — confirm.
phones['y_true'].value_counts()

## laptops

In [None]:
# NOTE(review): `lX` and `evaluate_clusters` are not defined in any visible cell; this
# cell relies on them already existing in the kernel — confirm where they come from.
lX['Rvol/%rec'] = lX['Rvol/%rec'].fillna(lX['Rvol/%rec'].median())

# DBSCAN documents `min_samples` as an integer, whereas len(lX)/4 is a float.
# Rounding up keeps the same core-point threshold: an integer neighbour count is
# >= n/4 exactly when it is >= ceil(n/4). The floor of 1 keeps the value valid.
min_cluster_points = max(1, int(np.ceil(len(lX) / 4)))
dbscan_model = DBSCAN(eps=0.8, min_samples=min_cluster_points)
db_clust = dbscan_model.fit_predict(lX[['no_reviews','Rvol/%rec','pos_reviews']])

# Store the labels on the laptops frame, aligned on its index.
laptops['db_clust'] = pd.Series(db_clust, index=laptops.index)
evaluate_clusters(laptops['db_clust'] ,laptops['TOTAL_SALES'],  laptops['TOTAL_SALES_QBINNED'])

In [None]:
# Pass the three clustering features and the DBSCAN labels to the project's
# cluster_report helper. NOTE(review): presumably a per-cluster summary — confirm.
cluster_intelligence.cluster_report(lX[['no_reviews','Rvol/%rec','pos_reviews']],db_clust)

In [None]:
# Compare the DBSCAN labels with the `y_true` column via the project's get_eval_scores helper.
get_eval_scores(laptops['y_true'], laptops['db_clust'])

In [None]:
# Mean TOTAL_SALES of the rows labelled -1 (DBSCAN's noise label), then of the rows labelled 0.
for cluster_label in (-1, 0):
    in_cluster = laptops['db_clust'] == cluster_label
    print(laptops.loc[in_cluster, 'TOTAL_SALES'].mean())

In [None]:
# Attach the DBSCAN labels to the feature frame so they can colour the points.
lX['db_clust'] = pd.Series(db_clust, index=lX.index)
cluster_hue = lX['db_clust']

# One panel per pair of the three clustering features.
f, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 15))
sns.scatterplot(x=lX['no_reviews'], y=lX['pos_reviews'], hue=cluster_hue, ax=axes[0])
sns.scatterplot(x=lX['no_reviews'], y=lX['Rvol/%rec'], hue=cluster_hue, ax=axes[1])
sns.scatterplot(x=lX['pos_reviews'], y=lX['Rvol/%rec'], hue=cluster_hue, ax=axes[2])

In [None]:
from sklearn.neighbors import NearestNeighbors

# Fit a 2-nearest-neighbour model on the three clustering features and query it with
# the same points (so the closest match is typically the point itself).
knn_features = lX[['no_reviews','Rvol/%rec','pos_reviews']]
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(knn_features)
distances, indices = nbrs.kneighbors(knn_features)

# Plotting K-distance Graph
# Sort each distance column in ascending order and keep the second column for the curve.
distances = np.sort(distances, axis=0)[:, 1]

plt.figure(figsize=(20, 10))
plt.plot(distances)
plt.title('K-distance Graph', fontsize=20)
plt.xlabel('Data Points sorted by distance', fontsize=14)
plt.ylabel('Epsilon', fontsize=14)
plt.show()

## phones

In [None]:
# NOTE(review): `pX` and `evaluate_clusters` are not defined in any visible cell; this
# cell relies on them already existing in the kernel — confirm where they come from.
pX['Rvol/%rec'] = pX['Rvol/%rec'].fillna(pX['Rvol/%rec'].median())

# DBSCAN documents `min_samples` as an integer, whereas len(pX)/4 is a float.
# Rounding up keeps the same core-point threshold: an integer neighbour count is
# >= n/4 exactly when it is >= ceil(n/4). The floor of 1 keeps the value valid.
min_cluster_points = max(1, int(np.ceil(len(pX) / 4)))
dbscan_model = DBSCAN(eps=1, min_samples=min_cluster_points)
db_clust = dbscan_model.fit_predict(pX[['no_reviews','Rvol/%rec','pos_reviews']])

# Store the labels on the phones frame, aligned on its index.
phones['db_clust'] = pd.Series(db_clust, index=phones.index)
evaluate_clusters(phones['db_clust'] ,phones['TOTAL_SALES'],  phones['TOTAL_SALES_QBINNED'])

In [None]:
# Pass the three clustering features and the DBSCAN labels to the project's
# cluster_report helper. NOTE(review): presumably a per-cluster summary — confirm.
cluster_intelligence.cluster_report(pX[['no_reviews','Rvol/%rec','pos_reviews']],db_clust)

In [None]:
# Compare the DBSCAN labels with the `y_true` column via the project's get_eval_scores helper.
get_eval_scores(phones['y_true'], phones['db_clust'])

In [None]:
# Mean TOTAL_SALES of the rows labelled -1 (DBSCAN's noise label), then of the rows labelled 0.
for cluster_label in (-1, 0):
    in_cluster = phones['db_clust'] == cluster_label
    print(phones.loc[in_cluster, 'TOTAL_SALES'].mean())

In [None]:
# Attach the DBSCAN labels to the feature frame so they can colour the points.
pX['db_clust'] = pd.Series(db_clust, index=pX.index)
cluster_hue = pX['db_clust']

# One panel per pair of the three clustering features.
f, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 15))
sns.scatterplot(x=pX['no_reviews'], y=pX['pos_reviews'], hue=cluster_hue, ax=axes[0])
sns.scatterplot(x=pX['no_reviews'], y=pX['Rvol/%rec'], hue=cluster_hue, ax=axes[1])
sns.scatterplot(x=pX['pos_reviews'], y=pX['Rvol/%rec'], hue=cluster_hue, ax=axes[2])

## desktops

In [None]:
# NOTE(review): `dX` and `evaluate_clusters` are not defined in any visible cell; this
# cell relies on them already existing in the kernel — confirm where they come from.
dX['Rvol/%rec'] = dX['Rvol/%rec'].fillna(dX['Rvol/%rec'].median())

# DBSCAN documents `min_samples` as an integer, whereas len(dX)/4 is a float.
# Rounding up keeps the same core-point threshold: an integer neighbour count is
# >= n/4 exactly when it is >= ceil(n/4). The floor of 1 keeps the value valid.
min_cluster_points = max(1, int(np.ceil(len(dX) / 4)))
dbscan_model = DBSCAN(eps=1, min_samples=min_cluster_points)
db_clust = dbscan_model.fit_predict(dX[['no_reviews','Rvol/%rec','pos_reviews']])

# Store the labels on the desktops frame, aligned on its index.
desktops['db_clust'] = pd.Series(db_clust, index=desktops.index)
evaluate_clusters(desktops['db_clust'] ,desktops['TOTAL_SALES'],  desktops['TOTAL_SALES_QBINNED'])

In [None]:
# Pass the three clustering features and the DBSCAN labels to the project's
# cluster_report helper. NOTE(review): presumably a per-cluster summary — confirm.
cluster_intelligence.cluster_report(dX[['no_reviews','Rvol/%rec','pos_reviews']],db_clust)

In [None]:
# Compare the DBSCAN labels with the `y_true` column via the project's get_eval_scores helper.
get_eval_scores(desktops['y_true'], desktops['db_clust'])

In [None]:
# Mean TOTAL_SALES of the rows labelled -1 (DBSCAN's noise label), then of the rows labelled 0.
for cluster_label in (-1, 0):
    in_cluster = desktops['db_clust'] == cluster_label
    print(desktops.loc[in_cluster, 'TOTAL_SALES'].mean())

In [None]:
# Attach the DBSCAN labels to the feature frame so they can colour the points.
dX['db_clust'] = pd.Series(db_clust, index=dX.index)
cluster_hue = dX['db_clust']

# One panel per pair of the three clustering features.
f, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 15))
sns.scatterplot(x=dX['no_reviews'], y=dX['pos_reviews'], hue=cluster_hue, ax=axes[0])
sns.scatterplot(x=dX['no_reviews'], y=dX['Rvol/%rec'], hue=cluster_hue, ax=axes[1])
sns.scatterplot(x=dX['pos_reviews'], y=dX['Rvol/%rec'], hue=cluster_hue, ax=axes[2])

## tablets

In [None]:
# NOTE(review): `tX` and `evaluate_clusters` are not defined in any visible cell; this
# cell relies on them already existing in the kernel — confirm where they come from.
tX['Rvol/%rec'] = tX['Rvol/%rec'].fillna(tX['Rvol/%rec'].median())

# DBSCAN documents `min_samples` as an integer, whereas len(tX)/4 is a float.
# Rounding up keeps the same core-point threshold: an integer neighbour count is
# >= n/4 exactly when it is >= ceil(n/4). The floor of 1 keeps the value valid.
min_cluster_points = max(1, int(np.ceil(len(tX) / 4)))
dbscan_model = DBSCAN(eps=1, min_samples=min_cluster_points)
db_clust = dbscan_model.fit_predict(tX[['no_reviews','Rvol/%rec','pos_reviews']])

# Store the labels on the tablets frame, aligned on its index.
tablets['db_clust'] = pd.Series(db_clust, index=tablets.index)
evaluate_clusters(tablets['db_clust'] ,tablets['TOTAL_SALES'],  tablets['TOTAL_SALES_QBINNED'])

In [None]:
# Pass the three clustering features and the DBSCAN labels to the project's
# cluster_report helper. NOTE(review): presumably a per-cluster summary — confirm.
cluster_intelligence.cluster_report(tX[['no_reviews','Rvol/%rec','pos_reviews']],db_clust)

In [None]:
# Compare the DBSCAN labels with the `y_true` column via the project's get_eval_scores helper.
get_eval_scores(tablets['y_true'], tablets['db_clust'])

In [None]:
# Mean TOTAL_SALES of the rows labelled -1 (DBSCAN's noise label), then of the rows labelled 0.
for cluster_label in (-1, 0):
    in_cluster = tablets['db_clust'] == cluster_label
    print(tablets.loc[in_cluster, 'TOTAL_SALES'].mean())

In [None]:
# Attach the DBSCAN labels to the feature frame so they can colour the points.
tX['db_clust'] = pd.Series(db_clust, index=tX.index)
cluster_hue = tX['db_clust']

# One panel per pair of the three clustering features.
f, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 15))
sns.scatterplot(x=tX['no_reviews'], y=tX['pos_reviews'], hue=cluster_hue, ax=axes[0])
sns.scatterplot(x=tX['no_reviews'], y=tX['Rvol/%rec'], hue=cluster_hue, ax=axes[1])
sns.scatterplot(x=tX['pos_reviews'], y=tX['Rvol/%rec'], hue=cluster_hue, ax=axes[2])