# Improving Designs Using Data Science

## Import libraries

In [None]:
%matplotlib inline

import re
import colorsys

# data and numbers
import numpy as np
import pandas as pd

# plotting
import seaborn as sns;
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import figure

# machine learning
from sklearn.cluster import KMeans
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

## Load the data

In [None]:
# Read the CSV file into a data frame, then show its first five rows.
df = pd.read_csv('fortune_500_h1.csv')
first_rows = df.head(n=5)
first_rows

## 1: Prepare the data

##### Handle undefined or faulty data

In [None]:
# Treat the literal string 'undefined' as a missing value, then drop every
# row that has a missing value in any column.
df.replace('undefined', np.nan, inplace=True)
df.dropna(inplace=True)


# Keep only the first row for each 'name'.
df.drop_duplicates(subset=['name'], keep='first', inplace=True)


# Strip the 'px' unit from the font size and truncate to an integer,
# e.g. '17.5px' -> 17. Plain ints pass through untouched; everything else goes
# through str() first so numeric values that are not `int` instances (NumPy
# scalars, floats) are converted instead of failing on a missing .replace().
df['font-size'] = df['font-size'].apply(
    lambda v: v if isinstance(v, int) else int(float(str(v).replace('px', '')))
)

# Drop rows whose font size lies outside 9..100 px: valid headers are expected
# to be larger than 9px, and sizes above 100px are discarded as well.
out_of_range = (df['font-size'] < 9) | (df['font-size'] > 100)
df.drop(df[out_of_range].index, inplace=True)

##### Ensure data types are ones we can work with

In [None]:
# Make sure the font weight is stored as an integer.
df['font-weight'] = df['font-weight'].apply(int)


# Turn 'sector' into a categorical column and store its integer category
# codes in 'sector-codes'.
df["sector"] = df["sector"].astype('category')
df["sector-codes"] = df['sector'].cat.codes


def split(val):
    """Split a color string into components: "rgb(255,155,0)" -> ['255', '155', '0']."""
    return re.sub(r'(rgb|\(|\))', '', val).split(",")


def parse(val):
    """Convert one component to int, or NaN if it is not all digits.

    Surrounding whitespace is ignored, so ' 155' parses like '155'.
    '255' -> 255, 'bad' -> nan
    """
    val = val.strip()
    return int(val) if val.isdigit() else np.nan


def clean(arr):
    """Return NaN if any component is NaN, otherwise the list unchanged.

    [255, 155, nan] -> nan, [255, 155, 0] -> [255, 155, 0]
    """
    return np.nan if np.isnan(np.array(arr)).any() else arr


# Remove rows whose color is rgba instead of rgb. Drop in place: a bare
# `df[mask]` expression would only build a filtered copy and discard it.
df.drop(df[df['color'].str.contains("rgba")].index, inplace=True)

# Convert the 'rgb(r,g,b)' string to an [r, g, b] list (NaN if any part is invalid).
df["rgb"] = df["color"].apply(lambda val: clean([parse(part) for part in split(val)]))

# Remove rows whose color could not be parsed.
df.dropna(subset=['rgb'], inplace=True)

# Convert rgb to hsv so we have another representation to play around with.
# colorsys works on floats in 0..1, hence the division by 255.
df["hsv"] = df["rgb"].apply(
    lambda rgb: list(colorsys.rgb_to_hsv(rgb[0] / 255., rgb[1] / 255., rgb[2] / 255.))
)

# Spread the hsv/rgb lists over one column per channel.
df[['h', 's', 'v']] = df['hsv'].apply(pd.Series)
df[['r', 'g', 'b']] = df['rgb'].apply(pd.Series)

# Save a hex representation of the color while we're at it, e.g. [255, 155, 0] -> '#ff9b00'.
df["hex"] = df["rgb"].apply(lambda rgb: "#{:02x}{:02x}{:02x}".format(rgb[0], rgb[1], rgb[2]))

##### Remove columns we're not interested in 

In [None]:
# These three columns are not needed for the analysis that follows.
df.drop(columns=['backgroundColor', 'name', 'color'], inplace=True)

In [None]:
# Show the first five rows of the cleaned-up data frame.
# Optional: sort by sector and export the cleaned data by uncommenting
#df = df.sort_values(by='sector')
#df.to_csv('fortune_500_h1_cleaned.csv', index=False)
cleaned_preview = df.head(n=5)
cleaned_preview

## 2: Looking for answers in data

Now that the data has been prepared, we can dive right in.
Even without writing much code or doing any math, we may already get some insights.

### 2.1: Print - Probing for interesting values

In [None]:
# Smallest, largest and mean (rounded to one decimal) font size over all pages.
font_sizes = df['font-size']
print('Smallest Font Size:', font_sizes.min())
print('Largest Font Size:', font_sizes.max())
print('Mean Font Size:', round(font_sizes.mean(), 1))

In [None]:
# describe() reports min, max and further summary statistics in a single call.
font_size_summary = df['font-size'].describe()
font_size_summary

In [None]:
# Number of rows per sector.
sector_counts = df['sector'].value_counts()
sector_counts

### 2.2: Plot - Probing for patterns and insights with plots
Let's probe some more by visualizing some values and see if there are any patterns or revelations

##### Pairplot "font-size", "font-weight" and "sector-codes"

In [None]:
sns.set()
# Plot "font-size", "font-weight" and "sector-codes" against each other,
# with a kernel density estimate on the diagonal.
pp = sns.pairplot(df[["font-size", "font-weight", "sector-codes"]], diag_kind="kde", height=5)

# Turn tick labels on for every subplot in the grid. `labelleft` is the
# y-axis keyword; its x-axis counterpart is `labelbottom`.
for ax in pp.axes.flat:
    ax.yaxis.set_tick_params(labelleft=True)
    ax.xaxis.set_tick_params(labelbottom=True)

##### Plot the font sizes used in the different sectors

In [None]:
# show distribution of font sizes for each sector as boxplot >> Food & Drug Store vs Telecommunications
outlier_style = dict(markerfacecolor='0.75', markersize=5, linestyle='none')
sns.set(style="whitegrid", rc={'figure.figsize':(20,17)})
ax = sns.boxplot(x="font-size", y="sector", data=df, orient="h", flierprops=outlier_style)
ax = sns.stripplot(x="font-size", y="sector", data=df, orient="h", size=3, jitter=True, color="#555555")
ax.tick_params(labelsize=20)
ax.set(xlabel='', ylabel='');

##### Plot the colors used in the different sectors

In [None]:
# Draw every H1 color of every sector on one polar chart: hue is the angle,
# saturation the radius, value the marker size. >> No love for pink and purple?
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 8), subplot_kw={'projection': 'polar'})

hue_angle = df['h'] * 2 * np.pi      # hue 0..1 mapped onto a full turn in radians
marker_area = 20 + df['v'] * 150     # brighter colors get bigger markers
ax.scatter(x=hue_angle, y=df['s'], c=df["hex"], s=marker_area, alpha=0.75, edgecolors='black')

# Hide the tick labels and leave a little room beyond saturation 1.
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_ylim([0, 1.1]);

##### Plot the colors used in every individual sector

In [None]:
# Plot the H1 colors again, but with one polar chart per sector.
sectors = df['sector'].unique()

# Size the grid from the number of sectors (3 columns) instead of assuming
# exactly 7 x 3 = 21 of them; squeeze=False keeps `axs` two-dimensional even
# for a single row so it can always be flattened.
n_cols = 3
n_rows = max(1, -(-len(sectors) // n_cols))  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 20), subplot_kw=dict(polar=True), squeeze=False)
axs = axs.ravel()

for sector_ax, sector in zip(axs, sectors):
    sector_rows = df[df['sector'] == sector]
    # hue -> angle, saturation -> radius, value -> marker size
    sector_ax.scatter(x=sector_rows['h'] * 2 * np.pi, y=sector_rows['s'], c=sector_rows["hex"],
                      s=10 + sector_rows['v'] * 100, alpha=0.75, edgecolors='black')
    sector_ax.set_title(sector)
    sector_ax.set_xticklabels([])
    sector_ax.set_yticklabels([])

# Hide grid cells that have no sector to show.
for unused_ax in axs[len(sectors):]:
    unused_ax.set_visible(False)

### 2.3: Machine learning - Probing for patterns, insights, rules with statistics

##### Clustering (K-Means and other algorithms)

In [None]:
# Pick the rows that are in the sector "Apparel" or "Engineering & Construction".
X = df[df['sector'].isin(["Apparel", "Engineering & Construction"])]

# Pick the properties we're interested in.
X = X[["r", "g", "b", "sector-codes"]]

# Normalize the dataset for easier parameter selection.
# From here on X is the array returned by fit_transform, not a data frame.
X = StandardScaler().fit_transform(X)

# Connectivity matrix for the structured Ward clustering algorithm.
connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
# Make the connectivity symmetric.
connectivity = 0.5 * (connectivity + connectivity.T)

num_clusters = 2

# Use several clustering algorithms, as we don't know yet which one is best for our data.
algorithms = (
    ('MiniBatchKMeans',
     cluster.MiniBatchKMeans(n_clusters=num_clusters)),

    ('MeanShift',
     cluster.MeanShift(bandwidth=cluster.estimate_bandwidth(X, quantile=.3), bin_seeding=True)),

    ('SpectralClustering',
     cluster.SpectralClustering(n_clusters=num_clusters, eigen_solver='arpack', affinity="nearest_neighbors")),

    ('AgglomerativeClustering',
     cluster.AgglomerativeClustering(n_clusters=num_clusters, linkage='ward', connectivity=connectivity)),

    ('Birch',
     cluster.Birch(n_clusters=num_clusters)),

    ('GaussianMixture',
     mixture.GaussianMixture(n_components=num_clusters, covariance_type='full')),
)

fig = plt.figure(figsize=(15, 4))
plt.subplots_adjust(left=.02, right=.98, bottom=.01, top=.96, wspace=.1, hspace=.1)

# Lookup table from cluster label to a single-letter matplotlib color; the
# palette is repeated so that larger label values still map to a color.
cluster_colors = np.array(list('bgrcmykbgrcmyk' * 20))

# Fit each algorithm and draw its cluster assignment in its own subplot.
for i, (name, algo) in enumerate(algorithms):
    algo.fit(X)
    # Estimators without a `labels_` attribute are asked to predict instead.
    # The builtin `int` replaces `np.int`, which was removed in NumPy 1.24.
    y_pred = algo.labels_.astype(int) if hasattr(algo, 'labels_') else algo.predict(X)

    plt.subplot(1, len(algorithms), i + 1)
    plt.title(name, size=14)
    # Only the first two feature columns are shown; points are colored by cluster.
    plt.scatter(X[:, 0], X[:, 1], s=100, color=cluster_colors[y_pred], edgecolors='white')
    plt.xticks(())
    plt.yticks(())