In [None]:
# Imports
from utils import *

Data Ingestion

In [None]:
# Load the Fortune 500 (2017) CSV from the current working directory.
csv_name = 'Fortune 500 2017 - Fortune 500.csv'
base_path = os.getcwd()
path = os.path.join(base_path, csv_name)
# 'Rank' is derived from Revenues, so it adds no information of its own.
frt_500 = pd.read_csv(path).drop(columns='Rank')

Feature Engineering

In [None]:
def feature_engg_chain(data,target,feature_sc):
    '''
    perform feature engg and return one frame holding the independent vars and the dependent var
    params: data -> input dataframe w/o target var. Must contain the columns Hqzip, Prftchange,
                    Sector, Industry, Profits, Totshequity and Assets. It is copied first, so the
                    caller's frame is never modified.
    params: target -> target var/ dependent var. A Series (or a DataFrame with a 'Revenues' column)
                      is aligned to `data` by index label; any other array-like is paired with the
                      rows of `data` by position.
    params: feature_sc -> sklearn scaler object; it is fitted on the numeric features inside this function
    returns: data_sd -> dataframe indexed like `data`, containing the scaled numerical features,
                        the label-encoded (unscaled) 'sector' and 'industry' columns and the
                        log10-transformed (not standardized) target in column 'Revenues'
    raises: ValueError -> if a positional (non-pandas) target does not have one value per row of `data`
    '''

    # Work on a copy: callers usually pass a slice of a larger frame, and the
    # column assignments below must not leak back into it.
    data = data.copy()

    # Some transformations
    # 1. HQZIP shouldn't be int64 - it is a categorical variable (zfill restores leading zeros)
    data['Hqzip'] = data['Hqzip'].astype('str').str.zfill(5)
    # 2. Prftchange shouldn't be an object - it is a float. Strip thousands separators
    #    element-wise so that a single non-string value (e.g. NaN) does not cause the
    #    clean-up to be skipped for the whole column.
    data['Prftchange'] = data['Prftchange'].map(
        lambda x: x.replace(',', '') if isinstance(x, str) else x
    ).astype('float64')

    # Does Geography play a role?
    # Encode zip into lat-long info with a single batched lookup instead of two lookups per row.
    nomi = pgeocode.Nominatim('us')
    geo = nomi.query_postal_code(data['Hqzip'].tolist())
    # The lookup result has its own index, so assign by position.
    data['lat'] = geo['latitude'].to_numpy()
    data['long'] = geo['longitude'].to_numpy()

    # cat columns also carry info that may be useful in predicting rev / clustering. Let's label encode them.
    l_encoded_sector = LabelEncoder().fit_transform(data['Sector'])
    l_encoded_industry = LabelEncoder().fit_transform(data['Industry'])

    # Numeric covariates (Hqzip is a string by now, so it is excluded)
    num_data = data.select_dtypes(include='number').copy()

    # Remove multicollinearity: express Profits and Totshequity relative to Assets
    num_data['Profits_per_asset'] = num_data['Profits']/num_data['Assets'] # profit per asset
    num_data['Totshequity_per_asset'] = num_data['Totshequity']/num_data['Assets'] # Totshequity per asset
    num_data = num_data.drop(columns = ['Profits' , 'Totshequity'])

    # Scaling independent vars. Every piece below is rebuilt on data.index so that the
    # final concat pairs rows correctly even when `data` is a shuffled subset
    # (e.g. the output of train_test_split).
    num_data_sd = pd.DataFrame(
        feature_sc.fit_transform(num_data),
        columns = num_data.columns,
        index = data.index,
    )

    # Log-transform the target and bring it onto the same index as the features
    if isinstance(target, pd.DataFrame):
        target = target['Revenues']
    if isinstance(target, pd.Series):
        target_log = np.log10(target.reindex(data.index))
    else:
        target_values = np.asarray(target, dtype = 'float64').ravel()
        if len(target_values) != len(data):
            raise ValueError(
                f"target has {len(target_values)} values but data has {len(data)} rows"
            )
        target_log = pd.Series(np.log10(target_values), index = data.index)

    data_sd = pd.concat([
        num_data_sd,
        pd.Series(l_encoded_sector , name = 'sector' , index = data.index) ,
        pd.Series(l_encoded_industry , name = 'industry' , index = data.index) ,
        target_log.rename('Revenues')
    ], axis=1)
    return data_sd

In [None]:
# Scaler fitted on the full data set; kept around so the scaling can be undone later.
feature_sc = StandardScaler()
data_sd = feature_engg_chain(
    frt_500.drop(columns="Revenues"),  # every column except the target
    frt_500.Revenues,
    feature_sc,
)

In [None]:
# Preview the engineered frame (size + first rows) rather than rendering all of it.
print(data_sd.shape)
data_sd.head()

In [None]:
# Box plot + histogram of Revenues, first on the raw scale and then on the log10 scale.
fig, ax = plt.subplots(1, 4, figsize=(20, 3))

revenue_views = [
    (frt_500.Revenues, 'Revenues'),
    (np.log10(frt_500.Revenues), 'Log10 Revenues'),
]

for view_idx, (rev_series, axis_label) in enumerate(revenue_views):
    # Each view occupies two neighbouring panels: box plot, then histogram.
    box_ax = ax[2 * view_idx]
    hist_ax = ax[2 * view_idx + 1]

    box_ax.boxplot(rev_series)
    box_ax.set_xticklabels(['Revenues'])
    box_ax.set_ylabel(axis_label)

    hist_ax.hist(rev_series, bins=50)
    hist_ax.set_xlabel(axis_label)
    hist_ax.set_ylabel('Number of companies')

### **Clustering**

In [None]:
# First three rows of the frame that goes into PCA.
data_sd.iloc[:3]

In [None]:
# Project the engineered frame onto its first n principal components.
n = 2
pca = PCA(n_components=n)
prin_comp = pca.fit_transform(data_sd)

pc_names = [f'PC{idx}' for idx in range(1, n + 1)]
prin_comp_df = pd.DataFrame(prin_comp, columns=pc_names)

# Shape of the projection, per-component explained variance ratio and their total.
# Author's note on the printed output: about 98% var is explained.
explained = pca.explained_variance_ratio_
print(prin_comp.shape, explained, sum(explained))

In [None]:
# Two-stop colour ramp (amber -> red) used to shade points by log10 revenue.
colour_stops = [(0, '#ffc400'), (1, '#ff0000')]
cmap = LinearSegmentedColormap.from_list('custom blue', colour_stops, N=100)

fig, ax = plt.subplots()

# Scatter of the two principal components, coloured by the log10 revenue column.
ax1 = ax.scatter(
    prin_comp_df.PC1,
    prin_comp_df.PC2,
    c=data_sd['Revenues'],
    s=5,
    cmap=cmap,
)
ax.grid(alpha=0.8)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')

cbar = fig.colorbar(ax1)
cbar.set_label('Log10 Revenue')

In [None]:
# Scale Features before clustering.
# Select the principal-component columns explicitly: the K-Means cell further down
# adds a 'cluster' column to prin_comp_df in place, and re-running this cell afterwards
# would otherwise scale that label column too and break the two-column plots.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(prin_comp_df.filter(regex=r'^PC\d+$'))

DBSCAN

In [None]:
# k-distance plot used to read off a value for DBSCAN's eps.
# kneighbors() on the fitted points returns each point as its own first neighbour
# (first column is all zeros), so with n_neighbors = n the last column (index n-1)
# holds the distance to the (n-1)th *other* point.
n = 10
neighbours = NearestNeighbors(n_neighbors=n).fit(scaled_features)
# Find the k-neighbors of every point (indices are not needed here)
neighbours_dist, neighbours_idx = neighbours.kneighbors(scaled_features)

k_dist = np.sort(neighbours_dist[: , n-1])

plt.plot(k_dist)
# Dashed line marks 0.26, the value used as eps in the DBSCAN fit that follows
plt.axhline(y=0.26, linewidth=1, linestyle='dashed', color='k')
plt.ylabel("k-NN distance")
plt.xlabel(f"Sorted observations ({n-1}th NN)")
plt.show()

In [None]:
# eps is the level marked on the k-distance plot; min_samples matches the k used there
clusters = DBSCAN(eps=0.26, min_samples=10).fit(scaled_features)
# get cluster population
counted = Counter(clusters.labels_)
# DBSCAN labels noise points -1; default to 0 so "no noise" displays as 0 rather than None
counted.get(-1, 0)

In [None]:
# Scatter of the scaled principal components, coloured by DBSCAN label.
sns.set(rc={'figure.figsize': (8, 4), 'legend.fontsize': 8.5})
pc_frame = pd.DataFrame(scaled_features, columns=["PC1", "PC2"])
p = sns.scatterplot(
    data=pc_frame,
    x="PC1",
    y="PC2",
    hue=clusters.labels_,
    palette="deep",
    legend="full",
)
# Move the legend just outside the upper-right corner of the axes.
sns.move_legend(p, "upper right", bbox_to_anchor=(1.17, 1), title='Clusters')
plt.show()

In [None]:
import itertools

# Grid Search through DBSCAN params
min_samples = [5,10,20,25,30,50]
eps_values = [0.01,0.05,0.1,0.25,0.4]

# (min_samples, eps) -> number of points DBSCAN labelled as noise (-1)
outliers = {}

for n_min , radius in itertools.product(min_samples , eps_values):
    clusters = DBSCAN(eps=radius, min_samples=n_min).fit(scaled_features)
    # get cluster population
    counted = Counter(clusters.labels_)
    # Default to 0: a fit without any noise has no -1 key, and storing None
    # would make the min() below fail when comparing None with an int.
    outliers[(n_min , radius)] = counted.get(-1, 0)

# Combination with least outliers (ties resolve to the first combination tried).
# NOTE(review): minimising noise alone tends to favour the loosest setting and can
# merge everything into one cluster - confirm this is the intended criterion.
print(combo := min(outliers , key = outliers.get))
samples , eps = combo

In [None]:
# Re-fit DBSCAN with the (min_samples, eps) pair picked by the grid search.
best_dbscan = DBSCAN(min_samples=samples, eps=eps)
clusters = best_dbscan.fit(scaled_features)
# get cluster population
counted = Counter(clusters.labels_)

# Same scatter as before, now coloured by the labels of the selected fit.
sns.set(rc={'figure.figsize': (8, 4), 'legend.fontsize': 11.5})
pc_frame = pd.DataFrame(scaled_features, columns=["PC1", "PC2"])
p = sns.scatterplot(
    data=pc_frame,
    x="PC1",
    y="PC2",
    hue=clusters.labels_,
    palette="deep",
    legend="full",
)
sns.move_legend(p, "upper right", bbox_to_anchor=(1.17, 1), title='Clusters')
plt.show()

K-Means

In [None]:
# K-Means settings shared by every fit in the elbow search.
kmeans_kwargs = dict(
    init="random",
    n_init=10,
    max_iter=300,
    random_state=SEED,
)

# Inertia (within-cluster sum of squared errors) for k = 1 .. 10
candidate_ks = range(1, 11)
sse = [
    KMeans(n_clusters=k, **kmeans_kwargs).fit(scaled_features).inertia_
    for k in candidate_ks
]

plt.plot(candidate_ks, sse)
plt.xticks(candidate_ks)
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia")
# Dashed guide at k = 5, the cluster count used for the final fit
plt.vlines(5, ymin=0, ymax=2000, ls='--', color='gray')
plt.ylim(0, 2000)
plt.show()

In [None]:
# Final K-Means fit with k = 5 (the value marked on the elbow plot). Reuse the
# settings dict from the elbow search so both fits cannot drift apart.
kmeans = KMeans(n_clusters=5, **kmeans_kwargs)
kmeans.fit(scaled_features)

labels = kmeans.labels_

# Attach the labels to the PC frame (re-assigning on a re-run just overwrites the column)
prin_comp_df['cluster'] = labels
print(prin_comp_df.groupby('cluster').agg({'cluster':'count'}))

fig , ax = plt.subplots()

# One scatter series per cluster; groupby iterates in sorted label order, so the
# colours and legend entries are stable between runs.
for cluster, members in prin_comp_df.groupby('cluster'):
    ax.scatter(members.PC1 , members.PC2 , label = f'cluster no.:-> {cluster}' , s = 10)

# Axis decoration does not depend on the cluster, so it is applied once
ax.grid(alpha = 0.8)
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.legend(loc = 'best', bbox_to_anchor = (1,1));

### **Cluster Profiling**

In [None]:
# Undo the scaling / log transform so the clusters can be profiled in original units.
# The column order below has to match the order feature_sc was fitted with.
num_cov = [
    'Employees', 'Revchange', 'Prftchange', 'Assets',
    'lat', 'long',
    'Profits_per_asset', 'Totshequity_per_asset',
]
unscaled_values = feature_sc.inverse_transform(data_sd[num_cov])
num_data = pd.DataFrame(unscaled_values, columns=num_cov)

# Revenues were stored as log10, so raise 10 to that power to get them back
y = 10**(data_sd.Revenues)

sector_industry = data_sd[['sector', 'industry']]
cluster_labels = pd.Series(labels, name='cluster')

# Side-by-side frame: company identifiers, unscaled features, encoded categories,
# cluster label and revenue.
profile_parts = [
    frt_500.Title,
    frt_500.Sector,
    num_data,
    sector_industry,
    cluster_labels,
    y,
]
data_clustered = pd.concat(profile_parts, axis=1)

In [None]:
# Number of companies per (cluster, Sector) pair ...
cluster_sector_counts = (
    data_clustered
    .groupby(['cluster', 'Sector'])
    .agg({'Title': 'count'})
    .reset_index()
)
# ... laid out with sectors as rows and clusters as columns; missing pairs become 0.
cluster_sector_counts.pivot_table(index='Sector', columns='cluster').fillna(0)

In [None]:
# Grid of density histograms: one row per numeric column, one column per cluster.
numeric_cols = ['Employees', 'Revchange', 'Prftchange', 'Assets', 'Profits_per_asset', 'Totshequity_per_asset', 'Revenues']

cluster_ids = data_clustered.cluster.unique()
fig, ax = plt.subplots(len(numeric_cols), data_clustered.cluster.nunique(), figsize=(20, 20))

for axes_row, col_name in zip(ax, numeric_cols):
    for cluster_id in cluster_ids:
        # The cluster label doubles as the column position inside the grid
        panel = axes_row[cluster_id]
        in_cluster = data_clustered['cluster'] == cluster_id
        panel.hist(data_clustered.loc[in_cluster, col_name], bins=20, density=True)
        panel.set_xlabel(f'Cluster {cluster_id}')
        panel.set_ylabel(f'{col_name}')

### **Regression**

In [None]:
# Features are every column except the target
X = frt_500.drop(columns="Revenues")
y = frt_500["Revenues"]

# 70/30 split; seeded so the split (and everything downstream) is reproducible on re-run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

In [None]:
# Size of the training split and its first two rows.
print(X_train.shape)
X_train.iloc[:2]

In [None]:
# Same preview for the full (unsplit) feature frame, for comparison with X_train.
all_features = frt_500.loc[:, frt_500.columns != "Revenues"]
print(all_features.shape)
all_features.head(2)

In [None]:
# Separate scaler fitted on the training split only.
train_scaler = StandardScaler()
# Pass a copy so X_train keeps its raw columns regardless of how feature_engg_chain
# treats its input, and keep the result so later cells can use it.
train_sd = feature_engg_chain(X_train.copy(), y_train, train_scaler)
train_sd.head()