### Review 
### Other clustering algorithms

In [None]:
import numpy as np
np.random.seed(0)
import pandas as pd
import hvplot.pandas
from sklearn import datasets
import matplotlib.pyplot as plt

In [None]:
# Generate the two-class "moons" toy dataset (500 samples, noise=0.05).
# random_state pins the draw so the notebook gives the same data on every run.
X, y = datasets.make_moons(n_samples=500, noise=0.05, random_state=1)

# Only the last expression of a cell is rendered, so the shape is printed
# explicitly instead of being left as a bare mid-cell expression (which
# was silently discarded).
print(X.shape)

# Last expression of the cell: show the label array.
y


In [None]:
# Scatter the two feature columns of X, colouring each point by its label in y.
feature_1, feature_2 = X[:, 0], X[:, 1]
plt.scatter(feature_1, feature_2, c=y)

In [None]:
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch

# Fit a two-cluster K-Means model on the moons data. `fit` returns the
# estimator itself, so construction and fitting are chained.
k_model = KMeans(n_clusters=2, random_state=0, n_init=10).fit(X)

# Predict a cluster label for every sample with the fitted model.
p = k_model.predict(X)

# Plot the samples coloured by their K-Means cluster label.
plt.scatter(X[:, 0], X[:, 1], c=p)


In [None]:
# Fit a two-cluster BIRCH model on the moons data (only Birch is fitted in
# this cell), then predict a cluster label for every sample.
birch_model = Birch(n_clusters=2).fit(X)
p1 = birch_model.predict(X)

# Plot the samples coloured by their BIRCH cluster label.
plt.scatter(X[:, 0], X[:, 1], c=p1)

### When do we use BIRCH?
- Large datasets
- Real-time (streaming) data that needs to be clustered dynamically



In [None]:
from sklearn.cluster import DBSCAN

# Fit DBSCAN on the moons data with eps=0.3.
dbscan_model = DBSCAN(eps=0.3).fit(X)

# The labels stored on the fitted model are what `fit_predict` returns.
p2 = dbscan_model.labels_

# Plot the samples coloured by their DBSCAN label.
plt.scatter(X[:, 0], X[:, 1], c=p2)

### Activity 1

### Dimensionality reduction - PCA (principal component analysis)

In [None]:
# Required imports
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans

## Load the Data Into a Pandas DataFrame

In [26]:
# Relative path of the CSV file to load
ccinfo_csv_path = Path("Resources/ccinfo_transformed_no_dummies_only_cust_segm.csv")

# Read the CSV into a Pandas DataFrame and preview its first rows
ccinfo_default_df = pd.read_csv(ccinfo_csv_path)
ccinfo_default_df.head()

Unnamed: 0,limit_bal,age,bill_amt,pay_amt,customer_segments
0,-1.117341,24,-0.660703,-0.542779,2
1,-0.349942,26,-0.63637,-0.463994,2
2,-0.580162,34,-0.416808,-0.354013,0
3,-0.887121,37,-0.080152,-0.402077,0
4,-0.887121,57,-0.396855,0.523771,1


In [None]:
# Scatter "limit_bal" (x) against "age" (y), grouping points by customer segment
limit_vs_age_axes = dict(x="limit_bal", y="age")
ccinfo_default_df.hvplot.scatter(by="customer_segments", **limit_vs_age_axes)

In [None]:
# Scatter "bill_amt" (x) against "pay_amt" (y), grouping points by customer segment
bill_vs_pay_axes = dict(x="bill_amt", y="pay_amt")
ccinfo_default_df.hvplot.scatter(by="customer_segments", **bill_vs_pay_axes)

## Use PCA to reduce the number of factors 

In [None]:
# Import StandardScaler from sklearn.preprocessing and PCA from sklearn.decomposition
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [27]:
# Remove the "customer_segments" column so it is not fed into scaling and PCA.
# Reassigning the result (instead of inplace=True) together with
# errors="ignore" keeps this cell idempotent: re-running it after the column
# is already gone is a no-op rather than a KeyError.
ccinfo_default_df = ccinfo_default_df.drop(
    columns="customer_segments", errors="ignore"
)

## PCA explained variance ratio

In [28]:
# Standardize the feature columns with StandardScaler.
# `fit_transform` fits the scaler on the data and returns the scaled values
# in one call. The previous `scaler.b(...)` is not a StandardScaler method
# and raised AttributeError.
scaler = StandardScaler()
ccinfo_default_lst = scaler.fit_transform(ccinfo_default_df)

# Show the scaled values (a NumPy array, per the recorded output).
ccinfo_default_lst

array([[-1.1173411 , -1.22679159, -0.66070266, -0.5427793 ],
       [-0.3499424 , -1.01094155, -0.63637003, -0.46399421],
       [-0.58016201, -0.14754142, -0.41680786, -0.35401308],
       ...,
       [ 0.26397655,  1.03963376,  1.1152494 , -0.16349243],
       [ 1.10811512,  0.93170874,  3.33813208,  0.76045505],
       [-0.04298292,  0.50000868, -0.66917611, -0.4872953 ]])

In [29]:
# Wrap the scaled values in a DataFrame, reusing the column labels of the
# unscaled frame so each scaled column keeps its feature name.
ccinfo_default_df_scaled = pd.DataFrame(
    data=ccinfo_default_lst,
    columns=ccinfo_default_df.columns,
)

# Display the scaled DataFrame
ccinfo_default_df_scaled

Unnamed: 0,limit_bal,age,bill_amt,pay_amt
0,-1.117341,-1.226792,-0.660703,-0.542779
1,-0.349942,-1.010942,-0.636370,-0.463994
2,-0.580162,-0.147541,-0.416808,-0.354013
3,-0.887121,0.176234,-0.080152,-0.402077
4,-0.887121,2.334734,-0.396855,0.523771
...,...,...,...,...
4994,-1.117341,0.068309,-0.392558,-0.422089
4995,0.110497,-0.147541,-0.589218,-0.152107
4996,0.263977,1.039634,1.115249,-0.163492
4997,1.108115,0.931709,3.338132,0.760455


## Creating the PCA DataFrame

In [30]:
# Build a PCA reducer configured to keep three components.
pca = PCA(n_components=3)

# Fit the reducer on the scaled features and transform them in one step.
ccinfo_pca = pca.fit_transform(X=ccinfo_default_df_scaled)

# Show the transformed values.
ccinfo_pca

array([[-1.58206442e+00,  9.62349772e-01, -8.13432088e-03],
       [-1.03419484e+00,  7.37959636e-01,  3.48956768e-01],
       [-7.94360670e-01,  2.99270545e-03, -4.45076232e-02],
       ...,
       [ 8.88820486e-01, -7.76856084e-01, -9.65751984e-01],
       [ 3.07865321e+00, -1.79546893e-02, -2.01104790e+00],
       [-5.54318721e-01, -7.26227686e-01,  2.84518572e-01]])

In [32]:
# Inspect the fitted PCA's `components_` attribute. The recorded output is a
# 3 x 4 array -- presumably one row per retained component and one column per
# input feature (confirm against the scikit-learn PCA documentation).
pca.components_

array([[ 0.57570581,  0.221188  ,  0.53865898,  0.57400801],
       [-0.05328826, -0.95007948,  0.2313744 ,  0.20242338],
       [ 0.50399295, -0.15764597, -0.79422425,  0.30057725]])

In [34]:
# Show the fitted PCA's explained variance ratio. It is printed explicitly
# because only a cell's last expression is rendered; as a bare mid-cell
# expression the value was silently discarded.
print(pca.explained_variance_ratio_)

# Create the PCA DataFrame. Column names are derived from the number of
# columns in the transformed array (pca1, pca2, pca3 for the three components
# used here), so they stay in sync if n_components changes.
pca_column_names = [f"pca{i}" for i in range(1, ccinfo_pca.shape[1] + 1)]
ccinfo_pca_df = pd.DataFrame(ccinfo_pca, columns=pca_column_names)

# Review the PCA DataFrame
ccinfo_pca_df

Unnamed: 0,pca1,pca2,pca3
0,-1.582064,0.962350,-0.008134
1,-1.034195,0.737960,0.348957
2,-0.794361,0.002993,-0.044508
3,-0.745710,-0.220098,-0.532082
4,0.092573,-2.156708,-0.342539
...,...,...,...
4994,-1.081888,-0.181626,-0.388992
4995,-0.373719,-0.032832,0.501200
4996,0.888820,-0.776856,-0.965752
4997,3.078653,-0.017955,-2.011048


## Incorporating the PCA DataFrame into the elbow method

In [36]:
# Create a list to store inertia values and the candidate values of k (1-10)
inertia = []
k = list(range(1, 11))

# Fit one KMeans model per candidate k and record the value of its
# `inertia_` attribute
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1, n_init=10)
    k_model.fit(ccinfo_pca_df)
    inertia.append(k_model.inertia_)

# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_data = {'k': k, 'inertia': inertia}
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame. `display` (available without import inside
# IPython/Jupyter) is needed here: a bare `df_elbow` in the middle of the
# cell is never rendered, only the cell's last expression is.
display(df_elbow)

# Plot the Elbow Curve
df_elbow.hvplot.line(x='k', y='inertia', title='Elbow Curve', xticks=k)

## Segmentation of the PCA data with KMeans

In [37]:
# Define the model with 3 clusters
model = KMeans(n_clusters=3, random_state=1, n_init=10)
# Fit the model on the PCA DataFrame
model.fit(ccinfo_pca_df)
# Predict a cluster label for every row
k_3 = model.predict(ccinfo_pca_df)

# Build the predictions DataFrame: the PCA columns plus a
# "customer_segments" label column (`assign` returns a new frame, leaving
# ccinfo_pca_df untouched), followed by the columns of ccinfo_default_df
# placed side by side (axis=1).
ccinfo_pca_predictions_df = pd.concat(
    [ccinfo_pca_df.assign(customer_segments=k_3), ccinfo_default_df],
    axis=1,
)

# Preview the combined frame. `display` (available without import inside
# IPython/Jupyter) is needed because a bare mid-cell expression is never
# rendered; `.head()` avoids dumping the entire frame.
display(ccinfo_pca_predictions_df.head())

# Plot the clusters on the second and third principal components
ccinfo_pca_predictions_df.hvplot.scatter(
    x='pca2',
    y='pca3',
    by='customer_segments'
)

In [None]:
# Plot the clusters in 3d
import plotly.express as px

# Map each PCA column to one spatial axis; colour points by cluster label.
pca_axes = {"x": "pca1", "y": "pca2", "z": "pca3"}
fig = px.scatter_3d(
    ccinfo_pca_predictions_df,
    color="customer_segments",
    **pca_axes,
)

fig.show()

### activity 3,4

### matrix calculation

In [44]:
import numpy as np

# Left operand: 3 rows x 2 columns.
# NOTE: this rebinds `X`, which earlier in the notebook held the moons data.
X = np.array([[1, 2],
              [3, 4],
              [5, 6]])

# Right operand: 2 rows x 5 columns.
W = np.array([[.5, 1, 2, 3, 4],
              [.1, 1, 2, 3, 4]])

# Print the operand shapes explicitly: a bare `X.shape` in the middle of a
# cell is never rendered, only the cell's last expression is.
print(X.shape, W.shape)

# Matrix product: (3, 2) @ (2, 5) -> (3, 5). The inner dimensions must match.
Y = X @ W

Y.shape

(3, 5)

### Shape rule: X (3, 2) × W (2, 1) → (3, 1); X (3, 2) × W (2, 5) → (3, 5); (5000, 512) × (512, 1000) → (5000, 1000)