In [133]:
# Initial imports
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

In [134]:
# Load Dataset
file_path = r"C:\Users\Stephen\Desktop\Class Projects\Cryptocurrencies\iris.csv"
iris_df = pd.read_csv(file_path)
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [135]:
# Drop the "class" field
new_iris_df = iris_df.drop(['class'], axis=1)
new_iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [136]:
#Reorder columns
reorder_iris_df = new_iris_df[['sepal_length', 'petal_length', 'sepal_width', 'petal_width']]
reorder_iris_df.head()

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [137]:
# Dataframe saved on new csv file.
output_file_path = r"C:\Users\Stephen\Desktop\Class Projects\Cryptocurrencies\new_iris_data.csv"
reorder_iris_df.to_csv(output_file_path, index=False)

In [138]:
# Data loading
file_path = r"C:\Users\Stephen\Desktop\Class Projects\Cryptocurrencies\shopping_data.csv"
df_shopping = pd.read_csv(file_path, encoding='ISO-8859-1')
df_shopping.head(5)

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [139]:
# Columns method
df_shopping.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [140]:
# List of datafram data types
df_shopping.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [141]:
# Check for missing values
for column in df_shopping.columns:
    print(f"Column {column} has {df_shopping[column].isnull().sum()} null values")

Column CustomerID has 0 null values
Column Card Member has 2 null values
Column Age has 2 null values
Column Annual Income has 0 null values
Column Spending Score (1-100) has 1 null values


In [142]:
# Drop null rows
df_shopping = df_shopping.dropna()

In [143]:
# Find duplicate entries
print(f"Duplicate entries: {df_shopping.duplicated().sum()}")

Duplicate entries: 0


In [144]:
# Remove customerID column
df_shopping.drop(columns=['CustomerID'], inplace=True)
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [145]:
# Transform string column
def change_string(member):
    if member == "Yes":
        return 1
    else: 
        return 0
df_shopping['Card Member'] = df_shopping['Card Member'].apply(change_string)
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [146]:
# Transform annual income
df_shopping["Annual Income"] = df_shopping['Annual Income']/1000
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [147]:
df_shopping = df_shopping.rename(columns={'Card Member':'Member', 'Annual Income':'Income', 'Spending Score (1-100)':'Score'})
df_shopping.head()

Unnamed: 0,Member,Age,Income,Score
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [148]:
# Saving cleaned Data
file_path = r"C:\Users\Stephen\Desktop\Class Projects\Cryptocurrencies\shopping_data_cleaned.csv"
df_shopping.to_csv(file_path, index=False)

In [149]:
# Load Dataset
file_path = r"C:\Users\Stephen\Desktop\Class Projects\Cryptocurrencies\new_iris_data.csv"
iris_df = pd.read_csv(file_path)
iris_df.head(10)

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2
5,5.4,1.7,3.9,0.4
6,4.6,1.4,3.4,0.3
7,5.0,1.5,3.4,0.2
8,4.4,1.4,2.9,0.2
9,4.9,1.5,3.1,0.1


In [150]:
# Initializing model with K = 3 (since we already know there are three classes of iris plants)
model = KMeans(n_clusters=3, random_state=5)
model

KMeans(n_clusters=3, random_state=5)

In [151]:
# Fitting model
model.fit(iris_df)

KMeans(n_clusters=3, random_state=5)

In [152]:
# Get predictions
predictions = model.predict(iris_df)
print(predictions)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
 2 0]


In [153]:
# Add a new class column to iris_df
iris_df['class'] = model.labels_
iris_df.head()

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width,class
0,5.1,1.4,3.5,0.2,1
1,4.9,1.4,3.0,0.2,1
2,4.7,1.3,3.2,0.2,1
3,4.6,1.5,3.1,0.2,1
4,5.0,1.4,3.6,0.2,1


In [154]:
# Plotting the clusters with two features
iris_df.hvplot.scatter(x='sepal_length', y='sepal_width', by='class')

In [155]:
# Plotting the clusters with three features
fig = px.scatter_3d(iris_df, x="petal_width", y="sepal_length", z="petal_length", color="class", symbol="class", size="sepal_width",width=800)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

In [156]:
# Load data
file_path = r"C:\Users\Stephen\Desktop\Class Projects\Cryptocurrencies\shopping_data_cleaned.csv"
shopping_df = pd.read_csv(file_path)
shopping_df.head(10)

Unnamed: 0,Member,Age,Income,Score
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0
5,0,22.0,17.0,76.0
6,0,35.0,18.0,6.0
7,0,23.0,18.0,94.0
8,1,64.0,19.0,3.0
9,0,30.0,19.0,72.0


In [157]:
# See starting points
shopping_df.hvplot.scatter(x="Income", y="Score")

In [161]:
# Function to cluster and plot dataset
def test_cluster_amount(shopping_df, clusters):
   model = KMeans(n_clusters=clusters, random_state=5)
   # Fitting model
   model.fit(shopping_df)



In [162]:
# Get predictions
predictions = model.predict(shopping_df)
print(predictions)

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


In [163]:
# Add a new class column to shopping_df
shopping_df['class'] = model.labels_
shopping_df.head()

ValueError: Length of values does not match length of index