# K-Means Clustering
In diesem Notebook besprechen wir das Vorgehen von K-Means Clustering

## Loading Packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
import plotly.express as px

## Loading Data

In [6]:
X_train = pd.read_csv('Xtrain_tvshows_featured.csv')
X_test = pd.read_csv('Xtest_tvshows_featured.csv')
y_train = pd.read_csv('ytrain_tvshows.csv')
y_test = pd.read_csv('ytest_tvshows.csv')
print("Shape of X Train: {}".format(X_train.shape))
print("Shape of X Test: {}".format(X_test.shape))
print("Shape of y Train: {}".format(y_train.shape))
print("Shape of y Test: {}".format(y_test.shape))

Shape of X Train: (4294, 10)
Shape of X Test: (1074, 10)
Shape of y Train: (4294, 1)
Shape of y Test: (1074, 1)


In [9]:
# loading the data
data = pd.read_csv("tv_shows.csv")
X = data.drop(columns = ['Age'])
y = data['Age']
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0)

## Binding Target value with explaining variables

In [10]:
train = pd.concat([X_train.reset_index(drop=True),y_train], axis=1)
train.head()

Unnamed: 0,Year,IMDb,Rotten_Tomatoes,Year.1,IMDb.1,Rotten_Tomatoes.1,Netflix,Hulu,Prime Video,Disney+,Age_na
0,0.051282,0.6,0.311111,8,6.2,38.0,0,0,1,0,1
1,0.282051,0.694118,0.7,35,7.0,73.0,0,1,1,0,0
2,0.042735,0.717647,0.677778,7,7.2,71.0,1,0,0,0,0
3,0.034188,0.670588,0.0,6,6.8,10.0,0,0,1,0,1
4,0.0,0.576471,0.466667,2,6.0,52.0,0,1,0,0,0


In [11]:
train.shape

(4294, 11)

In [12]:
test = pd.concat([X_test.reset_index(drop=True),y_test], axis=1)
test.shape

(1074, 11)

In [14]:
train_orig = pd.concat([X_train_orig.reset_index(drop=True),y_train_orig], axis=1)
train_orig.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,Age
0,4771.0,5089.0,Curious?,2017.0,,13/100,0.0,0.0,1.0,0.0,1.0,18+
1,1239.0,1242.0,Prank Encounters,2019.0,5.2/10,48/100,1.0,0.0,0.0,0.0,1.0,
2,3662.0,3885.0,Dilbert,1999.0,7.3/10,58/100,0.0,0.0,1.0,0.0,1.0,18+
3,2838.0,2953.0,Little Mosque on the Prairie,2007.0,5.7/10,49/100,0.0,1.0,0.0,0.0,1.0,18+
4,2635.0,2736.0,The Titan Games,2019.0,7.0/10,55/100,0.0,1.0,0.0,0.0,1.0,


## Model

In [15]:
km = KMeans(
    n_clusters=3, init='random',
    n_init=10, max_iter=300,
    tol=1e-04, random_state=0
)
km.fit(train)

## Cluster Centers

In [16]:
cluster_centers = pd.DataFrame(km.cluster_centers_, columns=train.columns)
cluster_centers

Unnamed: 0,Year,IMDb,Rotten_Tomatoes,Year.1,IMDb.1,Rotten_Tomatoes.1,Netflix,Hulu,Prime Video,Disney+,Age_na
0,0.06572,0.682399,0.06421,9.689207,6.90039,15.778934,0.084525,0.204161,0.693108,0.037711,0.957087
1,0.064353,0.674792,0.388677,9.529275,6.835734,44.980948,0.423792,0.251859,0.30158,0.074814,0.413569
2,0.088029,0.765151,0.649405,12.299345,7.603787,68.446468,0.428259,0.431901,0.21413,0.067735,0.069192


## Predict

In [17]:
train_pred = train
train_pred['cluster_predict'] = km.predict(train)

In [18]:
train_pred.shape

(4294, 12)

In [19]:
train_pred.head()

Unnamed: 0,Year,IMDb,Rotten_Tomatoes,Year.1,IMDb.1,Rotten_Tomatoes.1,Netflix,Hulu,Prime Video,Disney+,Age_na,cluster_predict
0,0.051282,0.6,0.311111,8,6.2,38.0,0,0,1,0,1,1
1,0.282051,0.694118,0.7,35,7.0,73.0,0,1,1,0,0,2
2,0.042735,0.717647,0.677778,7,7.2,71.0,1,0,0,0,0,2
3,0.034188,0.670588,0.0,6,6.8,10.0,0,0,1,0,1,0
4,0.0,0.576471,0.466667,2,6.0,52.0,0,1,0,0,0,1


In [20]:
test_pred = test
test_pred['cluster_predict'] = km.predict(test)
test_pred.shape

(1074, 12)

In [21]:
train_orig_pred = train_orig
train_orig_pred = pd.concat([train_orig_pred.reset_index(drop=True),train_pred[['cluster_predict']]], axis=1)

In [22]:
train[['cluster_predict']]

Unnamed: 0,cluster_predict
0,1
1,2
2,2
3,0
4,1
...,...
4289,0
4290,0
4291,1
4292,1


## Interpreting Results

In [23]:
train[['cluster_predict']].groupby('cluster_predict').value_counts()

cluster_predict
0     769
1    2152
2    1373
dtype: int64

In [24]:
test[['cluster_predict']].groupby('cluster_predict').value_counts()

cluster_predict
0    192
1    532
2    350
dtype: int64

In [25]:
train.groupby('cluster_predict').describe()

Unnamed: 0_level_0,Year,Year,Year,Year,Year,Year,Year,Year,IMDb,IMDb,...,Disney+,Disney+,Age_na,Age_na,Age_na,Age_na,Age_na,Age_na,Age_na,Age_na
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
cluster_predict,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,769.0,0.06572,0.08072,0.0,0.034188,0.042735,0.068376,1.0,769.0,0.682399,...,0.0,1.0,769.0,0.957087,0.202793,0.0,1.0,1.0,1.0,1.0
1,2152.0,0.064353,0.081253,0.0,0.017094,0.042735,0.076923,0.666667,2152.0,0.674792,...,0.0,1.0,2152.0,0.413569,0.492587,0.0,0.0,0.0,1.0,1.0
2,1373.0,0.088029,0.101348,0.0,0.025641,0.051282,0.111111,0.598291,1373.0,0.765151,...,0.0,1.0,1373.0,0.069192,0.253872,0.0,0.0,0.0,0.0,1.0


In [26]:
res = train.groupby('cluster_predict').describe()
res = pd.DataFrame(res)
res.to_csv("Results_Clustering.csv",index=False)

In [27]:
train_orig_pred.groupby('cluster_predict').describe()

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,ID,ID,...,Disney+,Disney+,Type,Type,Type,Type,Type,Type,Type,Type
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
cluster_predict,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0.0,685.0,2650.176642,1561.070465,3.0,1279.0,2593.0,4034.0,5363.0,685.0,2779.672993,...,0.0,1.0,685.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
1.0,1882.0,2696.975027,1552.695429,0.0,1377.25,2698.0,4032.75,5367.0,1882.0,2829.336344,...,0.0,1.0,1882.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
2.0,1190.0,2635.507563,1531.399106,7.0,1276.0,2635.0,3926.25,5349.0,1190.0,2762.408403,...,0.0,1.0,1190.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


## Interpreting Results

In [30]:
def my_boxplot_fct(data,variable):
    fig = px.box(data, x="cluster_predict", y=variable)
    fig.show()


In [39]:
my_boxplot_fct(train_orig_pred, "Age")

In [38]:
my_boxplot_fct(train_orig_pred, "Hulu")

In [33]:
my_boxplot_fct(train_orig_pred, "IMDb")

In [37]:
my_boxplot_fct(train_orig_pred, "Year")

## Visualizing Model
Für die Resultate zu visualisieren müssen wir die vielen Variablen auf eine 2-dimensionale Darstellung bringen. Dafür gibt es so zu sagen Dimension Reduktionsverfahren. Hier verwenden wir das t-SNE Verfahren. Wir gehen nun nicht weiter in die Details ein.

In [36]:
#execudting the tsne method, set random state that you can get the same results as me
tsne = TSNE(random_state=42)
# use fit_transform instead of fit, as TSNE has no transform method
train_tsne = tsne.fit_transform(train)
# saving data as pandas data frame
df_tsne = pd.DataFrame(train_tsne, columns = ['TSNE_DIM1', 'TSNE_DIM2'])
#df_tsne.describe()
# joining the tsne transformed data to the original data
train_pred = pd.concat([train_pred,df_tsne], axis = 1)
train_pred.describe()

Unnamed: 0,Year,IMDb,Rotten_Tomatoes,Year.1,IMDb.1,Rotten_Tomatoes.1,Netflix,Hulu,Prime Video,Disney+,Age_na,cluster_predict,TSNE_DIM1,TSNE_DIM2,TSNE_DIM1.1,TSNE_DIM2.1
count,4294.0,4294.0,4294.0,4294.0,4294.0,4294.0,4294.0,4294.0,4294.0,4294.0,4294.0,4294.0,4294.0,4294.0,4294.0,4294.0
mean,0.072168,0.705047,0.413937,10.443642,7.092897,47.254308,0.364462,0.300885,0.343735,0.065906,0.400792,1.140661,0.466328,-0.49426,0.466328,-0.49426
std,0.08874,0.129606,0.217222,10.382548,1.101651,19.550018,0.481335,0.458696,0.475009,0.248146,0.490116,0.692215,40.114429,27.403143,40.114429,27.403143
min,0.0,0.0,0.0,2.0,1.1,10.0,0.0,0.0,0.0,0.0,0.0,0.0,-83.443626,-55.497547,-83.443626,-55.497547
25%,0.025641,0.635294,0.288889,5.0,6.5,36.0,0.0,0.0,0.0,0.0,0.0,1.0,-30.617568,-21.321123,-30.617568,-21.321123
50%,0.042735,0.729412,0.422222,7.0,7.3,48.0,0.0,0.0,0.0,0.0,0.0,1.0,1.412686,-0.17377,1.412686,-0.17377
75%,0.08547,0.788235,0.555556,12.0,7.8,60.0,1.0,1.0,1.0,0.0,1.0,2.0,34.332472,21.353398,34.332472,21.353398
max,1.0,1.0,1.0,119.0,9.6,100.0,1.0,1.0,1.0,1.0,1.0,2.0,73.294922,58.482059,73.294922,58.482059


In [35]:
# representing the data
fig = px.scatter(train_pred, x="TSNE_DIM1", y="TSNE_DIM2",color="cluster_predict",
                 labels= {
                     "TSNE_DIM1": "Dimension 1",
                     "TSNE_DIM2": "Dimension 2",
                     "cluster_predict": "Number of associated cluster"
                 },
                 title= "Representation of the dimension reduction by t-SNE")
fig.show()