# Read the data

Dataset: https://archive.ics.uci.edu/ml/datasets/Poker+Hand

In [None]:
import pandas as pd

# Keep displayed frames short: show at most 10 rows.
pd.set_option('display.max_rows', 10)

# header=None tells pandas that no row of the file is a header, so the column
# names are supplied explicitly: five (S, C) pairs followed by the CLASS label.
poker_columns = ['S1', 'C1', 'S2', 'C2', 'S3', 'C3', 'S4', 'C4', 'S5', 'C5', 'CLASS']
data_train = pd.read_csv("poker.txt", header=None, names=poker_columns)


In [None]:
data_train

In [None]:
# Feature columns for clustering: everything except the CLASS label.
cluster = data_train.drop(columns='CLASS')
cluster

# Standardize the data (preprocessing)

In [None]:
from sklearn import preprocessing

# Standardize the clustering variables to mean=0 and sd=1 so that card suit and
# rank are on the same scale and contribute equally to the analysis.
# Work on a copy so the unscaled `cluster` frame stays available.
clustervar = cluster.copy()

# One loop over the columns replaces ten copy-pasted per-column statements;
# each column is still cast to float64 and scaled on its own.
for column in clustervar.columns:
    clustervar[column] = preprocessing.scale(clustervar[column].astype('float64'))

# Alias (same object, not a copy) used by the cells below.
clus_train = clustervar

In [None]:
clus_train

# K-means

In [None]:
from scipy.spatial.distance import cdist

In [None]:
from sklearn.cluster import KMeans

In [None]:
# init='random' selects the initial centroids at random (the learning procedure's
# starting point); random_state fixes the seed so the run is reproducible.
# n_init is spelled out so the number of restarts does not depend on the
# installed scikit-learn version's default and no FutureWarning is raised.
model = KMeans(n_clusters=3, init='random', n_init=10, random_state=22)

model.fit(clus_train)  # learn the centroids on the training data

clusassign = model.predict(clus_train)  # cluster index assigned to each row

print(clusassign[:30])  # cluster of the first 30 rows
model.labels_[:100]

In [None]:
len(clusassign)

### evaluation

In [None]:
from sklearn import metrics

# The silhouette coefficient is computed on a random subsample of 1000 rows.
# random_state pins that subsample so the printed score is the same on every run.
score = metrics.silhouette_score(clus_train, model.labels_,
                                 sample_size=1000, random_state=22)
print(score)

The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar.

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html

# PCA

In [None]:
from sklearn.decomposition import PCA  # PCA from the sklearn package

# Reduce the data to its first two components (here from 10 dimensions down
# to 2) so the clusters can be drawn in a plane.
pca_2 = PCA(n_components=2)

# Fit the PCA on the training data and project it in a single step.
plot_columns = pca_2.fit_transform(clus_train)
plot_columns

## plot

In [None]:
import matplotlib.pyplot as plt  # pyplot is the supported interface; pylab is discouraged

# One point per row, positioned by its two PCA components and coloured by its
# K-means cluster label (one colour per cluster).
plt.scatter(x=plot_columns[:, 0], y=plot_columns[:, 1], c=model.labels_)
plt.xlabel('Canonical variable 1')  # first component (canonical = transformed)
plt.ylabel('Canonical variable 2')  # second component
# Take the cluster count from the fitted model so the title cannot drift from it.
plt.title('Scatterplot of Canonical Variables for {} Clusters'.format(model.n_clusters))
plt.show()
# a color per cluster

In [None]:
import matplotlib.pyplot as plt  # pyplot is the supported interface; pylab is discouraged

# Same scatter plot as before, but with semi-transparent markers (alpha=0.25)
# so that overlapping points remain visible; one colour per cluster.
plt.scatter(x=plot_columns[:, 0], y=plot_columns[:, 1], c=model.labels_, alpha=0.25)
plt.xlabel('Canonical variable 1')  # first component (canonical = transformed)
plt.ylabel('Canonical variable 2')  # second component
# Take the cluster count from the fitted model so the title cannot drift from it.
plt.title('Scatterplot of Canonical Variables for {} Clusters'.format(model.n_clusters))
plt.show()
# a color per cluster

# Topic modeling
Assignment is in this area <br>
further info: https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# TF-IDF vectorizer: English stop words are removed and the vocabulary is
# capped at 1000 features.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

In [None]:
# Read both text files completely into memory.
# The `with` statement closes each file when the block exits, so no explicit
# close() calls are needed afterwards.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm it matches how the files were saved.
with open('politics.txt','r') as pfh, open('sports.txt','r') as sfh:
    politics = pfh.read()
    sports = sfh.read()

In [None]:
politics

In [None]:
sports

In [None]:
# Two-document corpus: index 0 is the politics text, index 1 the sports text.
data = [politics,sports]
data

In [None]:
# tfidf transformation: fit the vectorizer on the corpus and transform the
# corpus into its TF-IDF matrix in one step
tfidf = tfidf_vectorizer.fit_transform(data)
tfidf.data[:50]  # peek at the first 50 entries of the matrix's .data array

In [None]:
# Fit an LDA topic model on the TF-IDF matrix.
#   n_components -> number of topics to be found
#   random_state -> seed
# doc_topic_prior (alpha) and topic_word_prior (beta) are not passed, so the
# library defaults apply (1 / n_components for both).
lda = LatentDirichletAllocation(
    n_components=10,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(tfidf)  # fit() returns the fitted estimator itself

lda.components_

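Each row of the matrix above is one topic, and each column corresponds to one word in the vocabulary printed below (e.g. the first value, 0.9064..., belongs to the first word, '10').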

In [None]:
# Vocabulary learned by the vectorizer, as a plain list of strings.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. The fallback keeps the cell working on older versions.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print(tfidf_feature_names[:20])

## print out top 10 words

In [None]:
import numpy as np
# Small worked example of argsort, which is used later to pick the top words.
x =[3,5,7,1,0,9,20,5,13,6]
a = np.argsort(x) # indices that would sort x in ascending order
print(a)  # the index of the smallest value comes first, e.g. index 4 holds the value 0

In [None]:
print(a[:-3]) # everything except the last three indices

In [None]:
print(a[:-3 : -1]) # walk backwards from the end and stop before position -3: the indices of the two largest values, largest first

In [None]:
# For every topic (one row of lda.components_) print its 10 highest-weighted words.
# argsort() orders the word indices by ascending weight; reversing that order and
# keeping the first 10 yields the top words, strongest first.
for topic_idx, topic in enumerate(lda.components_):
    strongest_first = topic.argsort()[::-1]
    top_words = [tfidf_feature_names[i] for i in strongest_first[:10]]
    print('Topic:',topic_idx,'--',top_words)

## evaluate based on perplexity (Ratlosigkeit, Verwirrung)

In [None]:
# Fit one LDA model per candidate topic count and record its perplexity on the
# same TF-IDF matrix it was fitted on.
# The candidate models get their own name so that the 10-topic `lda` fitted
# earlier is not overwritten (the top-words cell reads it).
pscores = []
for n_topic in [5,10,15,20,30]:
    lda_candidate = LatentDirichletAllocation(n_components=n_topic, max_iter=5, random_state=7)

    lda_candidate.fit(tfidf)

    perplexity_score = lda_candidate.perplexity(tfidf)
    print(perplexity_score)
    pscores.append(perplexity_score)

# Lower perplexity is better. It is exp(-log-likelihood per word), so it is
# always positive and can never reach 0.

In [None]:
# Perplexity against the number of topics, drawn as red '+' markers joined by
# a dashed line.
fig, ax = plt.subplots()
ax.plot([5,10,15,20,30], pscores, 'r+--')
ax.set_xlabel('# of topics')
ax.set_ylabel('Perplexity score')
plt.show()

Hint: 1e245 = 10^245

**Interpretation: up to 20 topics are reasonable, but no more!**

Practical tips:
- use this for fewer than 10'000 documents: https://pypi.org/project/lda/
- use this if there are more than 10'000 documents: https://radimrehurek.com/gensim/models/ldamodel.html