# Modeling step

# Import tools

In [2]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances

# Import data

In [None]:
# game_ratings = pd.read_csv('/content/drive/MyDrive/Springboard DS/data/cleaned_game_ratings.csv', index_col=0)

# Bigram feature matrix exported from the Count Vectorizer step.
# NOTE(review): unlike the commented-out load above, no index_col is passed;
# confirm the CSV carries no saved index column.
bigram_cv = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Springboard DS/data/bigram_cv.csv',
)

In [None]:
# Trigram feature matrix; presumably the Count Vectorizer output, going by the
# `_cv` suffix of the file name.
trigram_cv = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Springboard DS/data/trigram_cv.csv',
)

In [None]:
# Bigram feature matrix exported from the TF-IDF Vectorizer step.
bigram_tf = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Springboard DS/data/bigram_tf.csv',
)

In [None]:
# Trigram feature matrix; presumably the TF-IDF Vectorizer output, going by the
# `_tf` suffix of the file name.
trigram_tf = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Springboard DS/data/trigram_tf.csv',
)

# Distance measures

## Binary matrix

### Hamming distance

In [None]:
# Row-by-row Hamming distance matrix for each of the four feature matrices.
# The generator is consumed in order, so the matrices are processed in the
# same sequence as the names on the left-hand side.
(
    hamming_bigram_cv,
    hamming_trigram_cv,
    hamming_bigram_tf,
    hamming_trigram_tf,
) = (
    pairwise_distances(X=feature_matrix, metric='hamming')
    for feature_matrix in (bigram_cv, trigram_cv, bigram_tf, trigram_tf)
)

### Jaccard distance

In [None]:
# Row-by-row Jaccard distance matrix for each of the four feature matrices.
# NOTE(review): the inputs are count / TF-IDF weights rather than 0/1 flags;
# confirm how scikit-learn treats non-binary values for this metric.
(
    jaccard_bigram_cv,
    jaccard_trigram_cv,
    jaccard_bigram_tf,
    jaccard_trigram_tf,
) = (
    pairwise_distances(X=feature_matrix, metric='jaccard')
    for feature_matrix in (bigram_cv, trigram_cv, bigram_tf, trigram_tf)
)

## Continuous matrix

### Euclidean distance

In [None]:
# Row-by-row Euclidean distance matrix for each of the four feature matrices.
(
    eucl_bigram_cv,
    eucl_trigram_cv,
    eucl_bigram_tf,
    eucl_trigram_tf,
) = (
    pairwise_distances(X=feature_matrix, metric='euclidean')
    for feature_matrix in (bigram_cv, trigram_cv, bigram_tf, trigram_tf)
)

### Cosine distance (1 − cosine similarity)

In [None]:
# Row-by-row cosine distance matrix for each of the four feature matrices.
(
    cosine_bigram_cv,
    cosine_trigram_cv,
    cosine_bigram_tf,
    cosine_trigram_tf,
) = (
    pairwise_distances(X=feature_matrix, metric='cosine')
    for feature_matrix in (bigram_cv, trigram_cv, bigram_tf, trigram_tf)
)

# Clustering: K-Means

Set up KMeans for each feature matrix: derive the number of clusters from the unique column names, fit the model to the matrix, and obtain a cluster label for every row.

In [None]:
# Number of clusters requested for each feature matrix: the count of unique
# column (n-gram) names.
# NOTE(review): in the scikit-learn example referenced below, true_k is the
# number of unique ground-truth *labels*, not of feature columns. Confirm that
# one cluster per n-gram column is really intended; KMeans also needs at least
# as many rows as clusters.
true_k_bigram_cv = np.unique(bigram_cv.columns).shape[0]
true_k_trigram_cv = np.unique(trigram_cv.columns).shape[0]

true_k_bigram_tf = np.unique(bigram_tf.columns).shape[0]
# Must be stored as true_k_trigram_tf: it is read by the trigram TF-IDF model
# below, and writing it to true_k_trigram_cv would overwrite the
# CountVectorizer value computed above.
true_k_trigram_tf = np.unique(trigram_tf.columns).shape[0]
# Pulled the concept for true_k from: https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py

# Keep the *fitted estimators* (fit, not fit_predict): the cells that follow
# read `.labels_` and `.cluster_centers_`, which exist only on the estimator.
# fit_predict would return a bare label array without those attributes.
# The per-row cluster assignments are available as `<name>.labels_`.
bigram_cv_kmeans_pred = KMeans(n_clusters=true_k_bigram_cv, random_state=42, max_iter=50).fit(bigram_cv)
trigram_cv_kmeans_pred = KMeans(n_clusters=true_k_trigram_cv, random_state=42, max_iter=50).fit(trigram_cv)

bigram_tf_kmeans_pred = KMeans(n_clusters=true_k_bigram_tf, random_state=42, max_iter=50).fit(bigram_tf)
trigram_tf_kmeans_pred = KMeans(n_clusters=true_k_trigram_tf, random_state=42, max_iter=50).fit(trigram_tf)

Metrics from running KMeans

In [None]:
# Clustering-quality report for each of the four K-Means runs.
# `metrics` is the sklearn.metrics module (`from sklearn import metrics`).
# NOTE(review): the column names of each matrix are passed as the "true"
# labels. These scores expect one ground-truth label per clustered row, so
# unless a matrix has as many columns as rows this will fail; confirm which
# labels should be used as the reference.
kmeans_runs = (
    ("Bigrams - CountVectorizer", bigram_cv, bigram_cv_kmeans_pred),
    ("Trigrams - CountVectorizer", trigram_cv, trigram_cv_kmeans_pred),
    ("Bigrams - TF-IDF Vectorizer", bigram_tf, bigram_tf_kmeans_pred),
    ("Trigrams - TF-IDF Vectorizer", trigram_tf, trigram_tf_kmeans_pred),
)
score_functions = (
    ("Homogeneity", metrics.homogeneity_score),
    ("Completeness", metrics.completeness_score),
    ("V-measure", metrics.v_measure_score),
    ("Adjusted Rand-Index", metrics.adjusted_rand_score),
)

for run_number, (title, features, kmeans_model) in enumerate(kmeans_runs):
    if run_number:
        print()  # blank line separating consecutive reports
    print(title)
    for score_name, score in score_functions:
        value = score(features.columns, kmeans_model.labels_)
        print(f"{score_name}: {value:.3f}")

Top terms per cluster

In [None]:
def _print_top_terms(title, fitted_kmeans, feature_names, n_terms=10):
    """Print the highest-weighted terms of every cluster centroid.

    Args:
        title: Header line printed before the per-cluster listing.
        fitted_kmeans: Fitted estimator exposing ``cluster_centers_``.
        feature_names: Column names, indexable by feature position.
        n_terms: How many top terms to list per cluster.

    Returns:
        The array of feature indices per cluster, ordered from the largest
        centroid weight to the smallest.
    """
    print(title)
    # argsort is ascending; reversing each row puts the largest weights first.
    order_centroids = fitted_kmeans.cluster_centers_.argsort()[:, ::-1]
    # One row per cluster, so iterating the rows needs no separate cluster
    # count variable.
    for cluster_id, term_indices in enumerate(order_centroids):
        top_terms = "".join(
            " %s" % feature_names[ind] for ind in term_indices[:n_terms]
        )
        print("Cluster %d:%s" % (cluster_id, top_terms))
    return order_centroids


bi_cv_order_centroids = _print_top_terms(
    "Top terms: bigrams from CountVectorizer",
    bigram_cv_kmeans_pred,
    bigram_cv.columns,
)

tri_cv_order_centroids = _print_top_terms(
    "Top terms: trigrams from CountVectorizer",
    trigram_cv_kmeans_pred,
    trigram_cv.columns,
)

bi_tf_order_centroids = _print_top_terms(
    "Top terms: bigrams from TF-IDF Vectorizer",
    bigram_tf_kmeans_pred,
    bigram_tf.columns,
)

# This is the trigram model, so the header says "trigrams".
tri_tf_order_centroids = _print_top_terms(
    "Top terms: trigrams from TF-IDF Vectorizer",
    trigram_tf_kmeans_pred,
    trigram_tf.columns,
)

# Topic modeling

## NMF

## LDA

# Summary and Conclusions