In [None]:
import pandas as pd

# Read the training split (a JSON array of records) into a DataFrame.
train_path = "../data/news_train.json"
df_train = pd.read_json(train_path, orient="records")

# First look at the data: column summary, numeric histograms, leading rows.
df_train.info()
df_train.hist()
df_train.head()

In [None]:
# Distinct values of the "label" column, shown as the index of the grouped counts.
df_train.groupby("label").count().index

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 

# Fetch every NLTK resource this cell relies on before using it:
#   - "stopwords" backs stopwords.words("english"),
#   - "punkt" backs word_tokenize,
#   - "wordnet" backs WordNetLemmatizer.lemmatize (without it the first
#     lemmatize call raises LookupError on a fresh environment),
#   - "punkt_tab": recent NLTK releases appear to load this instead of "punkt";
#     requesting it is harmless on older releases.
for resource in ("stopwords", "punkt", "punkt_tab", "wordnet"):
    nltk.download(resource)

w_n_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Normalise each article: lowercase, tokenize, drop stop words and any token
# that is not purely alphabetic, lemmatize what is left, and join the result
# back into one space-separated string.
df_train["text"] = df_train["text"].apply(lambda row: row.lower())
df_train["text"] = df_train["text"].apply(
    lambda row: " ".join(
        w_n_lemmatizer.lemmatize(word)
        for word in word_tokenize(row)
        if word not in stop_words and word.isalpha()
    )
)

df_train.head()

In [None]:
# %pip install spacy

In [None]:
# !python -m spacy download en_core_web_sm

In [None]:
import spacy

# Load spaCy's small English pipeline; `nlp` is applied to every article below.
# NOTE(review): per the spaCy docs the *_sm packages ship without static word
# vectors, so token.vector is presumably a context tensor here — confirm, or
# switch to en_core_web_md/lg if true word embeddings are intended.
nlp = spacy.load('en_core_web_sm')

## Text vectorizing with embedding word vectors

In [None]:
import numpy as np
from tqdm import tqdm

# Width of a single token vector, probed once so that an empty document can be
# represented by a zero vector of the right size.
embedding_dim = nlp("text")[0].vector.shape[0]

# One embedding per article: the mean of its token vectors.
vectors = []

for item in tqdm(df_train["text"].values):
    token_vectors = [word.vector for word in nlp(item)]
    if token_vectors:
        vectors.append(sum(token_vectors) / len(token_vectors))
    else:
        # Preprocessing can reduce an article to "" (only stop words or
        # non-alphabetic tokens). Averaging zero vectors would divide by zero,
        # so keep the row, and its alignment with df_train, as all zeros.
        vectors.append(np.zeros(embedding_dim, dtype=np.float32))

vectors = np.array(vectors)

len(vectors)

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

# Candidate cluster counts: 2 .. maxK - 1.
maxK = 21
step = 1
K = range(2, maxK, step)

# Distortion for a given k = mean Euclidean distance from each document vector
# to its nearest cluster centre.
distortions = []
tbar = tqdm(K)
for k in tbar:
    # Label the bar before fitting so it names the k currently being fitted.
    tbar.set_description("K: " + str(k))
    # Fixed seed: without it every run produces a different curve.
    kmeanModel = KMeans(n_clusters=k, verbose=0, random_state=42)
    kmeanModel.fit(vectors)
    nearest_centre_dist = cdist(vectors, kmeanModel.cluster_centers_, 'euclidean').min(axis=1)
    distortions.append(nearest_centre_dist.mean())

plt.figure()
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')

In [None]:
# %pip install yellowbrick

In [None]:
from yellowbrick.cluster import KElbowVisualizer

# Let yellowbrick fit KMeans for k = 2..19 and locate the elbow of the
# distortion curve. The seed keeps the detected elbow stable between runs.
model = KMeans(random_state=42)
visualizer = KElbowVisualizer(model, k=range(2, 20, 1), metric="distortion")
visualizer.fit(vectors)
visualizer.show()

In [None]:
# Number of clusters the elbow visualizer selected when it was fitted.
visualizer.elbow_value_

In [None]:
# Cluster the training vectors with the k chosen by the elbow visualizer.
optimal_cluster_naumber = visualizer.elbow_value_
if optimal_cluster_naumber is None:
    # NOTE(review): yellowbrick presumably leaves elbow_value_ as None when it
    # cannot locate a knee; fail here with a clear message instead of inside
    # KMeans.
    raise ValueError("KElbowVisualizer found no elbow; choose the number of clusters manually.")

# Fixed seed so cluster ids (and the plots coloured by them) are reproducible.
clusterer = KMeans(n_clusters=optimal_cluster_naumber, random_state=42)
cluster_labels = clusterer.fit_predict(vectors)
len(cluster_labels)

In [None]:
from sklearn.manifold import TSNE

# Project the document vectors to 2-D for plotting. The estimator gets its own
# name so `tsne` only ever refers to the resulting coordinate array, and the
# seed makes the layout reproducible.
# NOTE(review): scikit-learn 1.5 renamed `n_iter` to `max_iter`; keep `n_iter`
# for older releases and switch once the environment is on a newer one.
tsne_model = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=42)
tsne = tsne_model.fit_transform(vectors)

In [None]:
# Store the two t-SNE coordinates next to each article.
df_train["tsne-1d"], df_train["tsne-2d"] = tsne[:, 0], tsne[:, 1]

df_train.head()

In [None]:
# %pip install seaborn

In [None]:
import seaborn as sns

# Attach the K-means assignment so the scatter plot can colour points by cluster.
df_train["cluster"] = cluster_labels

# One hue per cluster, drawn over the t-SNE projection.
cluster_palette = sns.color_palette("hls", optimal_cluster_naumber)
sns.scatterplot(x="tsne-1d", y="tsne-2d", hue="cluster", palette=cluster_palette, data=df_train)

In [None]:
from sklearn.decomposition import PCA

# Linear 2-D projection of the document vectors.
pca_2 = PCA(n_components=2)
pca_2_result = pca_2.fit_transform(vectors)

# Share of the total variance retained by the two components.
print("Cumulative variation for 2 principal components: ", pca_2.explained_variance_ratio_.sum())

In [None]:
# Store the two principal-component coordinates next to each article.
df_train["pca-1d"], df_train["pca-2d"] = pca_2_result[:, 0], pca_2_result[:, 1]

df_train.head()

In [None]:
# Same cluster colouring as the t-SNE plot, drawn over the PCA projection.
pca_palette = sns.color_palette("hls", optimal_cluster_naumber)
sns.scatterplot(x="pca-1d", y="pca-2d", hue="cluster", palette=pca_palette, data=df_train)

In [None]:
def vis_evr(evr, x_min=0, x_max=800, x_step=50, y_min = 0, y_max=1., y_step=0.1):
  """Draw a cumulative explained-variance curve on a new figure with a grid.

  Args:
    evr: Sequence of values to plot; it is drawn against its own index.
    x_min, x_max, x_step: Start, exclusive stop and spacing of the x ticks.
    y_min, y_max, y_step: Start, exclusive stop and spacing of the y ticks.
  """
  fig, ax = plt.subplots()

  # Tick positions come from np.arange, so the upper bound itself is excluded.
  ax.set_xticks(np.arange(x_min, x_max, x_step))
  ax.set_yticks(np.arange(y_min, y_max, y_step))

  ax.plot(evr)
  ax.grid(linestyle='-', linewidth=1)

  ax.set_xlabel('number of components')
  ax.set_ylabel('cumulative explained variance')

In [None]:
# Fit a PCA with no component limit and accumulate the per-component variance
# ratios into a running total.
pca = PCA()
pca.fit(vectors)
evr = pca.explained_variance_ratio_.cumsum()

vis_evr(evr)

In [None]:
# Zoom in on entries 40-99 of the cumulative curve with finer y ticks. The slice
# is plotted against its own index, so x = 0 on this figure is evr[40].
vis_evr(evr[40:100], x_max=60, x_step=5, y_step=0.01)

In [None]:
# Reduce the document vectors to 50 principal components.
pca_n = PCA(n_components=50)
pca_n_result = pca_n.fit_transform(vectors)

# Share of the total variance retained by those 50 components.
print("Cumulative variation for 50 principal components: ", pca_n.explained_variance_ratio_.sum())

In [None]:
# Read the held-out split (a JSON array of records) into a DataFrame.
test_path = "../data/news_test.json"
df_test = pd.read_json(test_path, orient="records")

# Same first look as for the training split.
df_test.info()
df_test.hist()
df_test.head()

In [None]:
# Apply the same normalisation as for the training text: lowercase, tokenize,
# drop stop words and non-alphabetic tokens, lemmatize, re-join.
df_test["text"] = df_test["text"].apply(lambda row: row.lower())
df_test["text"] = df_test["text"].apply(
    lambda row: " ".join(
        w_n_lemmatizer.lemmatize(word)
        for word in word_tokenize(row)
        if word not in stop_words and word.isalpha()
    )
)

# Width of a single token vector, probed once so that an empty document can be
# represented by a zero vector of the right size.
embedding_dim_test = nlp("text")[0].vector.shape[0]

# One embedding per article: the mean of its token vectors.
vectors_test = []

for item in tqdm(df_test["text"].values):
    token_vectors = [word.vector for word in nlp(item)]
    if token_vectors:
        vectors_test.append(sum(token_vectors) / len(token_vectors))
    else:
        # An article reduced to "" by preprocessing has no tokens; averaging
        # would divide by zero, so keep the row (aligned with df_test) as zeros.
        vectors_test.append(np.zeros(embedding_dim_test, dtype=np.float32))

vectors_test = np.array(vectors_test)

# Prediction

In [None]:
# Fit a 4-cluster K-means on the training vectors and use it to assign each
# test article to a cluster. The seed keeps the cluster ids stable between
# runs; unseeded, the numbering changes with every fit.
clusterer_test = KMeans(n_clusters=4, random_state=42)
clusterer_test.fit(vectors)

# Keep each test embedding and its predicted cluster alongside the article.
df_test["vectors"] = list(vectors_test)
df_test["cluster"] = clusterer_test.predict(vectors_test)
df_test.head(10)

In [None]:
# Placeholder column: -1 marks rows whose cluster has not been mapped yet.
df_test["transform_cluster"] = -1
df_test.head(2)

In [None]:
def replace_value(cluster_value, transform_cluster, df):
    """Write ``transform_cluster`` into every row assigned to ``cluster_value``.

    Args:
        cluster_value: Value to look for in ``df["cluster"]``.
        transform_cluster: Value stored in ``df["transform_cluster"]`` for the
            matching rows.
        df: DataFrame holding both columns; it is modified in place.

    Returns:
        None.
    """
    # Select rows on the frame that was passed in (not a notebook-level global)
    # and assign through .loc so the write lands on the frame itself rather
    # than on an array obtained from .values.
    df.loc[df["cluster"] == cluster_value, "transform_cluster"] = transform_cluster

# K-means numbers its clusters arbitrarily, so a hand-written cluster -> label
# table is only valid for the one run it was read from. Derive the table
# instead: each cluster gets the label that is most common among the training
# articles assigned to it. clusterer_test was fitted on the training vectors,
# so its labels_ line up row-for-row with df_train.
cluster_to_label = (
    pd.crosstab(clusterer_test.labels_, df_train["label"].values)
    .idxmax(axis=1)
    .to_dict()
)

for cluster_value, label_value in cluster_to_label.items():
    replace_value(cluster_value, label_value, df_test)

In [None]:
# Check the mapped labels on the first five rows (head() defaults to 5).
df_test.head()

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

# Compare the true labels with the labels derived from the cluster assignment.
y_true = df_test["label"].values
y_pred = df_test["transform_cluster"].values

print("MSE:", mean_squared_error(y_true, y_pred))
print("ACC:", accuracy_score(y_true, y_pred))