In [None]:
# Install the spaCy language model for Danish (if not already installed).
# Uncomment the line below to run the download from within the notebook.

#!python -m spacy download 'da_core_news_sm'

In [None]:
# Load packages

import os
from os.path import join
import re
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

import plotnine
# Default figure size (width, height) for the plotnine plots in this notebook
plotnine.options.figure_size = (12, 10)

import spacy

# Danish spaCy pipeline; `nlp` is used by the custom tokenizer defined later in the notebook
nlp = spacy.load('da_core_news_sm')

In [None]:
# Load data

data_p = join('/work', 'teaching-materials', 'data', 'polparties')


def _load_party_texts(party, filenames, base_dir=data_p):
    """Read the ``.txt`` files of one party into a list of records.

    Parameters
    ----------
    party : str
        Name of the sub-directory of ``base_dir`` that holds the party's
        files; also stored as the ``'party'`` value of every record.
    filenames : iterable of str
        Directory entries to consider. Entries that do not end in ``.txt``
        are skipped.
    base_dir : str
        Directory containing one sub-directory per party.

    Returns
    -------
    list of dict
        One dict with the keys ``'name'``, ``'party'`` and ``'text'`` per
        text file, in the order of ``filenames``.
    """
    records = []
    for filename in filenames:
        if not filename.endswith('.txt'):
            continue
        # The context manager closes the file handle as soon as it is read.
        # The encoding is explicit so that Danish characters decode the same
        # way on every platform -- assumes the corpus is UTF-8, TODO confirm.
        with open(join(base_dir, party, filename), encoding='utf-8') as handle:
            records.append({'name': filename, 'party': party, 'text': handle.read()})
    return records


# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore everything fitted on these rows) stable between runs.

# sf texts
sf_files = sorted(os.listdir(join(data_p, 'sf')))
sf = _load_party_texts('sf', sf_files)

# konservative texts
cons_files = sorted(os.listdir(join(data_p, 'konservative')))
cons = _load_party_texts('konservative', cons_files)

# df texts
df_files = sorted(os.listdir(join(data_p, 'df')))
df = _load_party_texts('df', df_files)

# combined
poltext_data = sf + cons + df

# just texts
poltexts = [poltext['text'] for poltext in poltext_data]

# as dataframe
poltext_df = pd.DataFrame.from_records(poltext_data)

In [None]:
# Dictionary structure: show the first record
poltext_data[0]

In [None]:
# Text only: show the text of the first record
poltexts[0]

In [None]:
# Data frame: show the first rows
poltext_df.head()

## Klyngeanalyse af tekster: CountVectorizer (rå tællinger af ord)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw word counts; min_df / max_df are given as proportions of documents
vectorizer = CountVectorizer(min_df=0.05, max_df=0.9)
transformed_documents = vectorizer.fit_transform(poltexts)  # fit on the texts and transform them

# Dense array version of the document-term matrix
transformed_documents_as_array = transformed_documents.toarray()

# Data frame with one column per vocabulary term
vocabulary_terms = vectorizer.get_feature_names_out()
count_df = pd.DataFrame(data=transformed_documents_as_array, columns=vocabulary_terms)

In [None]:
from sklearn.cluster import KMeans

# n_init is pinned explicitly: scikit-learn 1.4 changed its default from 10
# to 'auto' (with a FutureWarning in 1.2/1.3), so relying on the default gives
# different clusters on different versions even with a fixed random_state.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=142)

# Fit on the raw-count matrix and get one cluster label per document
identified_clusters = kmeans.fit_predict(count_df)
identified_clusters

In [None]:
# Store the cluster labels as strings so that they are treated as categorical
countvec_labels = pd.Series(identified_clusters, index=poltext_df.index)
poltext_df['cluster_countvec'] = countvec_labels.astype('str')

In [None]:
# Share of each party's texts that fall in each cluster
texts_per_party_and_cluster = poltext_df.groupby(['party', 'cluster_countvec']).size()
texts_per_party = poltext_df.groupby(['party']).size()
texts_per_party_and_cluster / texts_per_party

In [None]:
from plotnine import ggplot, aes, geom_bar

# One bar per party, filled by cluster; position='fill' scales every bar to the same height
countvec_mapping = aes(x='party', group='cluster_countvec', fill='cluster_countvec')
ggplot(data=poltext_df, mapping=countvec_mapping) + geom_bar(position='fill')

## Klyngeanalyse ud fra TfIdf vectorizer (Tf-idf vægtning af ord)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Tf-idf weighted word counts; min_df / max_df are given as proportions of documents
vectorizer = TfidfVectorizer(min_df=0.05, max_df=0.9)
transformed_documents = vectorizer.fit_transform(poltexts)  # fit on the texts and transform them

# Dense array version of the document-term matrix
transformed_documents_as_array = transformed_documents.toarray()

# Data frame with one column per vocabulary term
vocabulary_terms = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=transformed_documents_as_array, columns=vocabulary_terms)

In [None]:
from sklearn.cluster import KMeans

# n_init is pinned explicitly: scikit-learn 1.4 changed its default from 10
# to 'auto' (with a FutureWarning in 1.2/1.3), so relying on the default gives
# different clusters on different versions even with a fixed random_state.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=142)

# Fit on the tf-idf matrix and get one cluster label per document
identified_clusters = kmeans.fit_predict(tfidf_df)
identified_clusters

In [None]:
# Store the cluster labels as strings so that they are treated as categorical
tfidf_labels = pd.Series(identified_clusters, index=poltext_df.index)
poltext_df['cluster_tfidfvec'] = tfidf_labels.astype('str')

In [None]:
# Share of each party's texts that fall in each cluster
texts_per_party_and_cluster = poltext_df.groupby(['party', 'cluster_tfidfvec']).size()
texts_per_party = poltext_df.groupby(['party']).size()
texts_per_party_and_cluster / texts_per_party

In [None]:
import plotnine
from plotnine import ggplot, aes, geom_bar

plotnine.options.figure_size = (12, 10)

# One bar per party, filled by cluster; position='fill' scales every bar to the same height
tfidf_mapping = aes(x='party', group='cluster_tfidfvec', fill='cluster_tfidfvec')
ggplot(data=poltext_df, mapping=tfidf_mapping) + geom_bar(position='fill')

## Klyngeanalyse ud fra egen tokenizer og TfIdf vectorizer

In [None]:
# Build tokenizer

# Stop words: the predefined list of the loaded spaCy pipeline plus
# context-specific additions. Built once as a frozenset so the list is not
# re-created for every document and each membership test is a hash lookup.
_CUSTOM_STOPS = frozenset({"del"})
_STOP_WORDS = frozenset(nlp.Defaults.stop_words) | _CUSTOM_STOPS

# POS tags to keep
_KEEP_POS = frozenset({'PROPN', 'ADJ', 'NOUN'})

# Lemmas shorter than this many characters are dropped
_MIN_LEMMA_LEN = 3


def tokenizer(text):
    """Tokenize ``text`` into lemmas of proper nouns, adjectives and nouns.

    The text is run through the spaCy pipeline ``nlp``. A word is kept when
    its lemma has at least three characters, its POS tag is one of
    ``PROPN``, ``ADJ`` or ``NOUN``, and its lemma is not a stop word.

    Parameters
    ----------
    text : str
        The document to tokenize.

    Returns
    -------
    list of str
        The lemmas of the kept words, in document order.
    """
    return [
        word.lemma_
        for word in nlp(text)
        if len(word.lemma_) >= _MIN_LEMMA_LEN
        and word.pos_ in _KEEP_POS
        # The lemma is lowercased for the comparison so that a capitalised
        # lemma (e.g. "Del") is still recognised as a stop word.
        and word.lemma_.lower() not in _STOP_WORDS
    ]

In [None]:
# Vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

# token_pattern=None: the default regex pattern is ignored whenever a custom
# tokenizer is supplied, and scikit-learn emits a UserWarning about the unused
# parameter unless it is disabled explicitly.
# NOTE(review): with the default lowercase=True each document is lowercased
# before it is handed to the tokenizer.
vectorizer = TfidfVectorizer(min_df=0.05, max_df=0.9, tokenizer=tokenizer, token_pattern=None)
transformed_documents = vectorizer.fit_transform(poltexts)  # fit on the texts and transform them

# Dense array version of the document-term matrix
transformed_documents_as_array = transformed_documents.toarray()

# Data frame with one column per vocabulary term
tfidf_tk_df = pd.DataFrame(transformed_documents_as_array, columns=vectorizer.get_feature_names_out())

In [None]:
from sklearn.cluster import KMeans

# n_init is pinned explicitly: scikit-learn 1.4 changed its default from 10
# to 'auto' (with a FutureWarning in 1.2/1.3), so relying on the default gives
# different clusters on different versions even with a fixed random_state.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=142)

# Fit on the tokenizer-based tf-idf matrix and get one cluster label per document
identified_clusters = kmeans.fit_predict(tfidf_tk_df)
identified_clusters

In [None]:
# Store the cluster labels as strings so that they are treated as categorical
tfidf_tk_labels = pd.Series(identified_clusters, index=poltext_df.index)
poltext_df['cluster_tfidf_tk'] = tfidf_tk_labels.astype('str')

In [None]:
# Share of each party's texts that fall in each cluster
texts_per_party_and_cluster = poltext_df.groupby(['party', 'cluster_tfidf_tk']).size()
texts_per_party = poltext_df.groupby(['party']).size()
texts_per_party_and_cluster / texts_per_party

In [None]:
import plotnine
from plotnine import ggplot, aes, geom_bar

plotnine.options.figure_size = (12, 10)

# One bar per party, filled by cluster; position='fill' scales every bar to the same height
tfidf_tk_mapping = aes(x='party', group='cluster_tfidf_tk', fill='cluster_tfidf_tk')
ggplot(data=poltext_df, mapping=tfidf_tk_mapping) + geom_bar(position='fill')

## PCA på tekst

In [None]:
from sklearn.decomposition import PCA

# Reduce the tokenizer-based tf-idf matrix to two principal components
pca = PCA(n_components=2, random_state=142)
pca_vecs = pca.fit_transform(tfidf_tk_df)

# First and second component score of every document
pca1, pca2 = pca_vecs[:, 0], pca_vecs[:, 1]

# Attach the scores to the text data frame for plotting
poltext_df['pca1'] = pca1
poltext_df['pca2'] = pca2

In [None]:
from plotnine import ggplot, aes, geom_point

# Documents in the plane of the two components: colour = party, shape = cluster
pca_mapping = aes(x='pca1', y='pca2', shape='cluster_tfidf_tk', colour='party')
ggplot(data=poltext_df, mapping=pca_mapping) + geom_point()