In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import nltk
import nltk.corpus
from gensim.models import Word2Vec
import os
from wordcloud import WordCloud
import collections
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer


stop_words = set(stopwords.words("english"))

In [27]:
# get words from the text corpus

from nltk.corpus import PlaintextCorpusReader
wordlists = PlaintextCorpusReader('da/processed', '.*')

da_words = []

for file in wordlists.fileids():
        da_words.append(wordlists.words(file))

In [28]:
# get the rosetta data

data_raw = pd.read_excel("filtered_project_ids_data.xlsx")

In [29]:
# Get main columns and filter rows without headings
data = data_raw[["ProjectID", "Procestrin", "Overskrift", "Beskrivelse"]]

In [43]:
# remove danish words, stopwords, and extra words and prep data for use

filter_words = ['able','also','and','amount','around','away','better','carry','catch',
                'certain','close','correct','contact','could','create','different',
                'dont','easier','easy','etc','fast','find','friendly','generate',
                'get','go','good','help','idea','instead','like','nice','made','make',
                'may','maybe','might','must','need','new','non','old','one','picture',
                'possible','put','random','reduce','revenue','secondary','send',
                'small','something','specific','strong','stuff','take','things',
                'use','used','using','without','word','would']

def filter_danish(text):
       filter = " ".join(w for w in nltk.word_tokenize(text) if w.casefold() not in da_words and 
                       w.casefold() not in filter_words and w.isalpha())
       return filter

# combine heading and description
title_strings = data['Overskrift'].astype('str')
description_strings = data['Beskrivelse'].astype('str')
data['Combined'] = title_strings + " \n\n" + description_strings

data['Combined'] = data['Combined'].apply(filter_danish)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Combined'] = title_strings + " \n\n" + description_strings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Combined'] = data['Combined'].apply(filter_danish)


In [None]:
# tokenize filtered text, removing stopwords

def tokenize_text(text, tokenizer):
    tokens = tokenizer(text)  # Get tokens from text
    tokens = [t for t in tokens if not t in stop_words]  # Remove stopwords
    tokens = [t for t in tokens if len(t) > 1]  # Remove short tokens
    return tokens

data['Tokens'] = data['Combined'].map(lambda x: tokenize_text(x, nltk.word_tokenize))

## Exploration of the Data

In [57]:
project_ids = data['ProjectID'].unique()
print(project_ids)

[39 32 45 19  2 43 25  7 23 47 46 13]


In [58]:
data['Procestrin'].unique()

array([1, 2, 4, 3])

## General TF-IDF

In [62]:
tfidf_vectorizer = TfidfVectorizer(input='content', stop_words='english', lowercase=False)

In [70]:
tfidf_vectors = {key: [] for key in project_ids}

for id in project_ids:
        tfidf_vectors[id].append(tfidf_vectorizer.fit_transform(data.loc[data['ProjectID'] == id, 'Tokens'].astype('str')))

In [48]:
tfidf_df = pd.DataFrame(tfidf_vectors.toarray(), index=data.index, columns=tfidf_vectorizer.get_feature_names_out())

In [49]:
display(tfidf_df)

Unnamed: 0,ADP,AI,ALL,AT,ATP,ATPase,About,Absorb,Accelerometer,According,...,years,yoghurt,yogurts,yor,youth,zapping,zipper,zone,zones,zwift
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1718,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
