In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import nltk
import nltk.corpus
import os
import collections
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/amanshah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/amanshah/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
# get words from the text corpus

from nltk.corpus import PlaintextCorpusReader
wordlists = PlaintextCorpusReader('da/processed', '.*')

da_words = []

for file in wordlists.fileids():
        da_words.append(wordlists.words(file))

In [3]:
# get the rosetta data

data_raw = pd.read_excel("filtered_project_ids_data.xlsx")

In [4]:
# Get main columns and filter rows without headings
data = data_raw[["ProjectID", "Procestrin", "Overskrift", "Beskrivelse"]]

In [5]:
# remove danish words, stopwords, and extra words and prep data for use

def filter_danish(text):
       return " ".join(w.lower() for w in nltk.wordpunct_tokenize(text) 
                       if w.casefold() not in da_words and w.isalpha() and w != 'nan')

# combine heading and description
title_strings = data['Overskrift'].astype('str')
description_strings = data['Beskrivelse'].astype('str')
data['Combined'] = title_strings + " \n\n" + description_strings

data['Combined'] = data['Combined'].apply(filter_danish)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Combined'] = title_strings + " \n\n" + description_strings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Combined'] = data['Combined'].apply(filter_danish)


In [6]:
# tokenize filtered text, removing stopwords

def tokenize_text(text, tokenizer):
    tokens = tokenizer(text)  # Get tokens from text
    tokens = [t for t in tokens if not t in stop_words]  # Remove stopwords
    tokens = [t for t in tokens if len(t) > 1]  # Remove short tokens
    return tokens

data['Tokens'] = data['Combined'].map(lambda x: tokenize_text(x, nltk.word_tokenize))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Tokens'] = data['Combined'].map(lambda x: tokenize_text(x, nltk.word_tokenize))


In [7]:
# filter non-nouns from the tokens

tags = data['Tokens'].map(lambda x: nltk.pos_tag(x))

tag_rejects = ['CC', 'CD', 'DT', 'EX', 'IN', 'LS', 'MD', 'PDT', 'POS', 'PRP', 'PRP$',
               'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 
               'VBZ', 'WDT', 'WP', 'WP$', 'WRB']

# keep foreign words, adjectives, nouns, and interjects
tag_keeps = ['FW', 'JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS', 'UH'] 

# filter the tags
def filter_non_nouns(sentence):
        new_sentence = []
        for tuple in sentence:
                if(tuple[1] in tag_keeps):
                        new_sentence.append(tuple)
        return new_sentence

filtered_tags = tags.map(lambda x: filter_non_nouns(x))

# get the tokens back
def tokens_from_tags(sentence):
        new_sentence = []
        for tuple in sentence:
                new_sentence.append(tuple[0])
        return new_sentence

data['Tokens'] = filtered_tags.map(lambda x: tokens_from_tags(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Tokens'] = filtered_tags.map(lambda x: tokens_from_tags(x))


In [11]:
# lemmatize the tokens, remove super short or long results

wnl = WordNetLemmatizer()

def lemmatize(sentence):
        new_sentence = []
        for token in sentence:
                lemma = wnl.lemmatize(token)

                # remove super short or super long words
                if (len(lemma) >= 2 and len(lemma) <= 15):
                        new_sentence.append(wnl.lemmatize(lemma))
        return new_sentence

data['Tokens'] = data['Tokens'].map(lambda x: lemmatize(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Tokens'] = data['Tokens'].map(lambda x: lemmatize(x))


In [13]:
# generate bigrams, replace as token options

from gensim.models.phrases import Phrases
 
# get tokens as a list
words = []
for sentence in data['Tokens']:
        for token in sentence:
                words.append(token)
               
# Bigram using Phraser Model              
bigram_model = Phrases(data['Tokens'], min_count = 3, threshold = 10)

data['Tokens'] = data['Tokens'].map(lambda x: bigram_model[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Tokens'] = data['Tokens'].map(lambda x: bigram_model[x])


## Exploration of the Data

In [15]:
display(data)

Unnamed: 0,ProjectID,Procestrin,Overskrift,Beskrivelse,Combined,Tokens
0,39,1,Combi-material-design,"Use a combination of materials, such as brick,...",combi material design use a combination of mat...,"[combi, material, design, use, combination, ma..."
1,39,1,Sterilization with UV light,UVC light can be used to quickly disinfect and...,sterilization with uv light uvc light can be u...,"[sterilization, uv_light, uvc, light, disinfec..."
2,39,1,seperation by oscillation,Protein powder is introduced to a plug flow tu...,seperation by oscillation protein powder is in...,"[seperation, oscillation, protein_powder, plug..."
3,32,1,02 - polar bear,wrap it up in insulating material/fat,polar bear wrap it up in insulating material fat,"[polar, bear, wrap, material, fat]"
4,32,1,03 - bee,bee wax is structured - some kind of stuctured...,bee bee wax is structured some kind of stuctur...,"[bee_bee, wax, kind, holed, material, fix, pro..."
...,...,...,...,...,...,...
1715,23,3,Smart clothes,"""Smart clothes"" that collect data on the users...",smart clothes smart clothes that collect data ...,"[smart_clothes, smart_clothes, data, user, fas..."
1716,23,3,Virtual fitting room,"""Virtual fitting room"" in which people can try...",virtual fitting room virtual fitting room in w...,"[virtual_fitting, room, virtual_fitting, room,..."
1717,23,3,You choose what you wear based on advertisemen...,"When browsing Social Media and outlets, you ca...",you choose what you wear based on advertisemen...,"[choose_wear, advertisement_instagram, social_..."
1718,23,3,Study online shopping trends,"An easy way of trying to predict fashion, is l...",study online shopping trends an easy way of tr...,"[study, online, trend, easy, way_predict, fash..."


In [16]:
project_ids = data['ProjectID'].unique()
print(project_ids)

[39 32 45 19  2 43 25  7 23 47 46 13]


In [17]:
data['Procestrin'].unique()

array([1, 2, 4, 3])

## General TF-IDF

In [26]:
tfidf_vectorizer = TfidfVectorizer(input='content', lowercase=False)

In [27]:
# get data for each project

data_by_proj = {}
for id in project_ids:
        data_by_proj[id] = data.loc[data['ProjectID'] == id, ['Procestrin', 'Tokens']]

In [85]:
data_by_proj[43]

Unnamed: 0,Procestrin,Tokens
56,1,"[drone, drone, air, detect, threat, tank, resp..."
74,1,"[spoiler, tank, spoiler, tank, racecar]"
76,1,"[remote, crew, tank, tank_compact, agile, armo..."
306,1,"[electric, engine, diesel, big, heavy, demand,..."
307,1,"[gas, engine, diesel, big, heavy, demand, lot,..."
...,...,...
1530,3,"[tank, leg, mechanical, leg, agile, many_terra..."
1531,3,"[autonomous, deep_learning, computer_vision, s..."
1532,3,"[omnidirectional, track, track, direction, dri..."
1649,4,[]


In [86]:
data_by_proj[13]

Unnamed: 0,Procestrin,Tokens
707,1,"[electrical, pizzaoven, energy, electricity]"
716,1,"[isolation, enclosure, door, gate, heat, child]"
721,1,"[round, top, better, sharp, corner, stress, he..."
746,1,"[stainless, steel, holster, create, stainless,..."
780,2,"[create, simple, shape, pizza, dirt, debris, a..."
...,...,...
1365,3,"[temperature, monitoring, use, smart, thermome..."
1367,3,"[cabinet, cabinet, oven, base, storage, cookin..."
1619,3,"[use, pumice, peace, cement, stabilized, pumic..."
1688,4,[]


#### Unable to perform TFIDF on Projects 13 and 43 as their submissions for Stage 4 included stopwords and verbs, no nouns.

In [67]:
# function to get 10 most important terms per stage, per project

def get_ten_most_freq(id, stage):
        tfidf_vectors = tfidf_vectorizer.fit_transform(data_by_proj[id].loc[data_by_proj[id]['Procestrin'] == stage, 'Tokens'].astype('str'))
        tfidf_df = pd.DataFrame(tfidf_vectors.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
        tfidf_df = tfidf_df.stack().reset_index()
        tfidf_df = tfidf_df.rename(columns={0:'tfidf', 'level_0': 'entry','level_1': 'term'})
        return tfidf_df.sort_values(by=['tfidf'], ascending=False).head(10).reset_index()

In [54]:
# get most important terms per stage for a project

def get_important_freq(id):
        stage_1 = get_ten_most_freq(id, 1)
        stage_2 = get_ten_most_freq(id, 2)
        stage_3 = get_ten_most_freq(id, 3)
        stage_4 = get_ten_most_freq(id, 4)

        final_df = pd.DataFrame()
        final_df['stage_1'] = stage_1['term']
        final_df['stage_1_freq'] = stage_1['tfidf']

        final_df['stage_2'] = stage_2['term']
        final_df['stage_2_freq'] = stage_2['tfidf']

        final_df['stage_3'] = stage_3['term']
        final_df['stage_3_freq'] = stage_3['tfidf']

        final_df['stage_4'] = stage_4['term']
        final_df['stage_4_freq'] = stage_4['tfidf']
        return final_df

In [69]:
get_important_freq(39)

Unnamed: 0,stage_1,stage_1_freq,stage_2,stage_2_freq,stage_3,stage_3_freq,stage_4,stage_4_freq
0,uv_light,1.0,antimicrobial_enzyme,0.726634,tube,0.475753,tube,0.471405
1,magnetic_field,0.908697,chemical,0.692454,droplet,0.462625,small_well,0.353553
2,microoganisms,0.816497,sound,0.676158,generate,0.462625,surface_many,0.353553
3,antimicrobial_enzyme,0.768896,sonication,0.654344,microbial_cell,0.462625,powder_spread,0.353553
4,chemical,0.747575,organism_bioluminescent,0.640542,system,0.462625,growth_nutrient,0.353553
5,vibratory,0.739678,current,0.552763,temperature,0.405347,condition_bacterial,0.353553
6,radiation,0.721761,filter_antibody,0.544367,high_pressure,0.405347,colour,0.353553
7,bed,0.714966,nucleic,0.50892,protein_powder,0.379359,well_good,0.353553
8,steam,0.713494,acid,0.50892,pathogen,0.363068,seperation_protein,0.353553
9,temperature,0.708585,droplet,0.495221,specific,0.363068,plug_flow,0.235702


In [70]:
get_important_freq(32)

Unnamed: 0,stage_1,stage_1_freq,stage_2,stage_2_freq,stage_3,stage_3_freq,stage_4,stage_4_freq
0,cheetos,1.0,sand,0.847526,pack,0.755929,ceal_gland,0.447214
1,surface,0.946867,case,0.801679,cotton,0.706051,metal_package,0.447214
2,stone,0.894427,cotton,0.715081,egg,0.695445,material_thereby,0.447214
3,sand,0.878105,pack,0.708803,cheetos,0.625662,magnetic,0.447214
4,huge,0.875343,egg,0.70818,bumperballz,0.580624,chock,0.447214
5,paper,0.834596,sack,0.703684,air,0.580624,chock,0.0
6,quit,0.82502,tape,0.683674,wax,0.559775,material_thereby,0.0
7,air,0.814832,cheetos,0.645721,magnetic,0.538138,magnetic,0.0
8,grass,0.812158,bumperballz,0.618894,foam_flower,0.513914,chock,0.0
9,quit,0.781651,print,0.585502,fake_flower,0.513914,ceal_gland,0.0


In [71]:
get_important_freq(45)

Unnamed: 0,stage_1,stage_1_freq,stage_2,stage_2_freq,stage_3,stage_3_freq,stage_4,stage_4_freq
0,spray,1.0,dissolvalbe_label,1.0,dissolvalbe_label,1.0,shot,0.442326
1,bamboo_tube,1.0,bamboo_tube,1.0,eppendorf_tube,0.733666,capsule,0.294884
2,shot,1.0,shot_pill,1.0,soft_plastic,0.651787,idea,0.294884
3,cigarette,1.0,bamboo,0.896836,bamboo,0.601062,biodegradable,0.294884
4,part,1.0,marketing,0.843988,magnet,0.55193,normal,0.147442
5,shot_pill,1.0,icecream,0.805223,container_refillable,0.549748,contain,0.147442
6,dissolvalbe_label,1.0,camel_bag,0.796514,slush_ice,0.549748,encapsulates,0.147442
7,icecream,0.939719,asma_spray,0.779906,slush_ice,0.549748,film,0.147442
8,bamboo,0.913798,plastic,0.725852,system_slushice,0.549748,fish_pill,0.147442
9,asma_spray,0.905596,separation,0.707107,container_refillable,0.549748,good,0.147442


In [72]:
get_important_freq(19)

Unnamed: 0,stage_1,stage_1_freq,stage_2,stage_2_freq,stage_3,stage_3_freq,stage_4,stage_4_freq
0,accelerometer,1.0,clip_pedal,0.718768,glass,0.67714,side,0.408248
1,energy_storage,1.0,frame,0.711382,ceramic,0.591175,wheel_cover,0.408248
2,tyre,1.0,mudguard,0.610008,image,0.569272,monitor_efficiency,0.316228
3,chassis,1.0,sail,0.607891,rider_position,0.569272,efficiency,0.316228
4,chain,1.0,ceramic,0.602877,bottle,0.553167,residual,0.316228
5,brake,1.0,wheel_cover,0.591457,handlebar,0.526923,concept,0.316228
6,intelligent,1.0,vibration,0.589621,bearing,0.47656,predictive_maintenance,0.316228
7,bearing,1.0,rider_position,0.571737,sensor,0.47656,efficiency_noise,0.316228
8,rubber,0.84169,image,0.571737,device,0.47656,noise_sensor,0.316228
9,paint,0.84169,chain,0.566358,water,0.470242,higher,0.316228


In [73]:
get_important_freq(2)

Unnamed: 0,stage_1,stage_1_freq,stage_2,stage_2_freq,stage_3,stage_3_freq,stage_4,stage_4_freq
0,gas_chromatography,1.0,fail,1.0,newspaper,0.635808,biosensor,0.319801
1,mass_spectrometry,1.0,item,0.787136,madress,0.510897,antibody,0.213201
2,heat,0.919582,delete,0.746061,charcoal,0.501259,bind_specific,0.213201
3,dog,0.894427,charcoal,0.680712,madress,0.501259,specific,0.213201
4,device_glove,0.894427,laser,0.668595,protective_layer,0.484308,molecule,0.213201
5,water,0.894427,fail,0.665877,differet,0.448215,measurable_signal,0.213201
6,strechyness,0.889119,core,0.656573,temperature,0.439444,feces,0.213201
7,density,0.835777,powder,0.643308,sample,0.421623,enzyme,0.213201
8,puff,0.834507,matress,0.641097,mattress,0.411562,dna,0.213201
9,vacuum_cleaner,0.815553,newspaper,0.620879,cmos,0.409318,device,0.213201


In [75]:
get_important_freq(25)

Unnamed: 0,stage_1,stage_1_freq,stage_2,stage_2_freq,stage_3,stage_3_freq,stage_4,stage_4_freq
0,tile,1.0,steel,0.740865,duplicate,0.763965,wax,0.675053
1,weed,1.0,steel,0.740865,wax,0.682523,wax,0.675053
2,fungi,0.923947,brush_weed,0.671654,ultrasound_brush,0.645257,bee,0.225018
3,harm,0.923947,brush_weed,0.671654,sand,0.572389,cosmetic,0.225018
4,rabbit,0.923947,water,0.653872,toothbrush,0.529328,candle,0.225018
5,system,0.911921,obstacle,0.620219,water,0.52144,rubber,0.225018
6,blow,0.901668,bacteria,0.604189,qr_code,0.469796,vegetable,0.225018
7,wax,0.863626,marker,0.602624,work,0.369018,natural,0.225018
8,gas,0.839948,animal,0.558511,robot,0.365795,bee,0.225018
9,battery,0.830893,toothbrush,0.530298,zone,0.317438,natural,0.225018


In [76]:
get_important_freq(7)

Unnamed: 0,stage_1,stage_1_freq,stage_2,stage_2_freq,stage_3,stage_3_freq,stage_4,stage_4_freq
0,pool,0.888587,pool,0.831521,screen,0.745461,patient,0.408248
1,banana,0.865074,robot,0.801071,joystick,0.704334,patient_caretaker,0.408248
2,dog,0.844563,screen,0.781264,different_type,0.682833,caretaker,0.204124
3,robot,0.822078,blood_circulation,0.774979,fakir_stick,0.661397,sensor,0.204124
4,joystick,0.813515,joystick,0.743639,sea_anemone,0.632456,position,0.204124
5,pancake,0.812377,different_type,0.728279,bedtop,0.632456,pressure,0.204124
6,vacuum,0.804793,bed,0.728181,uv_light,0.58306,release_pressure,0.204124
7,screen,0.798181,sheet,0.697859,stem_cell,0.56356,see,0.204124
8,gravity,0.795786,constant,0.671226,bed_swing,0.516835,sign,0.204124
9,monkey,0.777376,float,0.642897,boat,0.516835,connects,0.204124


In [77]:
get_important_freq(23)

Unnamed: 0,stage_1,stage_1_freq,stage_2,stage_2_freq,stage_3,stage_3_freq,stage_4,stage_4_freq
0,sunflower,1.0,influencers,0.79447,mind,0.640549,mind,0.57735
1,eliminate,1.0,different,0.766277,combobulator_machine,0.640549,fashion,0.57735
2,personalize,1.0,hashtags,0.76195,democracy,0.636203,combobulator_machine,0.57735
3,giraffe,1.0,color,0.682338,smart_clothes,0.586223,outfit_way,0.5
4,spoon,1.0,mind,0.68089,room,0.566927,communism_longer,0.5
5,modify,1.0,combobulator_machine,0.68089,virtual_fitting,0.566927,company_waste,0.5
6,pinecone,1.0,democracy,0.676425,piece,0.539897,fashion_everybody,0.5
7,television,1.0,smart_clothes,0.624718,communism_longer,0.5,outlet_pick,0.447214
8,stapler,1.0,celebrity,0.604817,company_waste,0.5,post_advertisement,0.447214
9,rearrange,1.0,agency,0.584505,fashion_everybody,0.5,social_medium,0.447214


In [78]:
get_important_freq(47)

Unnamed: 0,stage_1,stage_1_freq,stage_2,stage_2_freq,stage_3,stage_3_freq,stage_4,stage_4_freq
0,jewellery,1.0,picture,0.922604,paint_brick,0.796685,stress_use,0.316228
1,picture,0.91344,paint_brick,0.797831,picture,0.628161,stress_power,0.316228
2,waldo,0.913136,game,0.79326,magnetic_levitation,0.614747,small_light,0.316228
3,magnetics,0.913136,model_reaction,0.691285,brick,0.604395,simple_electronic,0.316228
4,movement,0.891847,lego,0.679765,lego,0.562839,piezoelectric_material,0.316228
5,air,0.831818,mine,0.674939,organic,0.544833,lego_brick,0.316228
6,speed,0.827809,magnetic_levitation,0.633233,reaction_capability,0.544833,generate_current,0.316228
7,paint_brick,0.818814,current,0.624069,color_water,0.518517,electricity_mechanical,0.316228
8,mirror,0.818256,puzzle,0.571353,different_environment,0.518517,due_mechanical,0.316228
9,ceiling,0.771954,model,0.561231,fluid,0.503267,component,0.316228


In [79]:
get_important_freq(46)

Unnamed: 0,stage_1,stage_1_freq,stage_2,stage_2_freq,stage_3,stage_3_freq,stage_4,stage_4_freq
0,volunteer,1.0,mascot,0.864087,festival,0.649737,trampoline_people,0.620116
1,sponsor,1.0,people,0.744582,jump,0.623113,jump,0.620116
2,glow,0.922524,tower,0.707107,trampoline_people,0.623113,advertisement_screen,0.57735
3,mascot,0.910163,advertising,0.707107,stage_concert,0.604392,system_charity,0.57735
4,transparent,0.851013,impossible_system,0.707107,advertisement_screen,0.604392,stage_concert,0.57735
5,game,0.813066,money,0.707107,build_machine,0.57735,system_charity,0.57735
6,costume,0.787315,festival,0.646334,banner_alert,0.57735,advertisement_screen,0.57735
7,advertising,0.774277,trampoline_people,0.614038,reward_form,0.57735,stage_concert,0.57735
8,tower,0.757502,jump,0.614038,hire_plane,0.57735,discount_bottle,0.57735
9,ask,0.736235,advertisement_screen,0.598945,attract_attention,0.57735,attract_attention,0.57735
