In [2]:
# Import packages in alphabetical order
import pandas as pd
import tiktoken
import os
import openai
import duckdb
import owlready2
import sklearn

import gensim
import matplotlib.pyplot as plt
import nltk
import pickle
import re
import spacy
import string

# Import packages in alphabetical order to avoid duplicates
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from transformers import BertModel, BertTokenizer
from wordcloud import WordCloud

In [2]:
#Load duckdb
con = duckdb.connect('/home/roland/Projects/timothy/Master-Thesis/isrecon.duckdb', read_only = True)

In [3]:
#Load sentences from duckdb
sentences_df = con.execute("""
                        SELECT article_id, sentence_id, sentence
                        FROM sentences""").fetchdf()
sentences_df.head()

Unnamed: 0,article_id,sentence_id,sentence
0,1,1_2_21,Examining interdependence between product user...
1,1,1_22_48,Abstract. Firm - sponsored online user communi...
2,1,1_48_65,Product users and host firm employees comprise...
3,1,1_65_95,The online user community provides a forum whe...
4,1,1_95_125,Extant research on online user communities has...


In [4]:
# Group by 'article_id' and concatenate 'sentence'
grouped_sentences = sentences_df.groupby('article_id')['sentence'].apply(' '.join).reset_index()

In [5]:
#Preview grouped sentences
grouped_sentences.head()

Unnamed: 0,article_id,sentence
0,1,Examining interdependence between product user...
1,2,Information management as an enabler of knowle...
2,3,Action Research and Critical Rationalisationis...
3,4,Integration of Supply Chain Execution in B2B M...
4,5,Association for Information Systems AIS Electr...


In [6]:
#Preview the whole first row
print(grouped_sentences.iloc[0]['sentence'])

Examining interdependence between product users users and employees in online user communities : The role of employee - generated content Abstract. Firm - sponsored online user communities have become product innovation and support hubs of strategic importance to firms . Product users and host firm employees comprise the participants of firm - sponsored online user communities . The online user community provides a forum wherein the product users users and firm employees discuss questions , problems or issues resulting from the use of host firms ' products . Extant research on online user communities has largely focused on either product users users or employees and has examined the various dynamics that ensue from each entity 's community participation . This paper seeks to investigate the interdependence between the two entities in the communities and , in particular , how product users users ' reading of employee - generated content influences subsequent knowledge contribution by pr

In [7]:
#Import text processor
def text_processing(df, col):
    # Make sure to handle NaN values by converting them to an empty string
    temp_df = df[col].fillna('')
    # 1. Convert to lower case
    temp_df = temp_df.apply(lambda x: x.lower() if isinstance(x, str) else x)
    # 2. Remove punctuation
    temp_df = temp_df.apply(lambda x: re.sub('[^a-zA-Z]', ' ', x) if isinstance(x, str) else x)
    # 3. Remove special characters and digits
    temp_df = temp_df.apply(lambda x: re.sub("(\\d|\\W)+", " ", x) if isinstance(x, str) else x)
    # 4. Remove single-letter words
    temp_df = temp_df.apply(lambda x: re.sub(r"\b[a-zA-Z]\b", "", x) if isinstance(x, str) else x)
    # 5. Replace multiple spaces with a single space
    temp_df = temp_df.apply(lambda x: re.sub(r"\s+", " ", x) if isinstance(x, str) else x)
    return temp_df

In [8]:
#Process 'sentence' column
grouped_sentences["processed_sentence"] = text_processing(grouped_sentences,'sentence')

#Preview the new grouped_sentences dataframe
grouped_sentences.head()

Unnamed: 0,article_id,sentence,processed_sentence
0,1,Examining interdependence between product user...,examining interdependence between product user...
1,2,Information management as an enabler of knowle...,information management as an enabler of knowle...
2,3,Action Research and Critical Rationalisationis...,action research and critical rationalisationis...
3,4,Integration of Supply Chain Execution in B2B M...,integration of supply chain execution in marke...
4,5,Association for Information Systems AIS Electr...,association for information systems ais electr...


In [9]:
#Preview the whole first row of the processed sentence
print(grouped_sentences.iloc[0]['processed_sentence'])

examining interdependence between product users users and employees in online user communities the role of employee generated content abstract firm sponsored online user communities have become product innovation and support hubs of strategic importance to firms product users and host firm employees comprise the participants of firm sponsored online user communities the online user community provides forum wherein the product users users and firm employees discuss questions problems or issues resulting from the use of host firms products extant research on online user communities has largely focused on either product users users or employees and has examined the various dynamics that ensue from each entity community participation this paper seeks to investigate the interdependence between the two entities in the communities and in particular how product users users reading of employee generated content influences subsequent knowledge contribution by product users users as well as emplo

In [10]:
grouped_sentences.head()

Unnamed: 0,article_id,sentence,processed_sentence
0,1,Examining interdependence between product user...,examining interdependence between product user...
1,2,Information management as an enabler of knowle...,information management as an enabler of knowle...
2,3,Action Research and Critical Rationalisationis...,action research and critical rationalisationis...
3,4,Integration of Supply Chain Execution in B2B M...,integration of supply chain execution in marke...
4,5,Association for Information Systems AIS Electr...,association for information systems ais electr...


In [11]:
import pickle

In [12]:
#Save the grouped_sentences dataframe as pickle
with open('Datasets/grouped_sentences.pkl', 'wb') as f:
    pickle.dump(grouped_sentences, f)

In [3]:
#Check disk quota
os.system('quota -s')

Disk quotas for user roland (uid 1002): 
     Filesystem   space   quota   limit   grace   files   quota   limit   grace
   /dev/nvme0n1    226G      0K    300G            815k       0       0        


0

In [4]:
#Display biggest files in current quota
os.system('du -a /home/roland/Projects/richard | sort -n -r | head -n 10')

4282496	/home/roland/Projects/richard
4282252	/home/roland/Projects/richard/grouped_sentences.pickle
144	/home/roland/Projects/richard/data_load.ipynb
88	/home/roland/Projects/richard/LLM_SOLAR-10.7B.ipynb
8	/home/roland/Projects/richard/data_processing.ipynb
0	/home/roland/Projects/richard/LLM_quantum-dpo-v0.1.ipynb


0

In [8]:
#Clear cache in current quota
os.system('rm -rf /home/roland/Projects/richard/*')

0

In [3]:
#Check current memory usage
os.system('free -m')

              total        used        free      shared  buff/cache   available
Mem:        1546783       65278     1159084          71      322419     1473541
Swap:          8191           0        8191


0

In [13]:
#Free some memory usage
os.system('sudo sysctl -w vm.drop_caches=3')


sudo: a terminal is required to read the password; either use the -S option to read from standard input or configure an askpass helper


256