In [1]:
# import csv
import pandas as pd

df = pd.read_csv('cleaned_comments.csv', header=None)

print(df)
df_saved = df.copy()

                                                      0
0                                                  text
1     What can you immediately start implementing in...
2              Does playing online games improves focus
3     I wished that all comments goes to me i mean m...
4                      when you comed back you missed 5
...                                                 ...
3900                          Monitor your self thought
3901  Thank you for the message its touching and rea...
3902  Thank you once more. Unleashing Supper brain  ...
3903           Sharing this info with my 12 yr old son.
3904  First! Love the message sharing it with my twi...

[3905 rows x 1 columns]


In [2]:
# Downlaoad NLTK
import nltk

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to C:\Users\PPTI
[nltk_data]     Java\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\PPTI
[nltk_data]     Java\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Case Folding
df = df.applymap(lambda s: s.lower() if type(s) == str else s)

df

Unnamed: 0,0
0,text
1,what can you immediately start implementing in...
2,does playing online games improves focus
3,i wished that all comments goes to me i mean m...
4,when you comed back you missed 5
...,...
3900,monitor your self thought
3901,thank you for the message its touching and rea...
3902,thank you once more. unleashing supper brain ...
3903,sharing this info with my 12 yr old son.


In [4]:
# Tokenization
import re
import string
from nltk.tokenize import word_tokenize

# Remove Number
df = df.applymap(lambda s: re.sub(r"\d+", "", s) if type(s) == str else s)

# Remove punctuation
# translator = str.maketrans("", "", string.punctuation)
# df = df.applymap(lambda s: s.translate(translator) if type(s) == str else s)

# Trim whitespace
df = df.applymap(lambda s: s.strip() if type(s) == str else s)

# Change whitespace in between text into a single whitespace
df = df.applymap(lambda s: re.sub('\s+',' ', s) if type(s) == str else s)

# Tokenize
df_tokens = df.applymap(lambda s: word_tokenize(s) if type(s) == str else s)

df_tokens

Unnamed: 0,0
0,[text]
1,"[what, can, you, immediately, start, implement..."
2,"[does, playing, online, games, improves, focus]"
3,"[i, wished, that, all, comments, goes, to, me,..."
4,"[when, you, comed, back, you, missed]"
...,...
3900,"[monitor, your, self, thought]"
3901,"[thank, you, for, the, message, its, touching,..."
3902,"[thank, you, once, more, ., unleashing, supper..."
3903,"[sharing, this, info, with, my, yr, old, son, .]"


In [5]:
# Stop Words Removal
from nltk.corpus import stopwords

en_list_stopwords = set(stopwords.words('english'))
id_list_stopwords = set(stopwords.words('indonesian'))

df_tokens = df_tokens.applymap(lambda s: [word for word in s if not word in id_list_stopwords and not word in en_list_stopwords] if type(s) == list else s)

df_tokens

Unnamed: 0,0
0,[text]
1,"[immediately, start, implementing, life, ?, ex..."
2,"[playing, online, games, improves, focus]"
3,"[wished, comments, goes, mean, im, right, ok, ..."
4,"[comed, back, missed]"
...,...
3900,"[monitor, self, thought]"
3901,"[thank, message, touching, real, .., graduatio..."
3902,"[thank, ., unleashing, supper, brain, reminded..."
3903,"[sharing, info, yr, old, son, .]"


In [6]:
# Install Stemming in Indonesian
!pip install Sastrawi




[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

df_output = df_tokens.applymap(lambda s: [stemmer.stem(word) for word in s] if type(s) == list else s)
df_output

# Save the DataFrame as a CSV file
# df_output.to_csv('comment_file_video1_clean.csv', index=False)

Unnamed: 0,0
0,[text]
1,"[immedi, start, implement, life, ?, experi, le..."
2,"[play, onlin, game, improv, focu]"
3,"[wish, comment, goe, mean, im, right, ok, than..."
4,"[come, back, miss]"
...,...
3900,"[monitor, self, thought]"
3901,"[thank, messag, touch, real, .., graduation_cap]"
3902,"[thank, ., unleash, supper, brain, remind, rem..."
3903,"[share, info, yr, old, son, .]"


In [8]:
a = range(1, len(df) + 1)
a

range(1, 3906)

In [9]:
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

new_column_values = []
df_output[:3]
for index, row in df_output.iterrows():
    # Example: Let's create a new column 'SquaredAge' that contains the square of the 'Age' column
    # squared_age = row['Age'] ** 2
    # print(df_output[0][index])
    try:
        sentence = ' '.join(df_output[0][index])
        sentiment_scores = sia.polarity_scores(sentence)
        # print(' '.join(df_output[0][index]))
        df_output[0][index] = sentence
        value = 2
        if sentiment_scores['neg'] > sentiment_scores['pos'] and sentiment_scores['neg'] > sentiment_scores['neu']:
            value = 0
        elif sentiment_scores['neu'] > sentiment_scores['pos']:
            value = 1
    except Exception as e:
        value = -1
    
    new_column_values.append(value)
# new_column_values.shape()
df_output['label'] = new_column_values
df_output
df_output['text'] = df_output[0]
df_output = df_output.drop(0, axis=1)
df_output = df_output.dropna(subset=['text'])
print(df_output)
df_output.to_csv('new_comment_labeled_raw.csv', index=False)

[nltk_data] Downloading package vader_lexicon to C:\Users\PPTI
[nltk_data]     Java\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


      label                                               text
0         1                                               text
1         1  immedi start implement life ? experi learn fas...
2         1                        play onlin game improv focu
3         2  wish comment goe mean im right ok thank match ...
4         1                                     come back miss
...     ...                                                ...
3900      1                               monitor self thought
3901      1          thank messag touch real .. graduation_cap
3902      1  thank . unleash supper brain remind remain chi...
3903      1                            share info yr old son .
3904      1  first ! love messag share twin ! one brain inj...

[3905 rows x 2 columns]
