In [None]:
import pandas as pd
import re
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import warnings
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.utils import resample
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Fetch the NLTK "punkt_tab" and "stopwords" resources
# (presumably the data behind the tokenizer and stop-word imports above -- TODO confirm).
nltk.download("punkt_tab")
nltk.download("stopwords")

## Data cleaning: null and duplicated values, getting rid of stop words

In [17]:
# Load the raw dataset from the CSV file.
df = pd.read_csv("YoutubeCommentsDataSet.csv")

# Structural overview: dtypes, non-null counts, memory usage.
df.info()
# A notebook only renders a bare expression when it is the LAST line of the cell,
# so tables produced mid-cell must be handed to display() explicitly
# (display is provided by the IPython/Jupyter kernel).
display(df.describe())

print("\nNULL count:")
print(df.isnull().sum())
print()

# NOTE: the first figure counts distinct non-null comment texts, while the second
# counts rows that repeat an earlier row in every column, so the two numbers
# are not complements of each other.
print("Unique rows:", df["Comment"].nunique())
print("Duplicate rows:", df.duplicated().sum())

# Rows that exactly repeat an earlier row (first occurrences are not included).
duplicated_comments = df[df.duplicated()]
display(duplicated_comments.head(25))

print("Unique values in duplicated comments: ", duplicated_comments["Comment"].nunique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18408 entries, 0 to 18407
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Comment    18364 non-null  object
 1   Sentiment  18408 non-null  object
dtypes: object(2)
memory usage: 287.8+ KB

NULL count:
Comment      44
Sentiment     0
dtype: int64

Unique rows: 17871
Duplicate rows: 531
Unique values in duplicated comments:  346


Cleaning the data:
What do we do with null values? The nulls are NaN comments (the Sentiment column has none) -- discard.
What do we do with duplicated comments? Sometimes the same comment appears two or three times in a row.


In [18]:
# Remove rows with any missing value (44 of 18408 entries), then rows that exactly
# repeat an earlier row. TODO: confirm that discarding duplicates is desired.
df = df.dropna().drop_duplicates()

## Adding features: 
- continuous sentiment score (nltk)
- comment length
- comment starts with I
- personal pronouns count
- number of words
- number of phrases (the comment split on connectors such as "but", "and", "because")
- average word length
- average phrase length
- unique word ratio (unique words / number of words)

### Continuous sentiment score from nltk library

In [19]:
# NLTK pieces used for sentiment scoring and text processing.
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Lexicon resource for the VADER analyzer imported above.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Robert\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [20]:
# Add a feature: NLTK sentiment score -- a continuous sentiment value ranging from -1 to 1
# (the 'compound' entry of the analyzer's polarity scores).

sia = SentimentIntensityAnalyzer()
df['Sentiment_Score_nltk'] = df['Comment'].map(
    lambda text: sia.polarity_scores(text)['compound']
)

In [21]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,Comment,Sentiment,Sentiment_Score_nltk
0,lets not forget that apple pay in 2014 require...,neutral,0.6774
1,here in nz 50 of retailers don’t even have con...,negative,0.836
2,i will forever acknowledge this channel with t...,positive,0.8718
3,whenever i go to a place that doesn’t take app...,negative,-0.8914
4,apple pay is so convenient secure and easy to ...,positive,0.6786


### Starts with I and Comment length

In [22]:
# Two cheap surface features:
#   Starts_with_i  -- True when the comment text begins with the character "i"
#   Comment_Length -- number of characters in the comment
# NOTE(review): the prefix test also matches words such as "is", "it" or "if";
# confirm whether only the pronoun "i" was intended.
df = df.assign(
    Starts_with_i=df['Comment'].str.startswith("i"),
    Comment_Length=df['Comment'].str.len(),
)
df.head()

Unnamed: 0,Comment,Sentiment,Sentiment_Score_nltk,Starts_with_i,Comment_Length
0,lets not forget that apple pay in 2014 require...,neutral,0.6774,False,317
1,here in nz 50 of retailers don’t even have con...,negative,0.836,False,163
2,i will forever acknowledge this channel with t...,positive,0.8718,True,183
3,whenever i go to a place that doesn’t take app...,negative,-0.8914,False,450
4,apple pay is so convenient secure and easy to ...,positive,0.6786,False,135


### Personal_pronoun_count

In [23]:
# Lowercase personal pronouns, grouped by person (subject, object,
# possessive and reflexive forms).
personal_pronouns = {
    "i", "me", "my", "mine", "myself",
    "we", "us", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself",
    "she", "her", "hers", "herself",
    "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves",
}


def count_personal_pronouns(comment):
    """Return how many tokens of ``comment`` are personal pronouns.

    The comment is lowercased and tokenized with ``word_tokenize``; each token
    found in ``personal_pronouns`` counts once per occurrence.
    """
    tokens = word_tokenize(comment.lower())
    matches = [token for token in tokens if token in personal_pronouns]
    return len(matches)
    
# One pronoun count per comment, stored as a new column.
pronoun_counts = df["Comment"].map(count_personal_pronouns)
df["Personal_Pronoun_count"] = pronoun_counts
df.head()

Unnamed: 0,Comment,Sentiment,Sentiment_Score_nltk,Starts_with_i,Comment_Length,Personal_Pronoun_count
0,lets not forget that apple pay in 2014 require...,neutral,0.6774,False,317,3
1,here in nz 50 of retailers don’t even have con...,negative,0.836,False,163,1
2,i will forever acknowledge this channel with t...,positive,0.8718,True,183,5
3,whenever i go to a place that doesn’t take app...,negative,-0.8914,False,450,13
4,apple pay is so convenient secure and easy to ...,positive,0.6786,False,135,2


### Sentence complexity features i.e.:
- Number_of_phrases
- Number_of_words
- Avg_phrase_length
- Avg_word_length
- Unique_word_ratio

In [24]:
# Words treated as phrase boundaries when a comment is split into phrases.
connectors = ["and", "but", "so", "because", "which", "that", "then", "if", "or"]

# Built once instead of once per comment. Matches any connector as a whole word,
# case-insensitively, so that "And" splits a comment the same way "and" does.
_CONNECTOR_PATTERN = re.compile(
    r'\b(?:' + '|'.join(re.escape(connector) for connector in connectors) + r')\b',
    re.IGNORECASE,
)


def sentence_complexity(comment):
    """Compute simple complexity statistics for a single comment.

    Parameters
    ----------
    comment : str
        The comment text.

    Returns
    -------
    dict
        ``Number_of_phrases``: non-empty segments left after splitting on connectors.
        ``Number_of_words``: number of tokens returned by ``word_tokenize``.
        ``Avg_phrase_length``: tokens per phrase (0 when there are no phrases).
        ``Avg_word_length``: mean token length in characters (0 when there are no tokens).
        ``Unique_word_ratio``: distinct tokens / tokens (0 when there are no tokens).
    """
    words = word_tokenize(comment)
    # Splitting leaves empty or whitespace-only pieces when the comment starts or
    # ends with a connector, or when two connectors are adjacent. Those pieces are
    # not phrases and must not inflate the phrase count.
    phrases = [segment for segment in _CONNECTOR_PATTERN.split(comment) if segment.strip()]

    num_words = len(words)
    num_phrases = len(phrases)

    avg_word_length = np.mean([len(word) for word in words]) if words else 0
    avg_phrase_length = num_words / num_phrases if num_phrases > 0 else 0
    unique_word_ratio = len(set(words)) / num_words if num_words > 0 else 0

    return {
        "Number_of_phrases": num_phrases,
        "Number_of_words": num_words,
        "Avg_phrase_length": avg_phrase_length,
        "Avg_word_length": avg_word_length,
        "Unique_word_ratio": unique_word_ratio
    }

# Compute the complexity statistics for every comment (one dict per row) and
# expand them into a frame with one column per statistic.
df_complexity = pd.DataFrame(df["Comment"].apply(sentence_complexity).tolist())

# df_complexity gets a fresh 0..n-1 index; renumber df the same way so the
# column-wise concat pairs rows by position.
df = df.reset_index(drop=True)

# Drop complexity columns left over from a previous run of this cell, so that
# re-running it replaces them instead of appending duplicate column names.
df = pd.concat(
    [df.drop(columns=df_complexity.columns, errors="ignore"), df_complexity],
    axis=1,
)
df.head()

Unnamed: 0,Comment,Sentiment,Sentiment_Score_nltk,Starts_with_i,Comment_Length,Personal_Pronoun_count,Number_of_phrases,Number_of_words,Avg_phrase_length,Avg_word_length,Unique_word_ratio
0,lets not forget that apple pay in 2014 require...,neutral,0.6774,False,317,3,4,57,14.25,4.578947,0.789474
1,here in nz 50 of retailers don’t even have con...,negative,0.836,False,163,1,3,33,11.0,4.090909,0.878788
2,i will forever acknowledge this channel with t...,positive,0.8718,True,183,5,3,31,10.333333,4.935484,0.903226
3,whenever i go to a place that doesn’t take app...,negative,-0.8914,False,450,13,7,99,14.142857,3.717172,0.616162
4,apple pay is so convenient secure and easy to ...,positive,0.6786,False,135,2,4,26,6.5,4.230769,0.961538


## Dividing into training, testing, and validation subsets: cross-validation

In [8]:
# Persist the current contents of df to processed_data.pkl.
# NOTE(review): this cell's execution count (8) is out of sequence with the cells
# above (17-24) -- restart and run all before relying on the saved file.
df.to_pickle("processed_data.pkl")
