In [1]:
import pandas as pd
import numpy as np
import sklearn 
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib

# Data cleaning: null and duplicated values, getting rid of stop words

In [2]:
df = pd.read_csv("YoutubeCommentsDataSet.csv")

df.info()
df.describe()

print("\nNULL count:")
print(df.isnull().sum())
print()

print("Unique rows:", df["Comment"].nunique())
print("Duplicate rows:", df.duplicated().sum())

duplicated_comments = df[df.duplicated()]
duplicated_comments.head(25)

print("Unique values in duplicated comments: ", duplicated_comments["Comment"].nunique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18408 entries, 0 to 18407
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Comment    18364 non-null  object
 1   Sentiment  18408 non-null  object
dtypes: object(2)
memory usage: 287.8+ KB

NULL count:
Comment      44
Sentiment     0
dtype: int64

Unique rows: 17871
Duplicate rows: 531
Unique values in duplicated comments:  346


Cleaning the data: 
What do we do with null values? NaN sentiments... -- discard.
What do we do with duplicted comments? Sometimes two or three in a row.


In [3]:
#Discard empty entries (44/18408) and duplicates (?)
df = df.dropna()
df = df.drop_duplicates()

# Adding features: 
- continuous sentiment score (nltk)
- sentence complexity score (nltk)
- comment length
- comment starts with I



### Continuous sentiment score from nltk library

In [4]:
import nltk

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/lila/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [5]:
#Dodajemy feature: Sentiment Score ntlk -- ciągły sentyment, wart. od -1 do 1.

sia = SentimentIntensityAnalyzer()
df['Sentiment_Score_nltk'] = df['Comment'].apply(lambda x: sia.polarity_scores(x)['compound'])

In [6]:
df.head()

Unnamed: 0,Comment,Sentiment,Sentiment_Score_nltk
0,lets not forget that apple pay in 2014 require...,neutral,0.6774
1,here in nz 50 of retailers don’t even have con...,negative,0.836
2,i will forever acknowledge this channel with t...,positive,0.8718
3,whenever i go to a place that doesn’t take app...,negative,-0.8914
4,apple pay is so convenient secure and easy to ...,positive,0.6786


### Starts with I and Comment length

In [7]:
df['Starts_with_i'] = df['Comment'].str.startswith("i")
df['Comment_Length'] = df['Comment'].str.len()

sentiment_mapping = {'negative': -1, 'neutral': 0, 'positive': 1}
df['Sentiment_num'] = df['Sentiment'].map(sentiment_mapping)

df.head()

Unnamed: 0,Comment,Sentiment,Sentiment_Score_nltk,Starts_with_i,Comment_Length,Sentiment_num
0,lets not forget that apple pay in 2014 require...,neutral,0.6774,False,317,0
1,here in nz 50 of retailers don’t even have con...,negative,0.836,False,163,-1
2,i will forever acknowledge this channel with t...,positive,0.8718,True,183,1
3,whenever i go to a place that doesn’t take app...,negative,-0.8914,False,450,-1
4,apple pay is so convenient secure and easy to ...,positive,0.6786,False,135,1


### Sentence complexity score from nltk library

## Dividing into training, testing, and validation subsets: crossvalidation

In [8]:
df.to_pickle("processed_data.pkl")