In [2]:
import matplotlib as plt
import pandas as pd
import numpy as np
import sklearn as sk
import nltk
import random
from gensim.models import Word2Vec
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Path of the raw comments CSV that the later cells read
data_path = "text_comments.csv"

# The VADER lexicon must be available locally before the analyzer is built
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/renath/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


We first explore the data in `text_comments.csv` by printing some rows.

In [3]:
# Open the CSV lazily so it can be consumed chunk_size rows at a time
chunk_size = 40000
tf_chunks = pd.read_csv(data_path, chunksize=chunk_size)

# Pull the first 10 rows off the reader and show them.
# NOTE: get_chunk advances the reader, so later reads from tf_chunks
# continue after these rows.
preview_rows = tf_chunks.get_chunk(10)
print(preview_rows)

           id  score    link_id                author          subreddit  \
0  t1_ftjl56l      4  t3_gzv6so             mega_trex  BeautyGuruChatter   
1  t1_ftjpxmc      6  t3_gzv6so             [deleted]  BeautyGuruChatter   
2  t1_gzzxfyt     22  t3_nodb9e             divadream  BeautyGuruChatter   
3  t1_gzzy7nc     92  t3_no6qaj  Ziegenkoennenfliegen  BeautyGuruChatter   
4  t1_h00tpbp     82  t3_nolx7p       meowrottenralph  BeautyGuruChatter   
5  t1_ftlamij      1  t3_h0an62       somethingelse19  BeautyGuruChatter   
6  t1_h01dtz3     28  t3_noo5e0           sasukesbutt  BeautyGuruChatter   
7  t1_h01fl3q      2  t3_nn2hz7             Mika_Kyle  BeautyGuruChatter   
8  t1_ftll1qn      6  t3_h0dpxq             [deleted]  BeautyGuruChatter   
9  t1_ftlsbtj      2  t3_h0an62            angelicad6  BeautyGuruChatter   

                                                body  created_utc  
0  Does anyone have a good cruelty free one? The ...   1591755558  
1  (stares at my soft glam 

We count how many comments each subreddit has in the first chunk of the data.

In [4]:
# Take the next chunk from the reader and tally comments per subreddit
# (dropna=False keeps a row for missing subreddit values, if any)
first_chunk = next(tf_chunks)
subreddit_column = first_chunk["subreddit"]
subreddit_column.value_counts(dropna=False)

MLBTheShow           13415
BeautyGuruChatter     7638
MensRights            6321
TrueOffMyChest        4769
SaltLakeCity          3560
OurPresident          2761
Cosmere               1536
Name: subreddit, dtype: int64

Next, we introduce the sentiment analysis tool called VADER (Valence Aware Dictionary and sEntiment Reasoner). The sentiment scores are represented as a dictionary with the following keys:

    'neg': Negative sentiment score (proportion of the text that is negative)
    'neu': Neutral sentiment score (proportion of the text that is neutral)
    'pos': Positive sentiment score (proportion of the text that is positive)
    'compound': Compound sentiment score (a normalized, weighted composite score that represents the overall sentiment of the text)


In [5]:
# Example sentence to score (swap in any text you like)
text = "I really enjoyed working with my team. They are so helpful and supportive."

# Build the VADER analyzer and score the sentence in one go;
# polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound'
sid = SentimentIntensityAnalyzer()
sentiment_scores = sid.polarity_scores(text)

# Show the resulting score dictionary
print(f"Sentiment Scores: {sentiment_scores}")

Sentiment Scores: {'neg': 0.0, 'neu': 0.482, 'pos': 0.518, 'compound': 0.8649}


Now we take a 1-in-100 sample of the dataset by drawing 10 rows from every chunk of 1,000 rows. (Do not re-run this cell; the sample has already been generated and saved to `sample.csv`.)

In [None]:
# Specify the number of rows to read in each chunk
chunk_size = 1000

# Specify the number of rows to sample from each chunk
sample_size = 10

# Specify the path for the output CSV file
output_csv_path = 'sample.csv'

# True until the first sampled chunk has been written; it decides whether
# the header is emitted and whether the output file is created or appended to
first_chunk = True

# Stream the source CSV chunk_size rows at a time instead of loading it whole
csv_reader = pd.read_csv(data_path, chunksize=chunk_size, encoding='utf-8')

# Iterate over each chunk, sample up to sample_size rows, and write them out
for i, chunk in enumerate(csv_reader):
    try:
        # The last chunk may contain fewer than sample_size rows; cap n so
        # that sample() does not raise on it
        rows_to_draw = min(sample_size, len(chunk))

        # NOTE(review): with a fixed random_state every full-size chunk is
        # sampled at the same row positions; vary the seed per chunk if an
        # independent draw per chunk is wanted.
        sampled_chunk = chunk.sample(n=rows_to_draw, random_state=42)

        # Overwrite on the first write so a re-run does not append a second
        # header and duplicate rows to an existing file; append afterwards
        sampled_chunk.to_csv(
            output_csv_path,
            mode='w' if first_chunk else 'a',
            index=False,
            header=first_chunk,
        )

        # Only flip the flag once a chunk has actually been written
        first_chunk = False
    except Exception as e:
        # Bind the exception so the message can report it (the bare except
        # previously left `e` undefined, turning any failure into a NameError)
        print(f"Error in chunk {i + 1}: {e}")

We explore the sample dataset.

In [6]:
# Load the pre-built sample and preview its first rows
sample_csv = 'sample.csv'
df = pd.read_csv(sample_csv)
df.head(n=15)

Unnamed: 0,id,score,link_id,author,subreddit,body,created_utc
0,t1_fozlgkc,84.0,t3_ga8hsp,[deleted],BeautyGuruChatter,I felt like the British guy actually asked him...,1588193000.0
1,t1_fw10hrc,2.0,t3_hf5j57,MGDlikethebeer,BeautyGuruChatter,Damn I missed it. I hope someone recorded it s...,1593143000.0
2,t1_gijx7tq,6.0,t3_ksvuzc,uselesssubject,BeautyGuruChatter,Hmmm I was more wondering if it was affiliated...,1610123000.0
3,t1_gi8bmyh,2.0,t3_kppctm,pitolaser,BeautyGuruChatter,Thanks! Will check them out.,1609884000.0
4,t1_fofd5q7,6.0,t3_g6oxmu,forgotmyfuckingname,BeautyGuruChatter,Same here. When I cook/bake recipes that take ...,1587733000.0
5,t1_fvzbhby,22.0,t3_hf5j57,severaldogs,BeautyGuruChatter,I honestly think it’s why he uploaded the demo...,1593110000.0
6,t1_fvwg9vn,3.0,t3_hf5j57,hygsi,BeautyGuruChatter,"I've noticed this too, there have been rumors ...",1593039000.0
7,t1_fvpa78v,64.0,t3_he33fn,OneHappyOne,BeautyGuruChatter,"Yeah, I think that's what a lot of people get ...",1592878000.0
8,t1_fw5dw20,4.0,t3_hgknrq,Cycyvandemoosdijk,BeautyGuruChatter,Hell yeah !!!,1593246000.0
9,t1_fuwcfg9,25.0,t3_h9agyk,pinkglitterydolphins,BeautyGuruChatter,Things are pretty bad in the UK regarding the ...,1592227000.0


We flag rows whose author is `[deleted]` or whose body is `[removed]`, together with rows where either field is empty or null. We then drop those rows and report how many rows remain.

In [7]:
# Garbage count
# A row is garbage when its author or its body is missing, empty, or a
# placeholder ('[deleted]' for authors, '[removed]' for bodies).
author_col = df['author']
body_col = df['body']
bad_author = author_col.isna() | author_col.isin(['', '[deleted]'])
bad_body = body_col.isna() | body_col.isin(['', '[removed]'])
garbage_rows = bad_author | bad_body

# Each True in the mask is one garbage row
garbage_count = int(garbage_rows.sum())
garbage_count

41253

In [8]:
# Total number of rows in the sample before filtering
len(df)

407620

In [9]:
# Keep only the rows that were not flagged as garbage.
# NOTE: this rebinds `df`, so the cell is meant to be run once after the
# mask has been computed.
rows_to_keep = ~garbage_rows
df = df[rows_to_keep]
len(df)

366367

We group by `author` and concatenate the body together to produce one aggregated string to be fed into the sentiment analyzer.

In [10]:
# Collect every comment body per author and glue them together with
# single spaces, yielding one row (author, body) per author
bodies_by_author = df.groupby('author')['body']
grouped_df = bodies_by_author.agg(' '.join).reset_index()
grouped_df.head(10)

Unnamed: 0,author,body
0,-----------_---,why red pickaxe honestly i think this is a jok...
1,------__------------,I want computers to get twice as fast every 6 ...
2,-----iMartijn-----,A band is just a company and if you want it to...
3,-----isaac-----,"That’s some of the best Star Wars lore, idk ab..."
4,-----username-----,That’s what this is all about. The UK had to h...
5,----Oumeno----,PB pls make her an LI ill do anything ill pay ...
6,----Richard----,"Yeah, I get you."
7,----The_Truth-----,Your post was removed from r/PennyStocks becau...
8,----yw----,"i know what those are from, but why do they us..."
9,---AL---,The Jack Daniels of churches.


We create a function that takes a text string and returns its compound sentiment score.

In [11]:
def sentiment(text: str) -> float:
    """Return the VADER compound score for ``text``.

    Scores ``text`` with the module-level analyzer ``sid`` and returns the
    ``'compound'`` entry of the resulting score dictionary.
    """
    scores = sid.polarity_scores(text)
    return scores['compound']

We apply the function to generate the sentiment score for a sample of 10 users, and print the entries to check whether the generated scores make sense for each text body. With the exception of `JakeFitzy7` and `proximateprose`, the sentiment scores seem to make sense. The sentiment analyzer got 8/10 right, which is better than average (at least for this sample). At first glance, it seems to work better for true positives than for true negatives.

In [12]:
# Draw a reproducible sample of 10 authors and attach a sentiment score
# computed from each author's concatenated comments
demo_sample = (
    grouped_df
    .sample(n=10, random_state=42)
    .assign(sentiment=lambda frame: frame['body'].apply(sentiment))
)

# Print each sampled author with their text and score for a manual check
for author, body, score in zip(
    demo_sample['author'], demo_sample['body'], demo_sample['sentiment']
):
    print(f"Author: {author}\nBody: {body}\nSentiment: {score}\n")

Author: JakeFitzy7
Body: I said *if he is* learn to read you autistic humunculous.
Sentiment: 0.0

Author: Revrun670
Body: Yo who did this? 🤔 Who did this to  Bobby 😂😂😂
Sentiment: 0.0

Author: LightningBoltZolt
Body: I was thinking the same thing, especially with their track of keeping secrets that are plain to see. Thanks for your response!
Sentiment: 0.4926

Author: varjar
Body: > goalrilla

Was always jealous of the kids who had those.
Sentiment: -0.4588

Author: proximateprose
Body: Guys who are usually "decent people" but will do terrible shit when given the opportunity can usually be dissuaded by letting them know you are gathering pieces of data in case they do anything to you (car make, model, license plate; first and last name; as many socials as you can get; etc.), but you do still have to **actually gather** that kind of info, which is really hard to care about and remember to do when manic, sorry.

For the true shitbags, they're so used to deceiving women to get access that