In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
%matplotlib inline
import itertools
from wordcloud import WordCloud, STOPWORDS

from src.features.preprocess import PreProcess

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer

import re
from gensim import corpora, models
import gensim

[nltk_data] Downloading package punkt to /home/andy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/andy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/andy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/andy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/andy/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Read the posts csv file

In [2]:
subreddit = "computerscience"

In [3]:
df_posts = pd.read_csv("../data/raw/" + subreddit + "_posts.csv")
df_posts.head()

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created
0,n2n0ax,New to programming or computer science? Want a...,375,1.0,computerscience,https://www.reddit.com/r/computerscience/comme...,1037,The previous thread was finally archived with ...,1619890000.0
1,qb4bof,THIS IS NOT A TECH SUPPORT SUB OR A COMPUTER R...,404,0.97,computerscience,https://www.reddit.com/r/computerscience/comme...,54,Tech Support: /r/techsupport\n\nComputer Recom...,1634619000.0
2,s8afrp,Ways to learn higher level math through CS?,12,0.88,computerscience,https://www.reddit.com/r/computerscience/comme...,3,"I'm a university student, and while I've taken...",1642654000.0
3,s7yrsa,Why is there nothing between 8bit and 16bit?,23,0.82,computerscience,https://www.reddit.com/r/computerscience/comme...,19,"For example, if i want to change the image spe...",1642622000.0
4,s7li8v,Why is Internet/transmission speed in Bit and ...,59,0.95,computerscience,https://www.reddit.com/r/computerscience/comme...,41,I am always wondering about this. Why Internet...,1642581000.0


In [4]:
def filter_urls(s):
    if isinstance(s, str):
        return re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', s, flags=re.MULTILINE)
    else:
        return s

df_posts['body'] = df_posts['body'].apply(filter_urls)

### Find body texts that have http included

In [5]:
df_posts[~(df_posts['body'].str.find('http') == -1)]

Unnamed: 0,post_id,title,score,upvote_ratio,subreddit,url,num_comments,body,created
17,s75dnu,How Kubernetes Protects Enterprises From Ranso...,0,0.50,computerscience,https://containerjournal.com/editorial-calenda...,0,,1.642534e+09
18,s6v29r,Good resources to learn more about authenticat...,3,1.00,computerscience,/r/webdev/comments/s6ev5f/good_resources_to_le...,0,,1.642504e+09
21,s6mxkp,Need some Cache Memory analogies…preferably to...,8,0.83,computerscience,https://www.reddit.com/r/computerscience/comme...,3,,1.642475e+09
24,s6ijx1,"""The early days of Unix at Bell Labs"" - Brian ...",6,0.87,computerscience,/r/unix/comments/s6iiyh/the_early_days_of_unix...,0,,1.642463e+09
31,s57kne,Making Your Game Go Fast by Asking Windows Nicely,40,0.85,computerscience,https://www.anthropicstudios.com/2022/01/13/as...,6,,1.642323e+09
...,...,...,...,...,...,...,...,...,...
477,q1txed,“My phone is listening in on my conversations”...,90,0.97,computerscience,https://twitter.com/JL_Kroger/status/144533314...,6,,1.633431e+09
484,q0j4vh,Yann LeCun's Paper Gets Rejected From NeurIPS ...,62,0.90,computerscience,https://www.theclickreader.com/yann-lecun-pape...,17,,1.633272e+09
487,q0ta23,difference in hard vs soft transfer function (AI),1,1.00,computerscience,https://www.reddit.com/r/computerscience/comme...,1,,1.633303e+09
489,oyunkm,Built a computer from scratch. A Z80 running a...,997,0.99,computerscience,https://v.redd.it/qti2kz1rjmf71,82,,1.628206e+09


In [6]:
df_posts[~(df_posts['body'].str.find('www') == -1)].body.unique()

array([nan], dtype=object)

In [7]:
df_posts[~(df_posts['body'].str.find('http') == -1)].body.unique()

array([nan,
       '**So for the context:**\n\nIf i\'m assuming right the "n" stands for input or size of the array and when one says O(n) it means the statment will take "**n"** number of times to complete it....\n\nbut what does "**k"** mean?\n\n[here]() op said this ⬇️\n\n&#x200B;\n\n[https:\\/\\/stackoverflow.com\\/q\\/27301287\\/5630533]()\n\nDoes that mean "**k"** is just an another (secondary) notation for "**n"**\n\n&#x200B;\n\n&#x200B;\n\n[sorry, couldn\'t find the original link]()\n\n&#x200B;\n\nReferencing the above table, how much would be the difference between cubic and polynomial if n is 10 and k is 0(can we even assume 0 for "**k**" value)?'],
      dtype=object)

In [8]:
# Index 309's body has a very unique URL that can't be filtered out
df_posts.iloc[309]['body']

'**So for the context:**\n\nIf i\'m assuming right the "n" stands for input or size of the array and when one says O(n) it means the statment will take "**n"** number of times to complete it....\n\nbut what does "**k"** mean?\n\n[here]() op said this ⬇️\n\n&#x200B;\n\n[https:\\/\\/stackoverflow.com\\/q\\/27301287\\/5630533]()\n\nDoes that mean "**k"** is just an another (secondary) notation for "**n"**\n\n&#x200B;\n\n&#x200B;\n\n[sorry, couldn\'t find the original link]()\n\n&#x200B;\n\nReferencing the above table, how much would be the difference between cubic and polynomial if n is 10 and k is 0(can we even assume 0 for "**k**" value)?'