# Data Pre-processing and Cleaning

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read (Colab only).
from google.colab import drive
drive.mount('/content/drive')
# To detach the drive when finished, run: drive.flush_and_unmount()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 📦 Step 1: Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import numpy as np
import re
import string
import unicodedata

In [None]:
# Load the raw CSV from the mounted Drive into the main working DataFrame.
all_data = pd.read_csv('/content/drive/MyDrive/reddit_data.csv')


In [None]:
# Cast created_date to datetime64[ns] so later cells can use the .dt accessor on it.
all_data['created_date'] = all_data['created_date'].astype('datetime64[ns]')
# Preview the first rows.
all_data.head()

Unnamed: 0,created_date,created_timestamp,subreddit,title,id,author,author_created_utc,full_link,score,num_comments,num_crossposts,subreddit_subscribers,post
0,2010-02-10 22:06:17,1265832000.0,analytics,YouTube's traffic data for music questioned,b0ih7,salvage,1184143000.0,https://www.reddit.com/r/analytics/comments/b0...,3.0,0.0,0.0,,
1,2010-02-10 22:06:53,1265832000.0,analytics,November Sees Number of U.S. Videos Viewed Onl...,b0ihf,salvage,1184143000.0,https://www.reddit.com/r/analytics/comments/b0...,1.0,0.0,0.0,,
2,2010-02-11 19:47:22,1265910000.0,analytics,So what do you guys all do related to analytic...,b0x63,xtom,1227476000.0,https://www.reddit.com/r/analytics/comments/b0...,7.0,4.0,0.0,,There's a lot of reasons to want to know all t...
3,2010-02-12 18:10:36,1265991000.0,analytics,10 Web Analytics Tools For Tracking Your Visitors,b1bbg,[deleted],,https://www.reddit.com/r/analytics/comments/b1...,4.0,1.0,0.0,,
4,2010-02-26 20:26:18,1267209000.0,analytics,Improving Your Sense of Site,b6x0n,[deleted],,https://www.reddit.com/r/analytics/comments/b6...,2.0,0.0,0.0,,


In [None]:
# Column names, non-null counts and dtypes — shows which columns have missing values.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545427 entries, 0 to 545426
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   created_date           545427 non-null  datetime64[ns]
 1   created_timestamp      545427 non-null  float64       
 2   subreddit              545427 non-null  object        
 3   title                  545427 non-null  object        
 4   id                     545427 non-null  object        
 5   author                 545427 non-null  object        
 6   author_created_utc     91985 non-null   float64       
 7   full_link              545427 non-null  object        
 8   score                  545427 non-null  float64       
 9   num_comments           545427 non-null  float64       
 10  num_crossposts         433002 non-null  float64       
 11  subreddit_subscribers  405810 non-null  float64       
 12  post                   274209 non-null  obje

In [None]:
# Summary statistics, rendered with an 'inferno' background gradient.
all_data.describe().style.background_gradient(cmap = 'inferno')

Unnamed: 0,created_date,created_timestamp,author_created_utc,score,num_comments,num_crossposts,subreddit_subscribers
count,545427,545427.0,91985.0,545427.0,545427.0,433002.0,405810.0
mean,2019-07-09 11:11:53.170284544,1562661426.950131,1391665057.851878,3.927306,3.749374,0.004566,371764.222422
min,2008-03-19 10:08:43,1205914123.0,1118030400.0,0.0,-1.0,0.0,1.0
25%,2018-02-20 01:19:48,1519082388.0,1343358944.0,1.0,0.0,0.0,48515.25
50%,2020-02-25 02:02:43,1582588963.0,1412598451.0,1.0,1.0,0.0,133797.0
75%,2021-04-14 16:30:12,1618407012.0,1454875996.0,1.0,3.0,0.0,397627.0
max,2022-05-08 19:18:07,1652026687.0,1550407658.0,8331.0,2927.0,23.0,2418635.0
std,,70210808.400223,80527184.409415,24.602288,12.081218,0.091358,562179.294606


In [None]:
def text_cleaner(text):
    """
    Normalise a piece of raw text for downstream NLP.

    Steps, in order: lower-case; strip accents and other non-ASCII
    characters; replace [bracketed] spans with a space; drop URLs; drop
    HTML-like tags; drop ASCII punctuation; turn newlines into spaces and
    remove carriage returns; drop every word that contains a digit.

    Whitespace is neither collapsed nor trimmed, so removed tokens can leave
    leading or repeated spaces in the result.

    Parameters
    ----------
    text : str or missing value
        The text to clean. Non-string input (e.g. None, or the float NaN
        pandas uses for missing values) is treated as empty.

    Returns
    -------
    str
        The cleaned text; '' for non-string input.
    """
    # Missing values have no .lower(); return an empty string instead of raising.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # NFKD splits accented characters into base + combining mark; the ASCII
    # round-trip then discards the marks and any other non-ASCII characters.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # Raw strings keep regex escapes (\[, \S, \w, \d) from being read as
    # invalid Python string escapes, which newer interpreters warn about.
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\r', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Register tqdm's progress_apply on pandas objects, then run the cleaner over
# every title with a progress bar.
tqdm.pandas()
title_cleaned = all_data['title'].progress_apply(text_cleaner)
title_cleaned

100%|██████████| 545427/545427 [00:12<00:00, 43026.65it/s]


Unnamed: 0,title
0,youtubes traffic data for music questioned
1,november sees number of us videos viewed onlin...
2,so what do you guys all do related to analytic...
3,web analytics tools for tracking your visitors
4,improving your sense of site
...,...
545422,medical stats book with r
545423,markov chains with unequal sequence lengths
545424,view all available rcppplugins
545425,print only loadings in factanal


In [None]:
# Keep the cleaned titles in the main frame, alongside the original 'title' column.
all_data['title_cleaned'] = title_cleaned


# Sentiment Analyser

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
# Download the VADER lexicon resource before constructing the analyzer.
nltk.download('vader_lexicon')

# One analyzer instance, reused by the scoring cell below.
sia = SentimentIntensityAnalyzer()


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
def _label_from_compound(score):
    """Map a compound sentiment score to 'positive', 'negative' or 'neutral'."""
    # NOTE(review): bounds are strict here; VADER's own guidance is usually
    # quoted with inclusive bounds (>= 0.05 / <= -0.05) — confirm intent.
    if score > 0.05:
        return 'positive'
    if score < -0.05:
        return 'negative'
    return 'neutral'


# Score each cleaned title and keep only the 'compound' value.
all_data['sentiment_compound'] = title_cleaned.apply(
    lambda title: sia.polarity_scores(title)['compound']
)
# Bucket the compound score into a three-way label.
all_data['sentiment_label'] = all_data['sentiment_compound'].apply(_label_from_compound)

In [None]:
# Spot-check the first 10 cleaned titles with their compound scores and labels.
all_data[['title_cleaned', 'sentiment_compound', 'sentiment_label']].head(10)


Unnamed: 0,title_cleaned,sentiment_compound,sentiment_label
0,youtubes traffic data for music questioned,-0.1027,negative
1,november sees number of us videos viewed onlin...,0.0772,positive
2,so what do you guys all do related to analytic...,0.4588,positive
3,web analytics tools for tracking your visitors,0.0,neutral
4,improving your sense of site,0.4215,positive
5,googles invasive nonanonymized ad targeting a ...,-0.2263,negative
6,million monthly youtube visitors by,0.0,neutral
7,best free web analytics tools for your websites,0.8176,positive
8,twitter now growing at a staggering percent,0.1779,positive
9,google search funnels the greatest assist spec...,0.6808,positive


In [None]:
# Use the cleaned title + selftext as the input
# Missing selftext is replaced by "", so those rows hold just the title plus a trailing space.
# NOTE(review): this overwrites the 'post' column in place, so re-running the cell
# prepends the title a second time; only the title part has gone through text_cleaner.
all_data['post'] = title_cleaned + " " + all_data['post'].fillna("")

# Virality Predictor

In [None]:
# A post counts as viral when its score is strictly above the chosen
# percentile of all scores.
VIRAL_QUANTILE = 0.75
viral_threshold = all_data['score'].quantile(VIRAL_QUANTILE)
all_data['is_viral'] = all_data['score'].gt(viral_threshold)


# Topic Trend Predictor

In [None]:
# 3️⃣ Assign dominant topic to each post
# argmax along axis 1 yields, for each row, the position of its largest value.
# NOTE(review): `topic_distributions` is not defined in any visible cell; presumably
# it is a (posts x topics) matrix from a topic model — confirm, and restore the cell
# that builds it so the notebook runs on a fresh kernel.
all_data['dominant_topic'] = topic_distributions.argmax(axis=1)

In [None]:
# Bucket each post into a weekly pandas Period (freq 'W') based on created_date.
all_data['week'] = pd.to_datetime(all_data['created_date']).dt.to_period('W')



In [None]:
# Integer weekday of each post's creation date.
all_data['day_of_week'] = all_data['created_date'].dt.dayofweek # 0=Monday, 6=Sunday


In [None]:
# Re-parse created_date as datetime; unparseable values become NaT (errors='coerce').
# NOTE(review): earlier cells already use the .dt accessor on this column, so this
# conversion looks redundant at this point — consider moving it ahead of those cells.
all_data['created_date'] = pd.to_datetime(all_data['created_date'], errors='coerce')


# Controversy Predictor

Using a binary classifier

In [None]:
# Comments-per-point ratio: many comments relative to a small |score| gives a
# high value.
# epsilon is only a numerical guard on the division, so it must be tiny. The
# previous value (1e+6) dominated the denominator and reduced the ratio to
# roughly num_comments / 1e6, almost independent of score.
epsilon = 1e-6
all_data['controversy_ratio'] = all_data['num_comments'] / (abs(all_data['score']) + 1 + epsilon)

# Download data

In [None]:
# Write the processed frame to the current working directory, omitting the index column.
all_data.to_csv("reddit_data_processed_time.csv", index=False)


In [None]:
from google.colab import files
# Send the exported CSV to the browser as a download (Colab only).
files.download("reddit_data_processed_time.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import joblib

# Serialise `lda` to lda_model.pkl, then send that file to the browser as a download.
# NOTE(review): `lda` is not defined in any visible cell — presumably the fitted
# topic model; restore the cell that creates it so this runs on a fresh kernel.
joblib.dump(lda, "lda_model.pkl")
files.download("lda_model.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>