# 1. Setup

In [1]:
# install packages
!pip install langdetect --quiet

In [2]:
# import libraries
import numpy as np
import pandas as pd
import re
from langdetect import detect
from google.colab import drive

In [3]:
# mount Colab to Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# verify data exists in Google Drive dir
!ls 'drive/My Drive/W266'

reddit_database.csv  W266_Final_Project.ipynb  W266_Final_Project_Main.ipynb


# 2. Load Data

In [5]:
# load data
df = pd.read_csv('drive/My Drive/W266/reddit_database.csv')
df.head(3)

Unnamed: 0,created_date,created_timestamp,subreddit,title,id,author,author_created_utc,full_link,score,num_comments,num_crossposts,subreddit_subscribers,post
0,2010-02-10 22:06:17,1265832000.0,analytics,YouTube's traffic data for music questioned,b0ih7,salvage,1184143000.0,https://www.reddit.com/r/analytics/comments/b0...,3.0,0.0,0.0,,
1,2010-02-10 22:06:53,1265832000.0,analytics,November Sees Number of U.S. Videos Viewed Onl...,b0ihf,salvage,1184143000.0,https://www.reddit.com/r/analytics/comments/b0...,1.0,0.0,0.0,,
2,2010-02-11 19:47:22,1265910000.0,analytics,So what do you guys all do related to analytic...,b0x63,xtom,1227476000.0,https://www.reddit.com/r/analytics/comments/b0...,7.0,4.0,0.0,,There's a lot of reasons to want to know all t...


In [6]:
df = df[['title', 'post']]
df.head(5)

Unnamed: 0,title,post
0,YouTube's traffic data for music questioned,
1,November Sees Number of U.S. Videos Viewed Onl...,
2,So what do you guys all do related to analytic...,There's a lot of reasons to want to know all t...
3,10 Web Analytics Tools For Tracking Your Visitors,
4,Improving Your Sense of Site,


In [7]:
#df.shape
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545427 entries, 0 to 545426
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   title   545427 non-null  object
 1   post    274209 non-null  object
dtypes: object(2)
memory usage: 8.3+ MB


# 3. Data Clensing

In [8]:
# drop NaNs from the title and post
df_cleaned = df.dropna(subset=['title', 'post'])
df_cleaned.head(5)

Unnamed: 0,title,post
2,So what do you guys all do related to analytic...,There's a lot of reasons to want to know all t...
5,"Google's Invasive, non-Anonymized Ad Targeting...","I'm cross posting this from /r/cyberlaw, hopef..."
62,"DotCed - Functional Web Analytics - Tagging, R...","DotCed,a Functional Analytics Consultant, offe..."
64,Program Details - Data Analytics Course,Here is the program details of the data analyt...
65,potential job in web analytics... need to anal...,i decided grad school (physics) was not for me...


In [9]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 274209 entries, 2 to 545425
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   title   274209 non-null  object
 1   post    274209 non-null  object
dtypes: object(2)
memory usage: 6.3+ MB


In [10]:
df_nulls = df['post'].isnull()
df_cleaned[df_nulls]

  df_cleaned[df_nulls]


Unnamed: 0,title,post


In [21]:
# English lang detection
# def is_english(text):
#     try:
#         return detect(text) == 'en'
#     except:
#         return False

# text normalization
def text_normalization(text):
  # # check if text is English
  # if not is_english(text):
  #   return np.nan

  # Remove HTML tags
  text = re.sub(r'<.*?>', '', text)
  # Remove URLs
  text = re.sub(r'https?://\S+|www\.\S+', '', text)
  # convert to lowercase
  text = text.lower()
  # remove punctuation
  text = re.sub(r'[^\w\s]', '', text)
  # remove extra spaces
  text = re.sub(r' +', ' ', text.strip())

  return text

# apply text normalization
df_cleaned['post'] = df_cleaned['post'].apply(text_normalization)
df_cleaned['title'] = df_cleaned['title'].apply(text_normalization)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['post'] = df_cleaned['post'].apply(text_normalization)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['title'] = df_cleaned['title'].apply(text_normalization)


In [32]:
df_cleaned.head(10)

Unnamed: 0,title,post
2,so what do you guys all do related to analytic...,theres a lot of reasons to want to know all th...
5,googles invasive nonanonymized ad targeting a ...,im cross posting this from rcyberlaw hopefully...
62,dotced functional web analytics tagging report...,dotceda functional analytics consultant offeri...
64,program details data analytics course,here is the program details of the data analyt...
65,potential job in web analytics need to analyze...,i decided grad school physics was not for me a...
70,what is the need for data analytics,data analytics provides an insight on the inte...
72,webstats generierung von aussagekräftigen zahlen,dieser artikel soll weniger die leistung oder ...
78,analytics mystery where could a massive spike ...,httpiimgurcommav1mpng\nfor no apparent reason ...
87,omnituredfa genesis integration,anyone have any experience with this i have a ...
88,how to identify which google analytics account...,hey all my gf is having trouble with ga and ha...


In [34]:
df_cleaned.tail(10)

Unnamed: 0,title,post
545415,how to scrape multiple values from a function ...,there is this function called mlb_probables in...
545416,plotprint ctree object with expectedpredicted ...,is there a way to plotprint ctree object with ...
545417,could that be a reason for non significant res...,hi i run chisquare between 2 questionnaires or...
545418,how do i convert a timeseries date from a char...,hey guys \n\n\ni want to create an object that...
545419,two linear regression lines through specific d...,hi everyone\n\nfor a university assignment we ...
545421,help interpretting lmer model output,hello i am wonder how the following output wou...
545422,medical stats book with r,can anybody recommend me a book with medical s...
545423,markov chains with unequal sequence lengths,im trying to build a simple markov chain i hav...
545424,view all available rcppplugins,how do i view all available rcppplugins thanks
545425,print only loadings in factanal,hi everybody\n\ni am currently doing a factor ...


In [13]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 274209 entries, 2 to 545425
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   title   274209 non-null  object
 1   post    274209 non-null  object
dtypes: object(2)
memory usage: 6.3+ MB


In [28]:
# inspect NaNs
df_cleaned[df_cleaned.isna().any(axis=1)]

Unnamed: 0,title,post


In [27]:
df_cleaned.to_csv('drive/My Drive/W266/reddit_database_cleaned.csv', index=False)