# Cleaning the dataset even further

In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from nltk.corpus import stopwords 
import regex as re
from bs4 import BeautifulSoup  

%matplotlib inline

In [70]:
# Load the reddit posts used throughout this notebook. The path is relative,
# so the notebook has to be run from the directory that contains `Datasets/`.
# NOTE(review): judging by the file name this CSV already carries
# feature-engineered columns from an earlier step -- confirm its provenance.
df_reddit = pd.read_csv('Datasets/reddit_list_with_feature_engineering.csv')

In [71]:
# Preview the first rows to confirm the load and see which columns are available.
df_reddit.head()

Unnamed: 0,author,id,num_comments,score,created_utc,selftext,title,subreddit,char_count_title,word_count_title,char_count_selftext,word_count_selftext,title + selftext
0,nothanksbud5,g5rffm,1,2,1587516214,Wow i didn’t realize how much music is about b...,Why is almost all music seem to be about love?,0,46,10,219,36,Why is almost all music seem to be about love?...
1,dontknowwhattdo,g5r7z2,3,2,1587515419,I thought that during this time it would be ni...,pieces of advice that have stuck with you?,0,42,8,285,55,pieces of advice that have stuck with you? I t...
2,sharkfinnsouphk,g5r5q2,2,0,1587515173,I just can't shake this worry about kids (and ...,Worried about people stuck at home,0,34,6,269,50,Worried about people stuck at home I just can'...
3,dehlen1me,g5r3t3,0,1,1587514972,https://youtu.be/9_AWrNmcMZA\nThis is one of t...,How a 5 Dollar bill can help you to feel bette...,0,62,13,179,24,How a 5 Dollar bill can help you to feel bette...
4,fighterpilot909,g5qtjo,2,2,1587513886,Imagine how insane that book would be. To make...,I want an autobiography from John McAfee so badly,0,49,9,144,28,I want an autobiography from John McAfee so ba...


## Cleaning up the data a little more

### Trying it out on the titles first

In [72]:
def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, built on first use only."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        # Reading the corpus is comparatively slow; do it once instead of once
        # per post. A (frozen)set gives constant-time membership tests.
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


# This function has been taken and adapted from Matt Brems NLP I lecture.
def reddit_to_words(raw_text):
    """Convert a raw reddit post (title or selftext) to a cleaned string of words.

    Steps: strip HTML markup, replace every non-ASCII-letter character with a
    space, lower-case, split on whitespace, and drop NLTK English stopwords.

    Parameters
    ----------
    raw_text : str
        The raw text of the post. ``None`` and float ``NaN`` (how pandas
        represents a missing string) are treated as an empty post.

    Returns
    -------
    str
        The remaining words joined by single spaces; ``""`` when nothing is
        left (e.g. the text consisted only of stopwords or was missing).
    """
    # Missing values would otherwise make BeautifulSoup raise.
    if raw_text is None or (isinstance(raw_text, float) and raw_text != raw_text):
        return ""

    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parser (lxml, html5lib) happens to be
    #    installed, and so bs4 does not warn about guessing one.
    text = BeautifulSoup(raw_text, "html.parser").get_text()

    # 2. Remove non-letters (digits, punctuation, non-ASCII characters).
    letters_only = re.sub("[^a-zA-Z]", " ", text)

    # 3. Convert to lower case, split into individual words.
    words = letters_only.lower().split()

    # 4. Fetch the stopwords (cached; a set is used for fast lookups, not to
    #    de-duplicate anything).
    stops = _english_stopwords()

    # 5. Remove stopwords, then
    # 6. join the words back into one string separated by space.
    return " ".join(w for w in words if w not in stops)

In [73]:
# Number of posts (rows) in the dataset; used as the denominator in the
# progress messages printed by the cleaning loops.
total_titles = len(df_reddit.index)

In [74]:
# Adapted from NLP I lesson by Matt Brems

# X_train / X_test were never created in this notebook, so on a fresh kernel
# this cell failed with a NameError. Build the split explicitly here.
# NOTE(review): the parameters of the original split are not recorded. The
# default 75/25 split matches the recorded progress output (between 16,000 and
# 17,000 training rows out of 21,825); random_state is fixed only to make
# re-runs reproducible -- confirm against the modelling notebook.
X_train, X_test = train_test_split(df_reddit, random_state=42)

# Lists that will hold the cleaned titles of each split.
clean_train_titles = []
clean_test_titles = []

# The progress counter runs cumulatively over train + test.
total_split_titles = len(X_train) + len(X_test)

print("Cleaning and parsing the training set subreddit titles...")

# For every title in our training set...
for j, train_title in enumerate(X_train['title'], start=1):

    # Convert title to words, then append to clean_train_titles.
    clean_train_titles.append(reddit_to_words(train_title))

    # Report progress every 1000 titles.
    if j % 1000 == 0:
        print(f'Title {j} of {total_split_titles}.')

# Let's do the same for our testing set.
print("Cleaning and parsing the testing set of subreddit titles...")

# For every title in our testing set (counter continues after the train rows)...
for j, test_title in enumerate(X_test['title'], start=len(X_train) + 1):

    # Convert title to words, then append to clean_test_titles.
    clean_test_titles.append(reddit_to_words(test_title))

    # Report progress every 1000 titles.
    if j % 1000 == 0:
        print(f'Title {j} of {total_split_titles}.')

Cleaning and parsing the training set subreddit titles...
Review 1000 of 21825.
Review 2000 of 21825.
Review 3000 of 21825.
Review 4000 of 21825.
Review 5000 of 21825.
Review 6000 of 21825.
Review 7000 of 21825.
Review 8000 of 21825.
Review 9000 of 21825.
Review 10000 of 21825.
Review 11000 of 21825.
Review 12000 of 21825.
Review 13000 of 21825.
Review 14000 of 21825.


  ' that document to Beautiful Soup.' % decoded_markup


Review 15000 of 21825.
Review 16000 of 21825.
Cleaning and parsing the testing set of subreddit titles...


  ' that document to Beautiful Soup.' % decoded_markup


Title 17000 of 21825.
Title 18000 of 21825.
Title 19000 of 21825.
Title 20000 of 21825.
Title 21000 of 21825.


In [75]:
# Wrap the cleaned training titles in a single-column DataFrame.
# NOTE(review): `df` is not referenced again in the visible part of this
# notebook -- this looks like a leftover experiment; confirm before removing.
df = pd.DataFrame(clean_train_titles)

In [76]:
# Adapted from NLP I lesson by Matt Brems
# Clean the title of every post in df_reddit (the whole dataset, not a split).
# The progress message texts are kept exactly as in the original cell.
print("Cleaning and parsing the training set subreddit titles...")

# Collected cleaned titles, in the same order as the rows of df_reddit.
clean_titles = []

# j counts the titles processed so far (stays 0 if there are no rows).
j = 0
for j, raw_title in enumerate(df_reddit['title'], start=1):
    clean_titles.append(reddit_to_words(raw_title))

    # Emit a progress line after every 1000th title.
    if j % 1000 == 0:
        print(f'Review {j} of {total_titles}.')

Cleaning and parsing the training set subreddit titles...
Review 1000 of 21825.
Review 2000 of 21825.
Review 3000 of 21825.
Review 4000 of 21825.
Review 5000 of 21825.
Review 6000 of 21825.
Review 7000 of 21825.
Review 8000 of 21825.
Review 9000 of 21825.
Review 10000 of 21825.
Review 11000 of 21825.
Review 12000 of 21825.
Review 13000 of 21825.
Review 14000 of 21825.
Review 15000 of 21825.
Review 16000 of 21825.
Review 17000 of 21825.
Review 18000 of 21825.
Review 19000 of 21825.
Review 20000 of 21825.
Review 21000 of 21825.


In [77]:
# Single-column frame of cleaned titles; its default 0..n-1 index follows the
# order in which the titles were appended to `clean_titles`.
clean_title_df = pd.DataFrame(clean_titles, columns=['clean_title'])

In [78]:
# Eyeball the cleaned titles (pandas truncates the display for long frames).
clean_title_df

Unnamed: 0,clean_title
0,almost music seem love
1,pieces advice stuck
2,worried people stuck home
3,dollar bill help feel better
4,want autobiography john mcafee badly
...,...
21820,normal able pick attractive feature ones face
21821,today best worst day ever
21822,taunting hard losing game
21823,gone far


In [79]:
# Attach the cleaned titles as the 'clean_title' column. `assign` is used
# instead of `pd.concat(axis=1)` so that re-running this cell overwrites the
# column rather than appending a second, duplicate 'clean_title' column.
# Values are aligned on the row index, exactly as concat(axis=1) aligned them.
df_reddit = df_reddit.assign(clean_title=clean_title_df['clean_title'])

In [80]:
# Adapted from NLP I lesson by Matt Brems
# Clean the selftext (post body) of every row in df_reddit.
# The progress message texts are kept exactly as in the original cell, even
# though this cell processes selftext rather than titles.
print("Cleaning and parsing the training set subreddit titles...")

# Collected cleaned selftext, in the same order as the rows of df_reddit.
clean_selftext = []

# j counts the posts processed so far (stays 0 if there are no rows).
j = 0
for j, raw_selftext in enumerate(df_reddit['selftext'], start=1):
    clean_selftext.append(reddit_to_words(raw_selftext))

    # Emit a progress line after every 1000th post.
    if j % 1000 == 0:
        print(f'Review {j} of {total_titles}.')

Cleaning and parsing the training set subreddit titles...
Review 1000 of 21825.
Review 2000 of 21825.
Review 3000 of 21825.
Review 4000 of 21825.
Review 5000 of 21825.
Review 6000 of 21825.
Review 7000 of 21825.
Review 8000 of 21825.


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


Review 9000 of 21825.
Review 10000 of 21825.
Review 11000 of 21825.
Review 12000 of 21825.
Review 13000 of 21825.
Review 14000 of 21825.
Review 15000 of 21825.
Review 16000 of 21825.
Review 17000 of 21825.
Review 18000 of 21825.


  ' that document to Beautiful Soup.' % decoded_markup


Review 19000 of 21825.
Review 20000 of 21825.


  ' that document to Beautiful Soup.' % decoded_markup


Review 21000 of 21825.


In [81]:
# Single-column frame of cleaned selftext, in df_reddit row order.
clean_selftext_df = pd.DataFrame(clean_selftext, columns=['clean_selftext'])

# Attach it as the 'clean_selftext' column. `assign` (instead of
# `pd.concat(axis=1)`) keeps the cell idempotent: re-running it overwrites the
# column instead of adding a duplicate one. Alignment is on the row index.
df_reddit = df_reddit.assign(clean_selftext=clean_selftext_df['clean_selftext'])

In [82]:
# Check that the clean_title and clean_selftext columns were added as expected.
df_reddit.head()

Unnamed: 0,author,id,num_comments,score,created_utc,selftext,title,subreddit,char_count_title,word_count_title,char_count_selftext,word_count_selftext,title + selftext,clean_title,clean_selftext
0,nothanksbud5,g5rffm,1,2,1587516214,Wow i didn’t realize how much music is about b...,Why is almost all music seem to be about love?,0,46,10,219,36,Why is almost all music seem to be about love?...,almost music seem love,wow realize much music love romance seems like...
1,dontknowwhattdo,g5r7z2,3,2,1587515419,I thought that during this time it would be ni...,pieces of advice that have stuck with you?,0,42,8,285,55,pieces of advice that have stuck with you? I t...,pieces advice stuck,thought time would nice hear words encourageme...
2,sharkfinnsouphk,g5r5q2,2,0,1587515173,I just can't shake this worry about kids (and ...,Worried about people stuck at home,0,34,6,269,50,Worried about people stuck at home I just can'...,worried people stuck home,shake worry kids adults stuck home lock sexual...
3,dehlen1me,g5r3t3,0,1,1587514972,https://youtu.be/9_AWrNmcMZA\nThis is one of t...,How a 5 Dollar bill can help you to feel bette...,0,62,13,179,24,How a 5 Dollar bill can help you to feel bette...,dollar bill help feel better,https youtu awrnmcmza one amazing uplifting vi...
4,fighterpilot909,g5qtjo,2,2,1587513886,Imagine how insane that book would be. To make...,I want an autobiography from John McAfee so badly,0,49,9,144,28,I want an autobiography from John McAfee so ba...,want autobiography john mcafee badly,imagine insane book would make even better cou...


In [83]:
# Adapted from NLP I lesson by Matt Brems
# Clean the feature-engineered 'title + selftext' column of every row.
# The progress message texts are kept exactly as in the original cell.
print("Cleaning and parsing the training set subreddit titles...")

# Collected cleaned texts, in the same order as the rows of df_reddit.
clean_title_and_selftext = []

# j counts the posts processed so far (stays 0 if there are no rows).
j = 0
for j, raw_combined in enumerate(df_reddit['title + selftext'], start=1):
    clean_title_and_selftext.append(reddit_to_words(raw_combined))

    # Emit a progress line after every 1000th post.
    if j % 1000 == 0:
        print(f'Review {j} of {total_titles}.')

Cleaning and parsing the training set subreddit titles...
Review 1000 of 21825.
Review 2000 of 21825.
Review 3000 of 21825.
Review 4000 of 21825.
Review 5000 of 21825.
Review 6000 of 21825.
Review 7000 of 21825.
Review 8000 of 21825.
Review 9000 of 21825.
Review 10000 of 21825.
Review 11000 of 21825.
Review 12000 of 21825.
Review 13000 of 21825.
Review 14000 of 21825.
Review 15000 of 21825.
Review 16000 of 21825.
Review 17000 of 21825.
Review 18000 of 21825.
Review 19000 of 21825.
Review 20000 of 21825.
Review 21000 of 21825.


In [84]:
# Single-column frame of the cleaned 'title + selftext' text, in row order.
clean_title_and_selftext_df = pd.DataFrame(clean_title_and_selftext, columns=['clean_title_+_selftext'])

# Attach it to df_reddit. `assign` (instead of `pd.concat(axis=1)`) keeps the
# cell idempotent: re-running it overwrites the column instead of appending a
# duplicate. The column name is not a valid Python identifier, hence the
# dict-unpacking form. Alignment is on the row index.
df_reddit = df_reddit.assign(
    **{'clean_title_+_selftext': clean_title_and_selftext_df['clean_title_+_selftext']}
)

In [85]:
# Check that the clean_title_+_selftext column was added as expected.
df_reddit.head()

Unnamed: 0,author,id,num_comments,score,created_utc,selftext,title,subreddit,char_count_title,word_count_title,char_count_selftext,word_count_selftext,title + selftext,clean_title,clean_selftext,clean_title_+_selftext
0,nothanksbud5,g5rffm,1,2,1587516214,Wow i didn’t realize how much music is about b...,Why is almost all music seem to be about love?,0,46,10,219,36,Why is almost all music seem to be about love?...,almost music seem love,wow realize much music love romance seems like...,almost music seem love wow realize much music ...
1,dontknowwhattdo,g5r7z2,3,2,1587515419,I thought that during this time it would be ni...,pieces of advice that have stuck with you?,0,42,8,285,55,pieces of advice that have stuck with you? I t...,pieces advice stuck,thought time would nice hear words encourageme...,pieces advice stuck thought time would nice he...
2,sharkfinnsouphk,g5r5q2,2,0,1587515173,I just can't shake this worry about kids (and ...,Worried about people stuck at home,0,34,6,269,50,Worried about people stuck at home I just can'...,worried people stuck home,shake worry kids adults stuck home lock sexual...,worried people stuck home shake worry kids adu...
3,dehlen1me,g5r3t3,0,1,1587514972,https://youtu.be/9_AWrNmcMZA\nThis is one of t...,How a 5 Dollar bill can help you to feel bette...,0,62,13,179,24,How a 5 Dollar bill can help you to feel bette...,dollar bill help feel better,https youtu awrnmcmza one amazing uplifting vi...,dollar bill help feel better https youtu awrnm...
4,fighterpilot909,g5qtjo,2,2,1587513886,Imagine how insane that book would be. To make...,I want an autobiography from John McAfee so badly,0,49,9,144,28,I want an autobiography from John McAfee so ba...,want autobiography john mcafee badly,imagine insane book would make even better cou...,want autobiography john mcafee badly imagine i...


In [90]:
# Count missing values per column.
# NOTE(review): isnull() does not flag empty strings, so a post whose cleaned
# text is "" (e.g. every word was a stopword) still counts as non-null here.
df_reddit.isnull().sum()

author                    0
id                        0
num_comments              0
score                     0
created_utc               0
selftext                  0
title                     0
subreddit                 0
char_count_title          0
word_count_title          0
char_count_selftext       0
word_count_selftext       0
title + selftext          0
clean_title               0
clean_selftext            0
clean_title_+_selftext    0
dtype: int64

In [91]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [94]:
# From the Sentiment Analysis Local Lab Lesson
sia = SentimentIntensityAnalyzer()

# Score every document exactly once. polarity_scores returns a dict with the
# keys 'neg', 'neu', 'pos' and 'compound'; calling it separately for each key
# (as before) repeated the same work four times per row.
# NOTE(review): the scores are computed on the cleaned text (lower-cased,
# punctuation and stopwords removed). Confirm this is intended rather than
# scoring the raw 'title + selftext' column.
polarity = df_reddit['clean_title_+_selftext'].apply(sia.polarity_scores)

# Expand the per-row dicts into columns, keeping df_reddit's row index so the
# assignments below align row for row.
sentiment = pd.DataFrame(polarity.tolist(), index=df_reddit.index)

for sentiment_key in ['neg', 'neu', 'pos', 'compound']:
    df_reddit[sentiment_key] = sentiment[sentiment_key]

df_reddit.head()

Unnamed: 0,author,id,num_comments,score,created_utc,selftext,title,subreddit,char_count_title,word_count_title,char_count_selftext,word_count_selftext,title + selftext,clean_title,clean_selftext,clean_title_+_selftext,neg,neu,pos,compound
0,nothanksbud5,g5rffm,1,2,1587516214,Wow i didn’t realize how much music is about b...,Why is almost all music seem to be about love?,0,46,10,219,36,Why is almost all music seem to be about love?...,almost music seem love,wow realize much music love romance seems like...,almost music seem love wow realize much music ...,0.08,0.301,0.619,0.9777
1,dontknowwhattdo,g5r7z2,3,2,1587515419,I thought that during this time it would be ni...,pieces of advice that have stuck with you?,0,42,8,285,55,pieces of advice that have stuck with you? I t...,pieces advice stuck,thought time would nice hear words encourageme...,pieces advice stuck thought time would nice he...,0.091,0.383,0.526,0.9657
2,sharkfinnsouphk,g5r5q2,2,0,1587515173,I just can't shake this worry about kids (and ...,Worried about people stuck at home,0,34,6,269,50,Worried about people stuck at home I just can'...,worried people stuck home,shake worry kids adults stuck home lock sexual...,worried people stuck home shake worry kids adu...,0.467,0.456,0.077,-0.8924
3,dehlen1me,g5r3t3,0,1,1587514972,https://youtu.be/9_AWrNmcMZA\nThis is one of t...,How a 5 Dollar bill can help you to feel bette...,0,62,13,179,24,How a 5 Dollar bill can help you to feel bette...,dollar bill help feel better,https youtu awrnmcmza one amazing uplifting vi...,dollar bill help feel better https youtu awrnm...,0.0,0.63,0.37,0.8555
4,fighterpilot909,g5qtjo,2,2,1587513886,Imagine how insane that book would be. To make...,I want an autobiography from John McAfee so badly,0,49,9,144,28,I want an autobiography from John McAfee so ba...,want autobiography john mcafee badly,imagine insane book would make even better cou...,want autobiography john mcafee badly imagine i...,0.215,0.63,0.156,-0.3818


In [95]:
# Confirm the row/column counts after adding the cleaned-text and sentiment columns.
df_reddit.shape

(21825, 20)

In [98]:
# Last look at the first rows before writing the frame to disk.
df_reddit.head()

Unnamed: 0,author,id,num_comments,score,created_utc,selftext,title,subreddit,char_count_title,word_count_title,char_count_selftext,word_count_selftext,title + selftext,clean_title,clean_selftext,clean_title_+_selftext,neg,neu,pos,compound
0,nothanksbud5,g5rffm,1,2,1587516214,Wow i didn’t realize how much music is about b...,Why is almost all music seem to be about love?,0,46,10,219,36,Why is almost all music seem to be about love?...,almost music seem love,wow realize much music love romance seems like...,almost music seem love wow realize much music ...,0.08,0.301,0.619,0.9777
1,dontknowwhattdo,g5r7z2,3,2,1587515419,I thought that during this time it would be ni...,pieces of advice that have stuck with you?,0,42,8,285,55,pieces of advice that have stuck with you? I t...,pieces advice stuck,thought time would nice hear words encourageme...,pieces advice stuck thought time would nice he...,0.091,0.383,0.526,0.9657
2,sharkfinnsouphk,g5r5q2,2,0,1587515173,I just can't shake this worry about kids (and ...,Worried about people stuck at home,0,34,6,269,50,Worried about people stuck at home I just can'...,worried people stuck home,shake worry kids adults stuck home lock sexual...,worried people stuck home shake worry kids adu...,0.467,0.456,0.077,-0.8924
3,dehlen1me,g5r3t3,0,1,1587514972,https://youtu.be/9_AWrNmcMZA\nThis is one of t...,How a 5 Dollar bill can help you to feel bette...,0,62,13,179,24,How a 5 Dollar bill can help you to feel bette...,dollar bill help feel better,https youtu awrnmcmza one amazing uplifting vi...,dollar bill help feel better https youtu awrnm...,0.0,0.63,0.37,0.8555
4,fighterpilot909,g5qtjo,2,2,1587513886,Imagine how insane that book would be. To make...,I want an autobiography from John McAfee so badly,0,49,9,144,28,I want an autobiography from John McAfee so ba...,want autobiography john mcafee badly,imagine insane book would make even better cou...,want autobiography john mcafee badly imagine i...,0.215,0.63,0.156,-0.3818


In [100]:
# Persist the cleaned dataset; index=False keeps the row index out of the file.
# NOTE(review): empty strings are written as empty CSV fields, which
# pd.read_csv parses as NaN by default. Any cleaned text that ended up ""
# will therefore come back as a missing value -- downstream readers should
# pass keep_default_na=False or fillna('') on the clean_* columns.
df_reddit.to_csv('Datasets/reddit_cleaned_title_and_selftext.csv', index=False)