In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn import metrics
import re
import string
from nltk.stem import WordNetLemmatizer
%matplotlib inline 


In [114]:
# Importing Cleaned Submission data from CSV

df = pd.read_csv('netflix_disney_submissions.csv')

In [115]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20651 entries, 0 to 20650
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  20651 non-null  int64 
 1   title       20651 non-null  object
 2   subreddit   20651 non-null  object
dtypes: int64(1), object(2)
memory usage: 484.1+ KB


In [116]:
df.head(100)

Unnamed: 0.1,Unnamed: 0,title,subreddit
0,0,Has anyone figured out how to hide the user in...,netflix
1,1,Payment issues?,netflix
2,2,Recommendations?,netflix
3,3,An idea on what Netflix can do about its const...,netflix
4,4,Time Differences in Different Countries,netflix
...,...,...,...
95,95,4K netflix on monitor,netflix
96,96,Will you keep Netflix if they ban password sha...,netflix
97,97,Went to the Store for some Serial,netflix
98,98,Some shows are vanishing even though they're d...,netflix


In [117]:
#Label Target Value   0 for Netflix and 1 DisneyPlus
df['target'] = df['subreddit'].map({'netflix': 0, 'DisneyPlus':1})
df.drop('subreddit', axis=1, inplace=True)
df.head()


Unnamed: 0.1,Unnamed: 0,title,target
0,0,Has anyone figured out how to hide the user in...,0
1,1,Payment issues?,0
2,2,Recommendations?,0
3,3,An idea on what Netflix can do about its const...,0
4,4,Time Differences in Different Countries,0


In [118]:
df.tail()

Unnamed: 0.1,Unnamed: 0,title,target
20646,20646,Audio is really low?,1
20647,20647,That this was pretty interesting,1
20648,20648,Could be the wrong place to ask but…,1
20649,20649,Anyone else surprised Muppet Babies (2018) isn...,1
20650,20650,Continue watching is gone how can I get it back?,1


In [119]:
# Drop duplicates

df.duplicated().sum()

0

In [120]:
# Check Null values
df.isnull().sum()

Unnamed: 0    0
title         0
target        0
dtype: int64

In [121]:
df.shape

(20651, 3)

#### Clean Data

In [155]:
def cleaning(text):
    # Make lowercase
    text = text.lower()

    # Remove HTML special entities (e.g. &amp;)
    text = re.sub(r'\&\w*;', '', text)
    
    # Remove hyperlinks
    text = re.sub(r'https?:\/\/.*\/\w*', '', text)
    
    # Remove punctuation and split 's, 't, 've with a space for filter
    text = re.sub(r'[' + string.punctuation.replace('@', '') + ']+', ' ', text)
    
    # Remove words with 2 or fewer letters
    text = re.sub(r'\b\w{1,2}\b', '', text)
    
    # Remove words with numbers 
    text = re.sub(r'[0-9.+]', '', text)
    
    # Remove whitespace (including new line characters)
    text = re.sub(r'\s\s+', ' ', text)
    
    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    text = ''.join(c for c in text if c <= '\uFFFF') 
    
    return text

In [156]:
df['title'] = df['title'].apply(cleaning)

In [157]:
df.shape

(20630, 3)

In [158]:
df.head(80)


Unnamed: 0.1,Unnamed: 0,title,target
0,0,anyone figured out how hide the user interfac...,0
1,1,payment issue,0
2,2,recommendation,0
3,3,idea what netflix can about constant cancellin...,0
4,4,time difference different country,0
...,...,...,...
75,75,contestant spill the tea netflix series bullsh,0
76,76,meghan markle created animated series ‘pearl’ ...,0
77,77,cannot activate account without credit card,0
78,78,spoiler ozark finale review that bad,0


In [159]:
df = df[df['title'] != '']
df = df.reset_index(drop=True)

In [160]:
df.shape

(20630, 3)

In [161]:
df.head(80)

Unnamed: 0.1,Unnamed: 0,title,target
0,0,anyone figured out how hide the user interfac...,0
1,1,payment issue,0
2,2,recommendation,0
3,3,idea what netflix can about constant cancellin...,0
4,4,time difference different country,0
...,...,...,...
75,75,contestant spill the tea netflix series bullsh,0
76,76,meghan markle created animated series ‘pearl’ ...,0
77,77,cannot activate account without credit card,0
78,78,spoiler ozark finale review that bad,0


## NLP Pre Processing and Lemmatization

In [162]:
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    words = text.split()
    lemma_words =''
    for word in words:
        lemma_words += (lemmatizer.lemmatize(word) + ' ')
    return lemma_words

In [163]:
df['title'] = df['title'].apply(lemmatize_words)

In [164]:
df.shape

(20630, 3)

In [165]:
# drop rows with empty ''
df = df[df['title'] !='']
df = df.reset_index(drop=True)

In [166]:
df.shape

(20630, 3)

In [167]:
df.to_csv('clean_data1.csv', index= False)

## Count Vectorizer

### Netflix Words

In [168]:
#Most Frequently used words

count_vect = CountVectorizer(analyzer="word", tokenizer = None, preprocessor = None, stop_words = "english", ngram_range=(1,1))

# Netflix CountVectorizer 

vector_netflix = df[df['target'] == 0]['title']

# fit_transform the vectorizer

netflix_words = count_vect.fit_transform(vector_netflix)

#Convert output to Numpy Array

netflix_words = netflix_words.toarray()




In [169]:
# Netflix Words

netflix_words_list = count_vect.get_feature_names()

print(netflix_words_list)



### DisneyPlus Words

In [170]:
#Most Frequently used words

count_vect = CountVectorizer(analyzer="word", tokenizer = None, preprocessor = None, stop_words = "english", ngram_range=(1,1))

# Netflix CountVectorizer 

vector_disney = df[df['target'] == 1]['title']

# fit_transform the vectorizer

disney_words = count_vect.fit_transform(vector_netflix)

#Convert output to Numpy Array

disney_words = disney_words.toarray()


In [171]:
disney_words_list = count_vect.get_feature_names()

print(netflix_words_list)



In [172]:
from sklearn.feature_extraction import text

In [176]:
print(text.ENGLISH_STOP_WORDS)
len(text.ENGLISH_STOP_WORDS)

frozenset({'yourselves', 'already', 'between', 'into', 'co', 'whose', 'at', 'although', 'everything', 'if', 'but', 'formerly', 'whenever', 'yours', 'that', 'your', 'detail', 'not', 'over', 'indeed', 'something', 'everywhere', 'her', 'hers', 'nobody', 'meanwhile', 'must', 'ourselves', 'is', 'will', 'due', 'hereafter', 'above', 'are', 'without', 'former', 'can', 'whither', 'throughout', 'after', 'moreover', 'below', 'system', 'where', 'mill', 'ten', 'thick', 'may', 'de', 'else', 'out', 'every', 'sometime', 'even', 'they', 'either', 'whereupon', 'my', 'front', 'both', 'for', 'along', 'seeming', 'am', 'another', 'please', 'nowhere', 'anything', 'etc', 'amoungst', 'since', 'twelve', 'next', 'hasnt', 'than', 'whereafter', 'those', 'also', 'why', 'twenty', 'side', 'thus', 'some', 'less', 'noone', 'several', 'afterwards', 'nine', 'when', 'made', 'him', 'in', 'least', 'this', 'might', 'before', 'own', 'we', 'no', 'further', 'neither', 'ltd', 'inc', 'name', 'ours', 'being', 'down', 'go', 'someho

318

In [177]:
# Using sklearn 318 words to CountVectorizer

stop_words = text.ENGLISH_STOP_WORDS


### Word and N-gram frequency 

#### Netflix

In [190]:
#Most Frequently used words

count_vect = CountVectorizer(analyzer="word", tokenizer = None, preprocessor = None, stop_words = stop_words, ngram_range=(1,3), max_features = 10000)

# Netflix CountVectorizer 

vector_netflix = df[df['target'] == 0]['title']

# fit_transform the vectorizer

netflix_words = count_vect.fit_transform(vector_netflix)

#Convert output to Numpy Array

netflix_words = netflix_words.toarray()

In [191]:
# Frequency

netflix_df = pd.DataFrame(netflix_words, columns= count_vect.get_feature_names())

netflix_df.sum().sort_values(ascending=False).head(100)


netflix               4804
season                 641
series                 635
movie                  465
trailer                374
                      ... 
audio                   99
saf                     99
saf segment             99
ultimatum               99
review episode saf      99
Length: 100, dtype: int64

In [192]:
netflix_df.mean().sort_values(ascending=False).head(50)

netflix                     0.467315
season                      0.062354
series                      0.061770
movie                       0.045233
trailer                     0.036381
official                    0.035311
new                         0.031226
watching                    0.031128
subscriber                  0.031031
review                      0.029961
watch                       0.029961
just                        0.026848
episode                     0.026848
official trailer            0.026751
like                        0.025681
account                     0.022471
anime                       0.021401
thing                       0.021401
password                    0.021401
time                        0.020428
good                        0.019455
documentary                 0.019455
sharing                     0.019261
trailer netflix             0.019261
know                        0.019261
subtitle                    0.019261
recommendation              0.018580
w

#### DisneyPlus

In [193]:
#Most Frequently used words

count_vect = CountVectorizer(analyzer="word", tokenizer = None, preprocessor = None, stop_words = stop_words, ngram_range=(1,3), max_features = 10000)

# DisneyPlus CountVectorizer 

vector_disney = df[df['target'] == 1]['title']

# fit_transform the vectorizer

disney_words = count_vect.fit_transform(vector_disney)

#Convert output to Numpy Array

disney_words = disney_words.toarray()

In [194]:
# Frequency

disney_df = pd.DataFrame(disney_words, columns= count_vect.get_feature_names())

disney_df.sum().sort_values(ascending=False).head(100)

disney         4910
series         1024
plus           1008
disney plus     987
marvel          809
               ... 
week            116
premiere        116
guy             116
way             116
australia       115
Length: 100, dtype: int64

In [195]:
# Proportion

disney_df.mean().sort_values(ascending=False).head(50)

disney           0.474396
series           0.098937
plus             0.097391
disney plus      0.095362
marvel           0.078164
star             0.075749
new              0.062415
episode          0.062029
movie            0.060966
moon             0.057488
coming           0.056039
season           0.054976
moon knight      0.054300
knight           0.054300
doe              0.040580
streaming        0.040193
content          0.038841
netflix          0.037778
watch            0.034686
available        0.033527
release          0.033237
think            0.031111
original         0.029372
march            0.028019
poster           0.028019
coming disney    0.026860
family           0.026280
hulu             0.026184
film             0.026184
canada           0.024058
know             0.022222
just             0.022222
time             0.021449
soon             0.020966
year             0.019517
disney star      0.019420
red              0.018744
missing          0.018551
join        

## TF_IDF Vectorizer

### Netflix 


In [196]:
tvec = TfidfVectorizer(analyzer = "word", 
                     stop_words = stop_words, 
                     max_features = 10000, 
                     ngram_range = (1, 3))

netflix_tf_words = tvec.fit_transform(vector_netflix)

netflix_tf_words = netflix_tf_words.toarray()

netflix_df = pd.DataFrame(netflix_tf_words, columns=tvec.get_feature_names())

netflix_df.sum().sort_values(ascending=False).head(50)

netflix                     378.025063
season                       99.482846
series                       97.587473
recommendation               87.766553
movie                        85.649475
trailer                      70.249396
watch                        70.199291
official                     66.303589
watching                     63.445958
review                       60.239044
subscriber                   57.043065
official trailer             55.465437
new                          54.961988
episode                      51.493682
just                         50.803292
cancelled                    49.012463
good                         47.948142
old                          47.202901
anime                        46.173349
subtitle                     45.892470
account                      45.413614
day                          44.598677
looking                      44.172312
trailer netflix              42.979593
watched                      42.672420
help                     

### DisneyPlus

In [197]:
tvec = TfidfVectorizer(analyzer = "word", 
                     stop_words = stop_words, 
                     max_features = 10000, 
                     ngram_range = (1, 3))

disney_tf_words = tvec.fit_transform(vector_disney)

disney_tf_words = disney_tf_words.toarray()

disney_df = pd.DataFrame(disney_tf_words, columns=tvec.get_feature_names())

disney_df.sum().sort_values(ascending=False).head(50)

disney           371.869755
plus             142.684633
disney plus      141.622847
marvel           129.874829
series           118.945798
new              100.133077
star              98.545804
coming            98.380048
movie             97.952807
episode           97.763845
moon              90.514967
season            90.185197
knight            86.908945
moon knight       86.908945
netflix           71.927276
coming disney     69.523522
content           67.050280
daredevil         66.272463
doe               64.359542
streaming         62.289071
watch             59.066736
canada            55.624998
march             55.145715
poster            53.445131
available         52.351057
think             49.507922
release           48.791174
family            46.014258
missing           45.565233
april             45.056824
original          42.822600
year              41.124481
house             39.965069
film              39.558762
help              39.549984
know              38

In [None]:
# Import library for generating word cloud
from wordcloud import WordCloud
from PIL import Image

# Define function to draw word cloud
def draw_wordcloud(text, plot_title=None, mask_img=None):
    """
    Generate and draw word cloud from text.
    
    Parameters
    ----------
    text : str
        Text to generate word cloud from.
    title : str
        Title to set on plot
    mask_img: str
        Path to image mask
    """
    mask = np.array(Image.open(mask_img))

    wordcloud = WordCloud(
        mask = mask,
        scale=7,
        max_words=1000,
        background_color='white',
        colormap='brg',
        contour_width=3,
        contour_color='steelblue',
        random_state=42
    ).generate(text)
    plt.figure(figsize=(14,14), facecolor='white')
    plt.imshow(wordcloud, interpolation="bilinear")
    if plot_title:
        plt.title(plot_title, fontsize=20, pad=50)
    plt.axis("off")
    plt.show()

In [None]:
# Draw word cloud for /r/personalfinance
draw_wordcloud(
    str(df[df.subreddit=='personalfinance'].text_feature),
    plot_title='Word Cloud for /r/personalfinance',
    mask_img = '../images/money_mask.png'
)

In [None]:
# Draw word cloud for /r/relationship_advice
draw_wordcloud(
    str(df[df.subreddit=='relationship_advice'].text_feature),
    plot_title='Word Cloud for /r/relationship_advice',
    mask_img = '../images/heart_mask.png'
)