# Data Preparation / Descriptive Statistics Notebook
## ADS 509 Final Project

## Import Packages

In [108]:
#### UTILITIES
import os
import pandas as pd
import numpy as np
import random
from collections import Counter

#### PREPROCESSING
import nltk
import regex as re
import emoji
import string
from nltk.corpus import stopwords
from string import punctuation
from emoji import is_emoji

#### CLASSIFIERS
from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC


#### TOPIC MODELING
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

## Function to Import .csv to DataFrame

In [34]:
def follower_import(filenum):
    svs = pd.DataFrame()
    raw_file = os.listdir('Data')[filenum] # changed to 'Data' folder
    file = ('Data/'+ raw_file) # changed to 'Data' folder
    print(file)
    
    svs = pd.read_csv(file, header=0)
    svs['review'] = svs['review'].apply(str)
    svs['title'] = svs['title'].apply(str)
    svs['app_name'] = svs['app_name'].apply(str)
    
    svs.drop(svs[svs['review'] == 'nan'].index, inplace = True)
    svs.drop(svs[svs['review'] == ''].index, inplace = True)
    svs.drop(svs[svs['title'] == 'nan'].index, inplace = True)
    svs.drop(svs[svs['title'] == ''].index, inplace = True)
    svs.drop(svs[svs['app_name'] == 'nan'].index, inplace = True)
    svs.drop(svs[svs['app_name'] == ''].index, inplace = True)
    
    #svs['desc_len'] = svs['desc'].str.len()
    return svs

print(os.listdir('Data'))  # changed to 'Data' folder

['disney.csv', 'hbo-max-stream-tv-movies.csv', 'hulu-stream-shows-movies.csv', 'netflix.csv', 'youtube-watch-listen-stream.csv']


## Import disney

In [35]:
disney_dl = follower_import(0)
disney = disney_dl[['app_name', 'title', 'review', 'rating']]
disney.describe().T
print(disney.info())
disney.head(3)

Data/disney.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1008 entries, 0 to 1007
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   app_name  1008 non-null   object
 1   title     1008 non-null   object
 2   review    1008 non-null   object
 3   rating    1008 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 31.6+ KB
None


Unnamed: 0,app_name,title,review,rating
0,disney,MISSING many shows (and languages),I grew up watching many a Disney shows. Signed...,3
1,disney,really quite upset,pretty sure this was deliberate. my old iphone...,1
2,disney,Probably the worst streaming app.,We have just about every streaming app you can...,3


## Import hbomax

In [36]:
hbomax_dl = follower_import(1)
hbomax = hbomax_dl[['app_name', 'title', 'review', 'rating']]
hbomax.describe().T
print(hbomax.info())
hbomax.head(3)

Data/hbo-max-stream-tv-movies.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   app_name  1001 non-null   object
 1   title     1001 non-null   object
 2   review    1001 non-null   object
 3   rating    1001 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 31.4+ KB
None


Unnamed: 0,app_name,title,review,rating
0,hbo-max-stream-tv-movies,HBO Enforces Mandatory Letterboxing for “CoNsI...,EDIT: Freedom to watch content in preferred as...,5
1,hbo-max-stream-tv-movies,Great movies terrible app,I love the amount of shows and movies HBO Max ...,2
2,hbo-max-stream-tv-movies,"Buggy, slow, poorly designed, screen-waster","Got a decent size iPhone? Good thing, cause th...",2


## Import hulu

In [37]:
hulu_dl = follower_import(2)
hulu = hulu_dl[['app_name', 'title', 'review', 'rating']]
hulu.describe().T
print(hulu.info())
hulu.head(3)

Data/hulu-stream-shows-movies.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   app_name  1000 non-null   object
 1   title     1000 non-null   object
 2   review    1000 non-null   object
 3   rating    1000 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 31.4+ KB
None


Unnamed: 0,app_name,title,review,rating
0,hulu-stream-shows-movies,"Magnificent, better than tinder",I haven’t felt more alive in YEARS!! My wife l...,5
1,hulu-stream-shows-movies,Ughhh,When you turn your phone upright you see your ...,3
2,hulu-stream-shows-movies,Good but casting is flawed,I’ve noticed some major flaws when casting sho...,2


## Import netflix

In [38]:
netflix_dl = follower_import(3)
netflix = netflix_dl[['app_name', 'title', 'review', 'rating']]
netflix.describe().T
print(netflix.info())
netflix.head(3)

Data/netflix.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1004 entries, 0 to 1003
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   app_name  1004 non-null   object
 1   title     1004 non-null   object
 2   review    1004 non-null   object
 3   rating    1004 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 31.5+ KB
None


Unnamed: 0,app_name,title,review,rating
0,netflix,Great app,I’ve been using Netflix for 5 years and now I\...,4
1,netflix,Its amazing but needs to be better,I’ve been using Netflix for 5 years and now I'...,4
2,netflix,Trash,I use a VPN to protect my privacy and the app ...,1


## Import youtube

In [39]:
youtube_dl = follower_import(4)
youtube = youtube_dl[['app_name', 'title', 'review', 'rating']]
youtube.describe().T
print(youtube.info())
youtube.head(3)

Data/youtube-watch-listen-stream.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1019 entries, 0 to 1018
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   app_name  1019 non-null   object
 1   title     1019 non-null   object
 2   review    1019 non-null   object
 3   rating    1019 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 32.0+ KB
None


Unnamed: 0,app_name,title,review,rating
0,youtube-watch-listen-stream,Great review thank you,Well explained I’ve been using all the topaz l...,5
1,youtube-watch-listen-stream,Same videos keep showing up in loop and made f...,"Update (October 1,2022): the issue with the ol...",4
2,youtube-watch-listen-stream,"Comments, ads, restrictions.",If the video goes into an ad then I lose my sp...,1


In [40]:
df = pd.concat([disney, hbomax, hulu, netflix, youtube], axis = 0, ignore_index = True)
df = df.replace({'app_name': {'hbo-max-stream-tv-movies': 'hbomax',
                              'hulu-stream-shows-movies': 'hulu',
                              'youtube-watch-listen-stream': 'youtube'}})
df

Unnamed: 0,app_name,title,review,rating
0,disney,MISSING many shows (and languages),I grew up watching many a Disney shows. Signed...,3
1,disney,really quite upset,pretty sure this was deliberate. my old iphone...,1
2,disney,Probably the worst streaming app.,We have just about every streaming app you can...,3
3,disney,Love it but could add more stuff,I live Disney! I grew up with it my whole life...,4
4,disney,Frequently causes issues with connectivity,App works great through non-iOS devices but lo...,2
...,...,...,...,...
5027,youtube,Wonderful,"By one concern at this time, once you open you...",4
5028,youtube,Easy on the ads !!!!,Take it easy on the ads Google ! I get it that...,1
5029,youtube,You tube outstanding in information dessimination,You tube is outstanding in bringing out idiffe...,5
5030,youtube,It is great for maybe you but for me.,I live somewhere close to the mountains in Cov...,5


## Choose Stopword List(s)

In [41]:
sw = []
sw.extend(stopwords.words(["english"]))
sw.extend(("disney", "disney+", "disneyplus", "plus", "hbomax", "hbo", "max", "hulu", "netflix", "youtube", "tube"))

## Functions to Preprocess Cells into Tokens

In [42]:
def only_words(x):
    x = re.sub('[^a-z\s]', '', x.lower())
    x = [w for w in x.split() if w not in set(sw)]
    # USE THIS ONE FOR TOKENS
    return x
    # USE THIS ONE FOR STRING
    #return ' '.join(x)
    
# Write function to leave hashtags, emojis, etc

## Process Text Columns into New Token Columns

In [43]:
df['review_onlywords'] = df['review'].apply(only_words)
df['title_onlywords'] = df['title'].apply(only_words)
df.tail(5)

Unnamed: 0,app_name,title,review,rating,review_onlywords,title_onlywords
5027,youtube,Wonderful,"By one concern at this time, once you open you...",4,"[one, concern, time, open, queue, see, somethi...",[wonderful]
5028,youtube,Easy on the ads !!!!,Take it easy on the ads Google ! I get it that...,1,"[take, easy, ads, google, get, free, service, ...","[easy, ads]"
5029,youtube,You tube outstanding in information dessimination,You tube is outstanding in bringing out idiffe...,5,"[outstanding, bringing, idifferent, point, vie...","[outstanding, information, dessimination]"
5030,youtube,It is great for maybe you but for me.,I live somewhere close to the mountains in Cov...,5,"[live, somewhere, close, mountains, cove, ariz...","[great, maybe]"
5031,youtube,Horrible landscape experience,Revert the horrible change you made to how com...,2,"[revert, horrible, change, made, commentsdescr...","[horrible, landscape, experience]"


## Descriptive Stats for DF Columns and Counters

In [44]:
def descriptive_stats(df_col, top_x_tokens = 20, verbose=True) :
    counter = Counter()
    df_col.map(counter.update)
    num_tokens = sum(counter.values())
    num_unique_tokens = len(counter.keys())
    num_characters=0
    for key, value in counter.items():
        char = (len(key))*value
        num_characters = num_characters + char
    lexical_diversity = num_unique_tokens/num_tokens

    if verbose :        
        print(f"There are {num_tokens} tokens in the data.")
        print(f"There are {num_unique_tokens} unique tokens in the data.")
        print(f"There are {num_characters} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")
        print(f"The twenty most common tokens are:")
        print(counter.most_common(top_x_tokens))
        
    return([num_tokens, num_unique_tokens,
            lexical_diversity,
            num_characters])

def counter_descriptive_stats(counter, top_x_tokens = 20, verbose=True):    
    #counter = Counter()
    #df_col.map(counter.update)
    num_tokens = sum(counter.values())
    num_unique_tokens = len(counter.keys())
    num_characters=0
    for key, value in counter.items():
        char = (len(key))*value
        num_characters = num_characters + char
    lexical_diversity = num_unique_tokens/num_tokens

    if verbose :        
        print(f"There are {num_tokens} tokens in the data.")
        print(f"There are {num_unique_tokens} unique tokens in the data.")
        print(f"There are {num_characters} characters in the data.")
        print(f"The lexical diversity is {lexical_diversity:.3f} in the data.")
        print(f"The twenty most common tokens are:")
        print(counter.most_common(top_x_tokens))
        
    return([num_tokens, num_unique_tokens,
            lexical_diversity,
            num_characters])

In [45]:
descriptive_stats(df['review_onlywords'], top_x_tokens = 20, verbose=True)

There are 182131 tokens in the data.
There are 11289 unique tokens in the data.
There are 1013276 characters in the data.
The lexical diversity is 0.062 in the data.
The twenty most common tokens are:
[('app', 5020), ('watch', 2594), ('like', 2069), ('shows', 1860), ('im', 1446), ('get', 1404), ('show', 1373), ('dont', 1316), ('time', 1228), ('please', 1162), ('one', 1143), ('love', 1134), ('would', 1132), ('movies', 1113), ('even', 1087), ('cant', 989), ('good', 953), ('watching', 950), ('back', 937), ('video', 932)]


[182131, 11289, 0.061982858491964575, 1013276]

In [46]:
descriptive_stats(df['title_onlywords'], top_x_tokens = 20, verbose=True)

There are 13677 tokens in the data.
There are 2551 unique tokens in the data.
There are 77084 characters in the data.
The lexical diversity is 0.187 in the data.
The twenty most common tokens are:
[('app', 747), ('great', 296), ('good', 287), ('ads', 211), ('love', 186), ('please', 184), ('content', 157), ('shows', 132), ('work', 113), ('needs', 110), ('one', 107), ('read', 103), ('fix', 99), ('cant', 99), ('buggy', 98), ('many', 95), ('update', 95), ('terrible', 93), ('problem', 89), ('streaming', 88)]


[13677, 2551, 0.186517511150106, 77084]

## Looking at emojis

In [47]:
counter = Counter()
df['review'].map(counter.update)
num=0
for key, value in counter.most_common():
    if num <10:
        if is_emoji(key) is True:
            print(key, value)
            num+=1
    else:
        break

😡 82
❤ 79
😁 56
😭 54
👍 41
😊 30
🙏 29
😃 27
🏼 27
😍 25


In [48]:
counter = Counter()
df['title'].map(counter.update)
num=0
for key, value in counter.most_common():
    if num <10:
        if is_emoji(key) is True:
            print(key, value)
            num+=1
    else:
        break

😡 28
❤ 20
😁 16
👍 14
👎 12
🤬 11
⚠ 10
⭐ 10
😍 9
🤩 8


## Compare the 5 Streaming Corpora (Mainly from Mod 3)

### Create the 5 Counters for Analysis

In [49]:
disney = df[df['app_name'] == "disney"]
disney_counter = Counter()
disney['review_onlywords'].map(disney_counter.update)
disney_num_tokens = sum(disney_counter.values())
counter_descriptive_stats(disney_counter)

There are 29985 tokens in the data.
There are 4331 unique tokens in the data.
There are 165067 characters in the data.
The lexical diversity is 0.144 in the data.
The twenty most common tokens are:
[('app', 866), ('watch', 489), ('movies', 413), ('love', 388), ('like', 385), ('shows', 343), ('im', 238), ('would', 226), ('show', 213), ('dont', 210), ('get', 193), ('movie', 192), ('really', 189), ('please', 185), ('one', 181), ('good', 175), ('great', 172), ('watching', 169), ('tv', 160), ('cant', 157)]


[29985, 4331, 0.14443888610972153, 165067]

In [50]:
hbomax = df[df['app_name'] == "hbomax"]
hbomax_counter = Counter()
hbomax['review_onlywords'].map(hbomax_counter.update)
hbomax_num_tokens = sum(hbomax_counter.values())
counter_descriptive_stats(hbomax_counter)

There are 46524 tokens in the data.
There are 4617 unique tokens in the data.
There are 265180 characters in the data.
The lexical diversity is 0.099 in the data.
The twenty most common tokens are:
[('app', 1989), ('watch', 633), ('content', 490), ('download', 399), ('time', 392), ('shows', 379), ('even', 366), ('get', 353), ('im', 339), ('like', 318), ('show', 304), ('streaming', 304), ('tv', 303), ('downloaded', 287), ('cant', 281), ('would', 260), ('one', 260), ('episode', 255), ('movies', 254), ('play', 254)]


[46524, 4617, 0.09923910239876194, 265180]

In [51]:
hulu = df[df['app_name'] == "hulu"]
hulu_counter = Counter()
hulu['review_onlywords'].map(hulu_counter.update)
hulu_num_tokens = sum(hulu_counter.values())
counter_descriptive_stats(hulu_counter)

There are 24466 tokens in the data.
There are 3092 unique tokens in the data.
There are 134173 characters in the data.
The lexical diversity is 0.126 in the data.
The twenty most common tokens are:
[('app', 768), ('watch', 480), ('ads', 479), ('show', 335), ('shows', 268), ('get', 241), ('time', 206), ('im', 204), ('like', 198), ('even', 191), ('watching', 173), ('ad', 170), ('tv', 168), ('dont', 167), ('every', 165), ('episode', 162), ('cant', 157), ('play', 155), ('back', 150), ('fix', 141)]


[24466, 3092, 0.12637946538052808, 134173]

In [52]:
netflix = df[df['app_name'] == "netflix"]
netflix_counter = Counter()
netflix['review_onlywords'].map(netflix_counter.update)
netflix_num_tokens = sum(netflix_counter.values())
counter_descriptive_stats(netflix_counter)

There are 39421 tokens in the data.
There are 4824 unique tokens in the data.
There are 213863 characters in the data.
The lexical diversity is 0.122 in the data.
The twenty most common tokens are:
[('shows', 831), ('like', 674), ('watch', 636), ('app', 629), ('please', 495), ('show', 447), ('dont', 394), ('good', 386), ('im', 370), ('movies', 367), ('love', 338), ('one', 307), ('really', 297), ('would', 292), ('get', 280), ('add', 265), ('back', 265), ('watching', 254), ('people', 244), ('want', 217)]


[39421, 4824, 0.12237132492833769, 213863]

In [53]:
youtube = df[df['app_name'] == "youtube"]
youtube_counter = Counter()
youtube['review_onlywords'].map(youtube_counter.update)
youtube_num_tokens = sum(youtube_counter.values())
counter_descriptive_stats(youtube_counter)

There are 41735 tokens in the data.
There are 5962 unique tokens in the data.
There are 234993 characters in the data.
The lexical diversity is 0.143 in the data.
The twenty most common tokens are:
[('app', 768), ('video', 728), ('videos', 620), ('like', 494), ('watch', 356), ('ads', 349), ('get', 337), ('dont', 315), ('im', 295), ('time', 261), ('one', 256), ('would', 250), ('want', 232), ('please', 231), ('see', 226), ('back', 224), ('also', 221), ('people', 216), ('even', 208), ('love', 207)]


[41735, 5962, 0.14285371989936504, 234993]

## Working with Counters: Tokens Must Appear 3 Times in Every Corpora

In [54]:
disney_counter_2 = {key:value for (key, value) in disney_counter.items() if value >= 3}
hbomax_counter_2 = {key:value for (key, value) in hbomax_counter.items() if value >= 3}
hulu_counter_2 = {key:value for (key, value) in hulu_counter.items() if value >= 3}
netflix_counter_2 = {key:value for (key, value) in netflix_counter.items() if value >= 3}
youtube_counter_2 = {key:value for (key, value) in youtube_counter.items() if value >= 3}
#final_dict = {x:counter1_2[x] for x in counter1_2 if x in counter2_2}
final_dict = {x:[disney_counter_2[x]/disney_num_tokens,
                 hbomax_counter_2[x]/hbomax_num_tokens,
                 hulu_counter_2[x]/hulu_num_tokens,
                 netflix_counter_2[x]/netflix_num_tokens,
                 youtube_counter_2[x]/youtube_num_tokens]\
              for x in disney_counter_2 if (x in hbomax_counter_2) & (x in hulu_counter_2) &\
              (x in netflix_counter_2) & (x in youtube_counter_2)}

## Compute Ratios Between All Corpora Combinations and Extract Tokens

In [55]:
compare = pd.DataFrame(final_dict, index = ['disney', 'hbomax', 'hulu', 'netflix', 'youtube'] ).T ####TRANSPOSED!!!!

# 10 Possible Comparisons
# update: changed some variables named 'new' to 'compare' (was causing an error)

compare['disney_v_hbomax'] = compare['disney']/compare['hbomax']
compare['disney_v_hulu'] = compare['disney']/compare['hulu']
compare['disney_v_netflix'] = compare['disney']/compare['netflix']
compare['disney_v_youtube'] = compare['disney']/compare['youtube']

compare['hbomax_v_hulu'] = compare['hbomax']/compare['hulu']
compare['hbomax_v_netflix'] = compare['hbomax']/compare['netflix']
compare['hbomax_v_youtube'] = compare['hbomax']/compare['youtube']

compare['hulu_v_netflix'] = compare['hulu']/compare['netflix']
compare['hulu_v_youtube'] = compare['hulu']/compare['youtube']

compare['netflix_v_youtube'] = compare['netflix']/compare['youtube']

#10 Sorted Results
disney_v_hbomax = compare.sort_values(by='disney_v_hbomax', ascending=False).index
disney_v_hulu = compare.sort_values(by='disney_v_hulu', ascending=False).index
disney_v_netflix = compare.sort_values(by='disney_v_netflix', ascending=False).index
disney_v_youtube = compare.sort_values(by='disney_v_youtube', ascending=False).index
hbomax_v_hulu = compare.sort_values(by='hbomax_v_hulu', ascending=False).index
hbomax_v_netflix = compare.sort_values(by='hbomax_v_netflix', ascending=False).index
hbomax_v_youtube = compare.sort_values(by='hbomax_v_youtube', ascending=False).index
hulu_v_netflix = compare.sort_values(by='hulu_v_netflix', ascending=False).index
hulu_v_youtube = compare.sort_values(by='hulu_v_youtube', ascending=False).index
netflix_v_youtube = compare.sort_values(by='netflix_v_youtube', ascending=False).index

disney_v_hbomax

Index(['kids', 'kid', 'house', 'thank', 'family', 'fun', 'fan', 'older', 'u',
       'recommend',
       ...
       'website', 'title', 'ads', 'connection', 'mode', 'data', 'mobile',
       'online', 'downloads', 'offline'],
      dtype='object', length=721)

In [56]:
zipped = list(zip(disney_v_hbomax, disney_v_hulu, disney_v_netflix, disney_v_youtube, 
                  hbomax_v_hulu, hbomax_v_netflix, hbomax_v_youtube,
                  hulu_v_netflix, hulu_v_youtube,
                  netflix_v_youtube))
compare2 = pd.DataFrame(zipped, columns=['disney_v_hbomax', 'disney_v_hulu', 'disney_v_netflix', 'disney_v_youtube',
                                   'hbomax_v_hulu', 'hbomax_v_netflix', 'hbomax_v_youtube',
                                   'hulu_v_netflix', 'hulu_v_youtube',
                                   'netflix_v_youtube'])
print(compare2.head(15))
compare2.tail(15)

   disney_v_hbomax disney_v_hulu disney_v_netflix disney_v_youtube  \
0             kids          best          turning         episodes   
1              kid        family          channel           movies   
2            house         house            short            movie   
3            thank           fan             slow          profile   
4           family       amazing           create           series   
5              fun           kid          freezes              fan   
6              fan         world              tap        streaming   
7            older          kids             load         password   
8                u        thanks              job            shows   
9        recommend       believe              fun            email   
10       companies           big         friendly          casting   
11         channel         thank            speed        subtitles   
12         casting        movies          casting             kids   
13         missing  

Unnamed: 0,disney_v_hbomax,disney_v_hulu,disney_v_netflix,disney_v_youtube,hbomax_v_hulu,hbomax_v_netflix,hbomax_v_youtube,hulu_v_netflix,hulu_v_youtube,netflix_v_youtube
706,constant,seconds,awful,removed,normal,section,scroll,family,people,regular
707,stops,audio,share,website,history,thank,thank,ask,small,current
708,awful,starts,lately,previous,ridiculous,bring,interest,added,found,google
709,downloaded,waste,away,subscribe,pls,kid,seeing,u,bring,tap
710,save,close,section,removing,gonna,ones,higher,thanks,features,short
711,website,minutes,honestly,mode,volume,rated,recommendations,example,scrolling,save
712,title,ridiculous,yall,mobile,tired,taking,video,ones,best,subscribe
713,ads,sound,pls,history,casting,guys,adds,removed,information,button
714,connection,rewatch,mobile,save,per,add,side,friends,google,information
715,mode,mobile,ads,information,sound,removing,u,house,world,scroll


In [65]:
df

Unnamed: 0,app_name,title,review,rating,review_onlywords,title_onlywords
0,disney,MISSING many shows (and languages),I grew up watching many a Disney shows. Signed...,3,"[grew, watching, many, shows, signed, availabl...","[missing, many, shows, languages]"
1,disney,really quite upset,pretty sure this was deliberate. my old iphone...,1,"[pretty, sure, deliberate, old, iphone, basica...","[really, quite, upset]"
2,disney,Probably the worst streaming app.,We have just about every streaming app you can...,3,"[every, streaming, app, think, much, want, one...","[probably, worst, streaming, app]"
3,disney,Love it but could add more stuff,I live Disney! I grew up with it my whole life...,4,"[live, grew, whole, life, dont, know, app, tv,...","[love, could, add, stuff]"
4,disney,Frequently causes issues with connectivity,App works great through non-iOS devices but lo...,2,"[app, works, great, nonios, devices, locks, ev...","[frequently, causes, issues, connectivity]"
...,...,...,...,...,...,...
5027,youtube,Wonderful,"By one concern at this time, once you open you...",4,"[one, concern, time, open, queue, see, somethi...",[wonderful]
5028,youtube,Easy on the ads !!!!,Take it easy on the ads Google ! I get it that...,1,"[take, easy, ads, google, get, free, service, ...","[easy, ads]"
5029,youtube,You tube outstanding in information dessimination,You tube is outstanding in bringing out idiffe...,5,"[outstanding, bringing, idifferent, point, vie...","[outstanding, information, dessimination]"
5030,youtube,It is great for maybe you but for me.,I live somewhere close to the mountains in Cov...,5,"[live, somewhere, close, mountains, cove, ariz...","[great, maybe]"


## Working with DF and Lists: Naive Bayes

In [57]:
shortdb = pd.DataFrame(df, columns = ['review', 'app_name'])
shortlist = shortdb.values.tolist()
shortlist = [[t.translate(str.maketrans('', '', string.punctuation)),p] for t, p in shortlist]
shortlist = [[t.lower(),p] for t, p in shortlist]
shortlist[0:5]

[['i grew up watching many a disney shows signed up for disney most of them are not available the ones that are not available in my language or a decent selection at all ebglish and spanish only reallythere are official dubs that were aired on public television decades ago already and disney only has english for most of the shows if they are even on there to watch at least they finally have the clone wars magnificent animation as always tartakovsky is magnificent but i had to wait for that as well the app will crashfreeze if you hit the rewind button too often not for people that like watching specific scenes again and again and there is no option to either loop a video yes i would like to have biomes and vehicle on in the background while i worksleep or to make it stop asking “are you still there” so at some point it will stop playing and if you have loud neighbors throwing on dinsney to tune them out whilst you sleep is not an option the quality of the videos themselves is top notch 

## Introduce a Word Cut-Off

In [58]:
##### INTRODUCE A WORD CUT-OFF
word_cutoff = 5

tokens = [w for t, p in shortlist for w in t.split()]
print(tokens[0:5])

word_dist = nltk.FreqDist(tokens)

#for w, c in word_dist.items():
#    print(w, c)

feature_words = set()

for word, count in word_dist.items() :
    if count > word_cutoff :
        feature_words.add(word)
        
print(f"With a word cutoff of {word_cutoff}, we have {len(feature_words)} features in the model.")

['i', 'grew', 'up', 'watching', 'many']
With a word cutoff of 5, we have 2959 features in the model.


## Preprocessing

In [59]:
# Your code here
# ret_dict = dict()
# tokenize each piece of text into text_tokens
# for token in text_tokens if in fw:
# add token and True to a dictionary
# return(ret_dict)

def conv_features(review,fw):
    #ret_list = []
    ret_dict = dict()
    #text = text.translate(str.maketrans('', '', string.punctuation))
    #text = text.lower()
    review = [w for w in review.split() if w not in (set(sw))]
    
    for token in fw:
        if token in review:
            ret_dict.update({token: 1})
        else:
            ret_dict.update({token: 0})

    return(ret_dict)

In [60]:
featuresets = [(conv_features(review,feature_words), app_name) for (review, app_name) in shortlist]

In [61]:
#random.seed(20220507)
random.seed(84)
random.shuffle(featuresets)

test_size = 500

# Classifiers

## Classifier 1: NB Model

In [97]:
test_set, train_set = featuresets[:test_size], featuresets[test_size:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(sorted(classifier.labels()))
#print(classifier.labels())
print(nltk.classify.accuracy(classifier, test_set))

['disney', 'hbomax', 'hulu', 'netflix', 'youtube']
0.746


In [98]:
classifier.show_most_informative_features(100)

Most Informative Features
                 episode = 1              hbomax : youtub =     93.8 : 1.0
                  season = 1              netfli : youtub =     70.9 : 1.0
                     ads = 1                hulu : disney =     64.9 : 1.0
                  videos = 1              youtub : netfli =     57.6 : 1.0
                 seasons = 1              netfli : youtub =     56.5 : 1.0
                   buggy = 1              hbomax : youtub =     56.1 : 1.0
             commercials = 1                hulu : disney =     54.9 : 1.0
                comments = 1              youtub : hulu   =     53.8 : 1.0
                   music = 1              youtub : hulu   =     41.5 : 1.0
                  minute = 1                hulu : netfli =     38.0 : 1.0
                 comment = 1              youtub : hulu   =     35.7 : 1.0
                children = 1              disney : hulu   =     34.2 : 1.0
                  flight = 1              hbomax : netfli =     32.2 : 1.0

In [None]:
# dt_classifier = nltk.DecisionTreeClassifier.train(train_set)
# print(sorted(dt_classifier.labels()))
# #print(classifier.labels())
# print(nltk.classify.accuracy(dt_classifier, test_set))

## Classifier 2: SklearnClassifiers: LinearSVC (SVM), BernoulliNB,  LogisticRegression, SGD, NuSVC

In [105]:

svc_classif = SklearnClassifier(LinearSVC()).train(train_set)
print(sorted(svc_classif.labels()))
print(nltk.classify.accuracy(svc_classif, test_set))




['disney', 'hbomax', 'hulu', 'netflix', 'youtube']
0.682


In [109]:
bnb_classif = SklearnClassifier(BernoulliNB()).train(train_set)
print(sorted(bnb_classif.labels()))
print(nltk.classify.accuracy(bnb_classif, test_set))

['disney', 'hbomax', 'hulu', 'netflix', 'youtube']
0.732


In [110]:
logreg_classif = SklearnClassifier(LogisticRegression()).train(train_set)
print(sorted(logreg_classif.labels()))
print(nltk.classify.accuracy(logreg_classif, test_set))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['disney', 'hbomax', 'hulu', 'netflix', 'youtube']
0.694


In [111]:
sgd_classif = SklearnClassifier(SGDClassifier()).train(train_set)
print(sorted(sgd_classif.labels()))
print(nltk.classify.accuracy(sgd_classif, test_set))

['disney', 'hbomax', 'hulu', 'netflix', 'youtube']
0.678


In [112]:
nusvc_classif = SklearnClassifier(NuSVC()).train(train_set)
print(sorted(nusvc_classif.labels()))
print(nltk.classify.accuracy(nusvc_classif, test_set))

['disney', 'hbomax', 'hulu', 'netflix', 'youtube']
0.716


## Classifier 3: Maxent Classifier with GIS

In [102]:
test_set, train_set = featuresets[:test_size], featuresets[test_size:]

# ALGORITHMS = ['GIS', 'IIS', 'MEGAM', 'TADM']

# GIS = Generalized Iterative Scaling

maxent_classifier = nltk.MaxentClassifier.train(train_set, 
                                                algorithm='GIS', 
                                                max_iter=3)

print(sorted(maxent_classifier.labels()))

print(nltk.classify.accuracy(maxent_classifier, test_set))

  ==> Training (3 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.60944        0.206
             2          -1.60569        0.206
         Final          -1.60225        0.208
['disney', 'hbomax', 'hulu', 'netflix', 'youtube']
0.168


In [103]:
maxent_classifier.show_most_informative_features(100)

  -0.004 episode==1 and label is 'youtube'
  -0.003 season==1 and label is 'youtube'
  -0.003 ads==1 and label is 'disney'
  -0.003 commercials==1 and label is 'disney'
  -0.003 minute==1 and label is 'netflix'
  -0.003 seasons==1 and label is 'youtube'
  -0.003 buggy==1 and label is 'youtube'
  -0.003 comments==1 and label is 'hulu'
  -0.003 comments==1 and label is 'disney'
  -0.003 episodes==1 and label is 'youtube'
  -0.003 movie==1 and label is 'youtube'
  -0.003 videos==1 and label is 'netflix'
  -0.003 rid==1 and label is 'disney'
  -0.003 music==1 and label is 'hulu'
  -0.003 children==1 and label is 'hulu'
  -0.003 viewing==1 and label is 'disney'
  -0.003 anime==1 and label is 'disney'
  -0.003 removing==1 and label is 'disney'
  -0.003 streaming==1 and label is 'youtube'
  -0.002 comment==1 and label is 'hulu'
  -0.002 series==1 and label is 'youtube'
  -0.002 together==1 and label is 'hulu'
  -0.002 taken==1 and label is 'hulu'
  -0.002 paid==1 and label is 'netflix'
  -0.0

# Topic Modeling

## 1) Topic Modeling: Non-Negative Matrix Factorization (NMF) Model with CountVectorizer


In [79]:
# This function comes from the BTAP repo.

def display_topics(model, features, no_top_words=5):
    for topic, words in enumerate(model.components_):
        total = words.sum()
        largest = words.argsort()[::-1] # invert sort order
        print("\nTopic %02d" % topic)
        for i in range(0, no_top_words):
            print("  %s (%2.2f)" % (features[largest[i]], abs(words[largest[i]]*100.0/total)))

In [80]:
# Count Vectorizer

count_text_vectorizer = CountVectorizer(stop_words=stopwords, min_df=5, max_df=0.7)
count_text_vectors = count_text_vectorizer.fit_transform(df["review"])
count_text_vectors.shape



(5032, 2828)

In [81]:
feat_names = count_text_vectorizer.get_feature_names_out()

df_count_text = pd.DataFrame.sparse.from_spmatrix(count_text_vectors, columns=feat_names)

df_count_text

Unnamed: 0,00,000,10,100,1000,1080p,11,12,13,14,...,yesterday,young,younger,youtube,youtuber,youtubers,yt,zero,zombies,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5027,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5028,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5029,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5030,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [82]:
nmf_count_text_model = NMF(n_components=5, random_state=314)

W_text_matrix = nmf_count_text_model.fit_transform(count_text_vectors)

H_text_matrix = nmf_count_text_model.components_



In [83]:
display_topics(nmf_count_text_model, count_text_vectorizer.get_feature_names_out())


Topic 00
  app (11.07)
  hbo (1.42)
  content (1.25)
  download (1.14)
  time (1.11)

Topic 01
  shows (8.93)
  netflix (8.64)
  movies (3.21)
  good (2.41)
  watch (2.28)

Topic 02
  watch (6.86)
  ads (3.21)
  video (2.85)
  videos (1.98)
  time (1.40)

Topic 03
  like (12.13)
  don (3.20)
  people (1.41)
  good (1.34)
  know (1.31)

Topic 04
  disney (13.85)
  movies (4.45)
  love (4.44)
  watch (2.01)
  plus (1.63)


## 2) Topic Modeling: Non-Negative Matrix Factorization (NMF) Model with TF-IDF Vectorizer


In [84]:
# TF-IDF  Vectorizer

tfidf_text_vectorizer = TfidfVectorizer(stop_words=stopwords, min_df=5, max_df=0.7)
tfidf_text_vectors = tfidf_text_vectorizer.fit_transform(df["review"])
tfidf_text_vectors.shape



(5032, 2828)

In [86]:
feat_names = tfidf_text_vectorizer.get_feature_names_out()

df_tfidf_text = pd.DataFrame.sparse.from_spmatrix(tfidf_text_vectors, columns=feat_names)

df_tfidf_text

Unnamed: 0,00,000,10,100,1000,1080p,11,12,13,14,...,yesterday,young,younger,youtube,youtuber,youtubers,yt,zero,zombies,zoom
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.084982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5027,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
5028,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
5029,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
5030,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
nmf_tfidf_text_model = NMF(n_components=5, random_state=314)

W_text_matrix = nmf_tfidf_text_model.fit_transform(tfidf_text_vectors)

H_text_matrix = nmf_tfidf_text_model.components_



In [88]:
display_topics(nmf_tfidf_text_model, tfidf_text_vectorizer.get_feature_names_out())


Topic 00
  app (2.99)
  download (1.02)
  hbo (0.90)
  time (0.88)
  content (0.87)

Topic 01
  netflix (3.50)
  shows (3.15)
  like (2.21)
  movies (1.64)
  watch (1.63)

Topic 02
  ads (12.39)
  ad (3.10)
  watch (2.31)
  minutes (2.01)
  minute (1.88)

Topic 03
  disney (9.29)
  love (3.42)
  movies (3.01)
  plus (2.19)
  watch (1.30)

Topic 04
  video (3.57)
  videos (2.66)
  like (1.01)
  screen (0.92)
  want (0.81)


## 3) Topic Modeling: Latent Semantic Analysis (LSA) Model with TF-IDF Vectorizer


In [93]:
# LSA Model

svd_para_model = TruncatedSVD(n_components = 5, random_state=314)
W_svd_para_matrix = svd_para_model.fit_transform(tfidf_text_vectors)
H_svd_para_matrix = svd_para_model.components_

In [94]:
display_topics(svd_para_model, tfidf_text_vectorizer.get_feature_names_out())


Topic 00
  app (1.60)
  watch (1.11)
  shows (0.96)
  like (0.83)
  netflix (0.76)

Topic 01
  disney (11.34)
  movies (8.53)
  shows (8.36)
  netflix (8.08)
  love (6.91)

Topic 02
  ads (1586.02)
  ad (443.32)
  watch (260.05)
  video (246.33)
  minute (225.53)

Topic 03
  disney (32.05)
  ads (10.54)
  plus (7.46)
  love (6.25)
  movies (4.35)

Topic 04
  video (6.98)
  videos (5.69)
  like (2.47)
  screen (1.77)
  button (1.56)


## 4) Topic Modeling: Latent Dirichlet Allocation (LDA) Model with TF-IDF Vectorizer


In [95]:
# LDA Model

lda_para_model = LatentDirichletAllocation(n_components = 5, random_state=314)
W_lda_para_matrix = lda_para_model.fit_transform(count_text_vectors)
H_lda_para_matrix = lda_para_model.components_

In [96]:
display_topics(lda_para_model, count_text_vectorizer.get_feature_names_out())


Topic 00
  app (5.65)
  watch (1.39)
  hbo (1.30)
  content (1.21)
  download (1.19)

Topic 01
  ads (4.86)
  watch (2.47)
  like (1.69)
  ad (1.64)
  video (1.37)

Topic 02
  netflix (5.16)
  shows (3.91)
  like (3.04)
  watch (2.51)
  good (1.87)

Topic 03
  app (4.72)
  phone (1.53)
  video (1.51)
  watch (1.44)
  fix (1.32)

Topic 04
  disney (5.03)
  love (2.79)
  like (2.16)
  app (1.97)
  movies (1.95)
