In [25]:
import zipfile

In [26]:
# Unpack the SMS spam collection archive into ./data.
with zipfile.ZipFile('./data/smsspamcollection.zip', mode='r') as archive:
    archive.extractall(path='./data')

In [27]:
import pandas as pd
# Tab-separated file read with header=None, so pandas assigns integer
# column labels until they are renamed.
sms = pd.read_csv(
    "./data/SMSSpamCollection",
    sep='\t',
    header=None,
)

In [28]:
# First column holds the ham/spam label, second the raw message text.
sms.columns = ['spam', 'message']
sms.head()

Unnamed: 0,spam,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [29]:
# Lower-case every message so the token counts below are case-insensitive.
docs_lower = [message.lower() for message in sms['message']]

In [30]:
def count_word(word, sentence):
    """Return how many whitespace-delimited tokens of ``sentence`` equal ``word``.

    The comparison is exact and case-sensitive, so a token with attached
    punctuation (e.g. ``"free!"``) does not count as ``"free"``.
    """
    return sentence.split().count(word)

# Feature: number of exact 'free' tokens in each lower-cased message.
free_counts = [count_word('free', doc) for doc in docs_lower]
df = pd.DataFrame({'free': free_counts})

In [31]:
import re
def count_numbers(sentence):
    """Return the number of ASCII digit characters (0-9) found in ``sentence``.

    Each digit is counted individually, so ``"123"`` contributes 3.
    """
    return sum(1 for _ in re.finditer('[0-9]', sentence))

# Feature: count of digit characters in each message.
df['num_char'] = list(map(count_numbers, docs_lower))


In [32]:
# Preview the two hand-crafted features ('free' count and digit count).
df.head()

Unnamed: 0,free,num_char
0,0,0
1,0,0
2,1,25
3,0,0
4,0,0


In [33]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [34]:
def split_fit_eval(X, y, model=None,epochs=10,random_state=0):
    """Split the data, fit a Keras model on the train part and score it on the test part.

    Parameters
    ----------
    X : array-like, 2-D
        Feature matrix; ``X.shape[1]`` sizes the input of the default model.
    y : array-like
        Targets aligned with the rows of ``X``. The default model uses a
        sigmoid output with binary cross-entropy, so 0/1 labels are expected.
    model : Keras model, optional
        Model to train. It must already be compiled, and with exactly one
        metric so that ``evaluate`` returns ``(loss, metric)``. When omitted,
        a single-unit sigmoid ``Dense`` layer (logistic regression) is built
        and compiled with the Adam optimizer and an accuracy metric.
    epochs : int
        Number of training epochs.
    random_state : int
        Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(loss, acc, model, h)``: test loss, test metric, the fitted model
        and the object returned by ``model.fit``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)
    # Compare against None explicitly: only a missing model triggers the default.
    if model is None:
        model = Sequential()
        model.add(Dense(1, input_dim=X.shape[1],activation='sigmoid'))
        model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    # Training and evaluation run for BOTH the default and a caller-supplied
    # model. Previously they were nested in the branch above, so passing a
    # model skipped training and the function returned None.
    h = model.fit(X_train, y_train,epochs=epochs,verbose=0)
    loss, acc = model.evaluate(X_test, y_test)
    return loss, acc, model, h

In [37]:
# Binary target: 1 where the label is 'spam', 0 for everything else.
y = sms['spam'].eq('spam').astype('int64')
# Fit the default model on the two hand-crafted features.
res = split_fit_eval(df.values, y)



In [38]:
# res is (loss, acc, model, h); index 1 is the test-set accuracy.
print("Simple model accuracy: {:0.3f}".format(res[1]))

Simple model accuracy: 0.974


In [39]:
from sklearn.dummy import DummyClassifier
# Majority-class baseline for comparison.
# NOTE(review): this is fitted and scored on the full data set, while
# split_fit_eval reports accuracy on a held-out split, so the two numbers
# are not computed on the same rows.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(df.values, y)
dummy_clf.score(df.values, y)

0.8659368269921034

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: keep at most `vocab_size` terms, lower-cased,
# with English stop words removed.
vocab_size = 3000
vect = CountVectorizer(
    max_features=vocab_size,
    lowercase=True,
    stop_words='english',
    decode_error='ignore',
)
X = vect.fit_transform(sms['message'])
X

<5572x3000 sparse matrix of type '<class 'numpy.int64'>'
	with 37142 stored elements in Compressed Sparse Row format>

In [41]:
# Dense copy of the sparse count matrix. `toarray()` returns a plain
# ndarray, whereas `todense()` returns the legacy `np.matrix` type.
Xd = X.toarray()
# `get_feature_names()` was removed in scikit-learn 1.2. Prefer its
# replacement and fall back only on old releases that lack it.
# `.tolist()` keeps `vocab` a plain list of Python strings.
if hasattr(vect, 'get_feature_names_out'):
    vocab = vect.get_feature_names_out().tolist()
else:
    vocab = vect.get_feature_names()
vocab[:10]

['00',
 '000',
 '02',
 '0207',
 '02073162414',
 '03',
 '04',
 '05',
 '06',
 '07123456789']

In [42]:
# Last ten vocabulary entries.
vocab[-10:]

['yogasana', 'yor', 'yr', 'yrs', 'yummy', 'yun', 'yunny', 'yuo', 'yup', 'zed']

In [43]:
# Refit the default model on the dense bag-of-words matrix; this rebinds
# `res`, replacing the result from the two-feature model.
res = split_fit_eval(Xd, y)
print("Test set accuracy:\t{:0.3f}".format(res[1]))

Test set accuracy:	0.978


In [44]:
# res[2] is the fitted model returned by split_fit_eval.
model = res[2]
# First weight array, flattened to 1-D. Presumably the Dense kernel with
# one weight per vocabulary term (TODO confirm against the model summary).
w_ = model.get_weights()[0].reshape(-1)
vocab_weights = pd.Series(data=w_, index=vocab)
# The 20 terms with the largest weights.
vocab_weights.sort_values(ascending=False).iloc[:20]

txt         0.546782
claim       0.544486
www         0.539658
mobile      0.507972
uk          0.496187
150p        0.490551
prize       0.486169
stop        0.474600
free        0.448031
18          0.439172
reply       0.439141
urgent      0.424959
service     0.424378
won         0.406233
landline    0.404342
50          0.392148
16          0.381788
video       0.377617
cash        0.374465
text        0.369698
dtype: float32

In [45]:
# The 20 terms with the smallest (most negative) weights, still in
# descending order, so the most negative one is shown last.
vocab_weights.sort_values(ascending=False).tail(20)

sure    -0.438008
lt      -0.441792
fine    -0.443365
gt      -0.455829
doing   -0.462819
lor     -0.476845
home    -0.479585
like    -0.480804
got     -0.489728
did     -0.493081
later   -0.499145
lol     -0.503934
oh      -0.505780
going   -0.510569
da      -0.522192
come    -0.524024
good    -0.527056
sorry   -0.536195
ok      -0.607755
ll      -0.630998
dtype: float32

In [49]:
# Load the two Rotten Tomatoes tables: per-movie metadata and per-critic reviews.
movies = pd.read_csv('./data/rotten_tomatoes_movies.csv')
reviews = pd.read_csv('./data/rotten_tomatoes_critic_reviews.csv')
movies.head()

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,...,Sony Pictures Classics,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19
2,m/10,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,...,Waner Bros.,Fresh,67.0,24.0,Spilled,53.0,14684.0,2,16,8
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,...,Criterion Collection,Certified-Fresh,100.0,54.0,Upright,97.0,105386.0,6,54,0
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,...,Disney,Fresh,89.0,27.0,Upright,74.0,68918.0,5,24,3


In [50]:
# Preview the critic reviews table.
reviews.head()

Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content
0,m/0814255,Andrew L. Urban,False,Urban Cinefile,Fresh,,2010-02-06,A fantasy adventure that fuses Greek mythology...
1,m/0814255,Louise Keller,False,Urban Cinefile,Fresh,,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,m/0814255,,False,FILMINK (Australia),Fresh,,2010-02-09,With a top-notch cast and dazzling special eff...
3,m/0814255,Ben McEachen,False,Sunday Mail (Australia),Fresh,3.5/5,2010-02-09,Whether audiences will get behind The Lightnin...
4,m/0814255,Ethan Alter,True,Hollywood Reporter,Rotten,,2010-02-10,What's really lacking in The Lightning Thief i...


In [52]:
# Keep only the movie link, the review text and the Fresh/Rotten label.
# Note: this rebinds `df`, which previously held the SMS feature table.
df = reviews.loc[:, ['rotten_tomatoes_link', 'review_content', 'review_type']]
df.head()

Unnamed: 0,rotten_tomatoes_link,review_content,review_type
0,m/0814255,A fantasy adventure that fuses Greek mythology...,Fresh
1,m/0814255,"Uma Thurman as Medusa, the gorgon with a coiff...",Fresh
2,m/0814255,With a top-notch cast and dazzling special eff...,Fresh
3,m/0814255,Whether audiences will get behind The Lightnin...,Fresh
4,m/0814255,What's really lacking in The Lightning Thief i...,Rotten


In [54]:
# Share of each review type; the denominator is the total row count of df.
df['review_type'].value_counts() / len(df)

Fresh     0.637344
Rotten    0.362656
Name: review_type, dtype: float64