A simple Naive Bayes model

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Empty frame with one column for the sentence text and one for its twss label.
column_names = ['text', 'twss']
df = pd.DataFrame(columns=column_names)

In [4]:
# Show the frame: at this point it has the two columns but no rows.
df

Unnamed: 0,text,twss


In [5]:
# Load the positive examples: one sentence per line, each labelled twss=1.
# All rows are built first and concatenated once; DataFrame.append copied the
# whole frame on every call and no longer exists in pandas 2.0+.
with open('data/twss-stories-parsed.txt') as f:
    twss_sentences = [line.strip() for line in f]

twss_rows = pd.DataFrame(
    {'text': twss_sentences, 'twss': [1] * len(twss_sentences)},
    dtype=object,  # keep the label column as object dtype, as the recorded describe() output shows
)
df = pd.concat([df, twss_rows], ignore_index=True)

In [6]:
# Inspect the frame after loading the sentences labelled twss=1.
df

Unnamed: 0,text,twss
0,you have to unhinge your jaw to get it all in,1
1,i want to go down i want to go down,1
2,you need to get wet,1
3,it's twice as long as it is wide,1
4,i've never been this wet before,1
...,...,...
2086,just close your eyes and shove it in your mout...,1
2087,it was coming at my face,1
2088,will you be wet forever,1
2089,i like the long thick black ones,1


In [7]:
# Load negative examples from the fmylife file: one sentence per line, labelled twss=0.
# All rows are built first and concatenated once; DataFrame.append copied the
# whole frame on every call and no longer exists in pandas 2.0+.
with open('data/fmylife-parsed.txt') as f:
    fml_sentences = [line.strip() for line in f]

fml_rows = pd.DataFrame(
    {'text': fml_sentences, 'twss': [0] * len(fml_sentences)},
    dtype=object,  # keep the label column as object dtype, as the recorded describe() output shows
)
df = pd.concat([df, fml_rows], ignore_index=True)

In [8]:
# Inspect the frame after adding the fmylife sentences labelled twss=0.
df

Unnamed: 0,text,twss
0,you have to unhinge your jaw to get it all in,1
1,i want to go down i want to go down,1
2,you need to get wet,1
3,it's twice as long as it is wide,1
4,i've never been this wet before,1
...,...,...
5437,why am i annoyed at him because he wants to go...,0
5438,the only person who wished me a happy birthday...,0
5439,my doctor told me my asthma was being triggere...,0
5440,he wrote a note to my dad asking him to refrai...,0


In [35]:
# Load more negative examples from the texts-from-last-night file: one sentence
# per line, labelled twss=0.
# All rows are built first and concatenated once; DataFrame.append copied the
# whole frame on every call and no longer exists in pandas 2.0+.
with open('data/texts-from-last-night-parsed.txt') as f:
    tfln_sentences = [line.strip() for line in f]

tfln_rows = pd.DataFrame(
    {'text': tfln_sentences, 'twss': [0] * len(tfln_sentences)},
    dtype=object,  # keep the label column as object dtype, as the recorded describe() output shows
)
df = pd.concat([df, tfln_rows], ignore_index=True)

In [36]:
# Class balance: how many sentences carry each twss label.
twss_labels = df['twss']
twss_labels.value_counts()

0    5796
1    2091
Name: twss, dtype: int64

In [37]:
# Summary of both columns (count / unique / top / freq in the recorded output).
df.describe()

Unnamed: 0,text,twss
count,7887,7887
unique,7846,2
top,its my birthday,0
freq,3,5796


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [38]:
# Features are the raw sentences; the target is the twss label column.
X, y = df['text'], df['twss']

In [39]:
 X_train, X_test, y_train, y_test = train_test_split(
     X,
     y,
     test_size=0.3,   # hold out 30% of the rows for evaluation
     random_state=0,  # fixed seed so the split is repeatable
 )

In [40]:
# Sanity check: sizes of the four pieces produced by the split.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((5520,), (2367,), (5520,), (2367,))

### Saving model

In [87]:
import joblib

In [88]:
# Persist the classifier to 'twss_model.sav' with joblib.
# NOTE(review): `clf` is not assigned in any cell visible above this point, and this
# cell's execution count (88) is out of sequence with its neighbours. It looks like a
# leftover from an earlier session and will likely raise NameError on
# Restart & Run All. TODO confirm, then move or remove.
joblib.dump(clf, 'twss_model.sav')

['twss_model.sav']

In [89]:
# Read the model back from 'twss_model.sav' into a separate name.
clf_loaded = joblib.load('twss_model.sav')

In [90]:
# Classify two new sentences with the reloaded model.
# NOTE(review): `count_vect` and `tfidf_transformer` are not defined in any visible
# cell, so this cell depends on hidden kernel state. TODO confirm where they came from.
docs_new = ['It\'s hard.', 'they will be held against you']
X_new_counts = count_vect.transform(docs_new)  # presumably token counts, judging by the name
X_new_tfidf = tfidf_transformer.transform(X_new_counts)  # presumably tf-idf weights, judging by the name

# The result is stored in `predicted` but not displayed by this cell.
predicted = clf_loaded.predict(X_new_tfidf)

### Applying components through pipelines

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [41]:
# Chain vectorising, tf-idf weighting and the Naive Bayes classifier into one
# estimator so that raw strings can be passed straight to fit/predict.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(pipeline_steps)

In [42]:
# Cast the labels to int before fitting (presumably because the label column is
# stored as object dtype -- TODO confirm).
y_train_int = y_train.astype(int)
text_clf.fit(X_train, y_train_int)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [43]:
import joblib

In [44]:
# Persist the fitted pipeline to disk.
joblib.dump(value=text_clf, filename='twss_model.sav')

['twss_model.sav']

In [45]:
# Reload the pipeline from disk under a separate name.
twss_model_loaded = joblib.load(filename='twss_model.sav')

### Prediction

In [46]:
# Smoke test: classify a single sentence and show its predicted label.
sample_sentences = ['Hello there']
twss_model_loaded.predict(sample_sentences)[0]

0

In [48]:
# Predict a label for every held-out sentence.
y_predicted = text_clf.predict(X_test)

In [49]:
# Print every held-out sentence the model predicted as 1, followed by its true label.
for sentence, true_label, predicted_label in zip(X_test, y_test, y_predicted):
    if predicted_label == 1:
        print(sentence, true_label)

hard you just have to put it in the right place 1
why oh why did i suck thise tits 0
its so thick i cant get my hands around it 1
ughh it hurts when i swallow 1
oh no its leaking white stuff 1
i rarely go in there 0
here put it in my hands and ill lick it from the bottom for you 1
well i would need to wet it down before i slide down it 1
oh yeah im fine i just had something stuck in my throat 1
oh oh my god im blowing all these guys 1
wow you got me all wet and sticky 1
i cant wait to shove that foot-long in my mouth 1
it went right in my mouth ew ew ew 1
its a good three inches deep you should be able to fit a nice big sized rod in there 1
i cant its too tight 1
come on put your mouth on it suck it suck it kathy 1
can we finish this orally my hand is starting to hurt 1
oh it feels so good you just insert here and squeeze 1
i think i can squeeze you in 1
i cant stretch it over the thingy 1
wow i didnt know what to expect and then it just came on my face 1
ive never done one this big be

In [50]:
# Accuracy on the held-out set: fraction of predictions equal to the true label.
is_correct = y_predicted == y_test
is_correct.mean()

0.9091677228559358