In [1]:
import pandas as pd
from sqlalchemy import create_engine

# `USING OWN CREATED MODEL`

In [2]:
engine = create_engine("postgresql://root:root@localhost:5432/general")

In [3]:
not_nsfw = pd.read_sql(sql="select title,is_nsfw from reddit_data.hot_posts where not is_nsfw order by random() limit 100000;",con=engine)
nsfw = pd.read_sql(sql="select title,is_nsfw from reddit_data.hot_posts where is_nsfw order by random() limit 100000;",con=engine)

In [4]:
df = pd.concat([nsfw,not_nsfw]).reset_index(drop=True)

In [5]:
df = df.sample(frac=1).reset_index(drop=True)

In [6]:
df

Unnamed: 0,title,is_nsfw
0,1938 home husband inherited,False
1,Always wearing this it makes me ready to be used,True
2,Am I petite and cute enough for you,True
3,They are waiting for you,True
4,Are you ready to eat every single toe?,True
...,...,...
199995,God's inclusion of stories in the Bible is fra...,False
199996,SAS Soldiers fighting Iranian agents in secret...,False
199997,Do you like my cock?,True
199998,My mom’s plant decor,False


In [7]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['is_nsfw'])

In [8]:
df.head()

Unnamed: 0,title,is_nsfw,label
0,1938 home husband inherited,False,0
1,Always wearing this it makes me ready to be used,True,1
2,Am I petite and cute enough for you,True,1
3,They are waiting for you,True,1
4,Are you ready to eat every single toe?,True,1


In [9]:
X = df.title
y = df.label

In [10]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [11]:
X_train.head()


60994                               This guy seems like fun
83152            How do I remove the date from at a glance?
34558                                    23 wanna compare ?
100476                Rachel Aldana vs Leanne Crow Timeline
134092    I wish I were cute and feminine and could pass...
Name: title, dtype: object

## Training

In [12]:
X_train= X_train.apply(lambda x:x.lower())

In [13]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import string

In [14]:
### tokenize the sentence and remove punctuations

def tokenize(sent):
    sent = nltk.word_tokenize(sent)
    sent_mod = []
    for i in sent:
        sent_mod.append(i)
    return sent_mod

X_train = X_train.apply(tokenize)

In [15]:
### Removing Stopwords,Punctuation 

stopword = stopwords.words('english')

def remove_stopwords(sent):
    sent = [i for i in sent if i not in stopword and i not in string.punctuation]
    return sent

X_train = X_train.apply(remove_stopwords)

In [16]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else :
        return wordnet.NOUN

In [17]:
### Applying stemming - using Lemmatizer

wnl = WordNetLemmatizer()

def lemmatiser(sent):
    words_and_tags = nltk.pos_tag(sent)
    final_sent = []
    for word,tags in words_and_tags:
        tag = get_wordnet_pos(tags)
        word_lemma  = wnl.lemmatize(word,pos=tag)
        final_sent.append(word_lemma)
    final_sent = ' '.join(final_sent)
    return final_sent.strip()

X_train = X_train.apply(lemmatiser)

In [18]:
import re

In [19]:
from gensim.models import Word2Vec,KeyedVectors

text_gensim = X_train.to_list()
text_gensim = [re.sub(r'\s+',' ',i) for i in text_gensim]
text_gensim = [nltk.word_tokenize(i) for i in text_gensim]

In [20]:
len(text_gensim)

160000

In [21]:
model = Word2Vec(sentences=text_gensim)
word_vectors = model.wv

In [22]:
da  = pd.DataFrame({'x':text_gensim,'y':y_train})

In [23]:
da['len_x'] = da.x.apply(lambda x:len(x))

In [25]:
da = da[da.len_x!=0]

In [27]:
da

Unnamed: 0,x,y,len_x
60994,"[guy, seem, like, fun]",0,4
83152,"[remove, date, glance]",0,3
34558,"[23, wan, na, compare]",1,4
100476,"[rachel, aldana, v, leanne, crow, timeline]",1,6
134092,"[wish, cute, feminine, could, pass, girl]",0,6
...,...,...,...
183003,"[’, small]",1,2
192220,[oc],0,1
128314,"[make, stand, headset]",0,3
146181,"[boob, mom, 2]",1,3


In [29]:
def get_x_vectorized(text_token):
    return word_vectors.get_mean_vector(text_token)


In [31]:
text_gensim_vec = [get_x_vectorized(i) for i in da.x]

In [32]:
da['x_vec'] = text_gensim_vec

In [33]:
da

Unnamed: 0,x,y,len_x,x_vec
60994,"[guy, seem, like, fun]",0,4,"[0.029610397, 0.049926274, 0.029803107, 0.0688..."
83152,"[remove, date, glance]",0,3,"[-0.060810313, 0.08946868, -0.050391585, 0.103..."
34558,"[23, wan, na, compare]",1,4,"[0.036115963, 0.06273771, 0.04323062, -0.09034..."
100476,"[rachel, aldana, v, leanne, crow, timeline]",1,6,"[-0.08395818, 0.13880354, 0.009312073, 0.01044..."
134092,"[wish, cute, feminine, could, pass, girl]",0,6,"[0.032263722, 0.086477734, 0.05379193, -0.0060..."
...,...,...,...,...
183003,"[’, small]",1,2,"[0.05057542, 0.12635005, 0.13668688, 0.0216707..."
192220,[oc],0,1,"[-0.08823522, -0.09544157, 0.100402035, -0.127..."
128314,"[make, stand, headset]",0,3,"[-0.104830086, 0.12267765, 0.026212111, 0.0507..."
146181,"[boob, mom, 2]",1,3,"[0.05210225, 0.0372718, 0.07656348, -0.0172622..."


In [43]:
from sklearn import svm
#from sklearn.model_selection import GridSearchCV
#tuned_parameters = {'kernel':['linear','rbf'],'gamma':[1e-3,1e-4], 'C':[1,10,100,1000]} 
#model = GridSearchCV(svm.SVC(),tuned_parameters)

#model = svm.SVC()
model = svm.LinearSVC()
model.fit(da.x_vec.to_list(), da.y.to_list())

In [44]:
# training set score - this to check overfitting 
model.score(da.x_vec.to_list(), da.y.to_list())

0.8299232816658838

## Testing

In [45]:
X_test = X_test.apply(lambda x:x.lower())
X_test = X_test.apply(tokenize)
X_test = X_test.apply(remove_stopwords)
X_test = X_test.apply(lemmatiser)
text_gensim = X_test.to_list()
text_gensim = [re.sub(r'\s+',' ',i) for i in text_gensim]
text_gensim = [nltk.word_tokenize(i) for i in text_gensim]
dt  = pd.DataFrame({'x':text_gensim,'y':y_test})
dt['len_x'] = dt.x.apply(lambda x:len(x))
dt = dt[dt.len_x!=0]

In [46]:
dt.head()

Unnamed: 0,x,y,len_x
2994,"[rebecca, fanart, screenshot, redraw]",1,4
49708,"[let, ’, suppose, ’, scrap, file, would, like,...",0,19
43153,[reddit],0,1
58876,"[good, thermos, flask, actually, keep, coffee,...",0,7
68891,"[shenhe, genshin, impact]",1,3


In [47]:
text_gensim_vec = [get_x_vectorized(i) for i in dt.x]
dt['x_vec'] = text_gensim_vec

In [54]:
dt.x_vec

2994      [-0.111671455, 0.13170925, -0.009242443, 0.001...
49708     [-0.056745067, 0.11752575, 0.0065619377, -0.00...
43153     [0.004731347, 0.056844335, -0.03490187, 0.1213...
58876     [-0.016406097, 0.14520334, 0.009666053, -0.000...
68891     [-0.16232045, 0.13924532, 0.020662121, -0.0631...
                                ...                        
54206     [-0.08346627, 0.0731326, 0.042276204, -0.03642...
169454    [-0.100824416, 0.11564681, -0.02224978, 0.0657...
38797     [0.043094948, 0.094253145, 0.11267813, -0.0807...
193813    [-0.011061694, 0.12211296, 0.017117428, 0.0448...
123583    [0.016448097, 0.113897845, 0.07535643, -0.0419...
Name: x_vec, Length: 39919, dtype: object

In [55]:
y_pred = model.predict(dt.x_vec.to_list())

# test set score
model.score(dt.x_vec.to_list(), dt.y.to_list())

0.8290287832861545