## `Reddit post title prediction - NSFW or SFW - Univariate`

In [39]:
import pandas as pd
from sqlalchemy import create_engine

In [3]:
engine = create_engine("postgresql://root:root@172.17.137.99:5432/general")

In [4]:
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7f26d749f520>

# getting data


In [5]:
not_nsfw = pd.read_sql(sql="select title,is_nsfw from reddit_data.hot_posts where random()<0.5 and not is_nsfw limit 20000;",con=engine)
nsfw = pd.read_sql(sql="select title,is_nsfw from reddit_data.hot_posts where random()<0.5 and  is_nsfw limit 20000;",con=engine)

In [6]:
df = pd.concat([nsfw,not_nsfw]).reset_index(drop=True)

In [7]:
df = df.sample(frac=1).reset_index(drop=True)

In [8]:
df

Unnamed: 0,title,is_nsfw
0,Ya u can taste my pussy juice [grl],True
1,Rear USB jack for PSVR2 via adapter,False
2,be the batman to my robin? 👉👈,True
3,Front or back?,True
4,I want to manifest away back pain,False
...,...,...
39995,Teen petite and soft tits..,True
39996,Hr14 good armour suggestions,False
39997,I don't really like Bras,True
39998,"Possible issues with the ""Living Remain"" game....",False


### `Label Encoding`

False - 0
True - 1

In [9]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['is_nsfw'])

In [10]:
df

Unnamed: 0,title,is_nsfw,label
0,Ya u can taste my pussy juice [grl],True,1
1,Rear USB jack for PSVR2 via adapter,False,0
2,be the batman to my robin? 👉👈,True,1
3,Front or back?,True,1
4,I want to manifest away back pain,False,0
...,...,...,...
39995,Teen petite and soft tits..,True,1
39996,Hr14 good armour suggestions,False,0
39997,I don't really like Bras,True,1
39998,"Possible issues with the ""Living Remain"" game....",False,0


In [11]:
df.label.value_counts()

1    20000
0    20000
Name: label, dtype: int64

In [12]:
df.title= df.title.apply(lambda x:x.lower())

### `Preprocessing the text`

Now we will do multi level preprocessing on the text using different algorithms:

* Tokenize
* remove punctuations and stopwords
* stemmming

In [13]:
import nltk
from nltk.corpus import stopwords
import string
from nltk.stem.porter import PorterStemmer

In [14]:
### tokenize the sentence and remove punctuations

def tokenize(sent):
    sent = nltk.word_tokenize(sent)
    sent_mod = []
    for i in sent:
        if i.isalnum():
            sent_mod.append(i)
    return sent_mod

df.title = df.title.apply(tokenize)


In [15]:
df.head()

Unnamed: 0,title,is_nsfw,label
0,"[ya, u, can, taste, my, pussy, juice, grl]",True,1
1,"[rear, usb, jack, for, psvr2, via, adapter]",False,0
2,"[be, the, batman, to, my, robin]",True,1
3,"[front, or, back]",True,1
4,"[i, want, to, manifest, away, back, pain]",False,0


In [16]:
### Removing Stopwords,Punctuation 

stopword = stopwords.words('english')

def remove_stopwords(sent):
    sent = [i for i in sent if i not in stopword and i not in string.punctuation]
    return sent

df.title = df.title.apply(remove_stopwords)

In [17]:
df.head()

Unnamed: 0,title,is_nsfw,label
0,"[ya, u, taste, pussy, juice, grl]",True,1
1,"[rear, usb, jack, psvr2, via, adapter]",False,0
2,"[batman, robin]",True,1
3,"[front, back]",True,1
4,"[want, manifest, away, back, pain]",False,0


In [18]:
### Applying stemming - using Porter Stemmer

ps = PorterStemmer()

def stemmer_ps(sent):
    sent_mod = [ps.stem(i) for i in sent]
    return ' '.join(sent_mod)

df.title = df.title.apply(stemmer_ps)

In [19]:
df.head()

Unnamed: 0,title,is_nsfw,label
0,ya u tast pussi juic grl,True,1
1,rear usb jack psvr2 via adapt,False,0
2,batman robin,True,1
3,front back,True,1
4,want manifest away back pain,False,0


### `Feature and Label`

In [20]:
X = df.title
y = df.label

In [21]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
tfidf  = TfidfVectorizer()
cv = CountVectorizer()
feature_tfidf = tfidf.fit_transform(X_train)
feature_cv = cv.fit_transform(X_train)

In [23]:
feature_tfidf.shape

(32000, 16612)

In [24]:
feature_cv

<32000x16612 sparse matrix of type '<class 'numpy.int64'>'
	with 172189 stored elements in Compressed Sparse Row format>

In [25]:
feature_test_tfidf = tfidf.transform(X_test)
feature_test_cv = cv.transform(X_test)

In [26]:
feature_test_tfidf

<8000x16612 sparse matrix of type '<class 'numpy.float64'>'
	with 41231 stored elements in Compressed Sparse Row format>

In [27]:
feature_test_cv

<8000x16612 sparse matrix of type '<class 'numpy.int64'>'
	with 41231 stored elements in Compressed Sparse Row format>

## `Bag Of words`

In [29]:
from sklearn import svm
#from sklearn.model_selection import GridSearchCV
#tuned_parameters = {'kernel':['linear','rbf'],'gamma':[1e-3,1e-4], 'C':[1,10,100,1000]} 
#model = GridSearchCV(svm.SVC(),tuned_parameters)

model = svm.LinearSVC()
model.fit(feature_cv, y_train)

In [30]:
import numpy as np

In [31]:
# training set score - this to check overfitting 
model.score(feature_cv,np.array(y_train))

0.9615625

In [33]:
y_pred = model.predict(feature_test_cv)

# test set score
model.score(feature_test_cv,np.array(y_test))

0.881375

## `TF-IDF`

In [34]:
model_tf = svm.LinearSVC()
model_tf.fit(feature_tfidf, y_train)

In [36]:
# training set score - this to check overfitting 
model_tf.score(feature_tfidf,np.array(y_train))

0.95946875

In [38]:
y_pred = model_tf.predict(feature_test_tfidf)

# test set score
model_tf.score(feature_test_tfidf,np.array(y_test))

0.88575