## `Reddit post title prediction - NSFW or SFW - Univariate`

In [2]:
import pandas as pd
from sqlalchemy import create_engine

In [3]:
engine = create_engine("postgresql://root:root@172.21.90.19:5432/general")

In [4]:
engine.connect()

<sqlalchemy.engine.base.Connection at 0x21cb1d4f850>

# getting data


In [5]:
not_nsfw = pd.read_sql(sql="select title,is_nsfw from reddit_data.hot_posts where random()<0.5 and not is_nsfw limit 20000;",con=engine)
nsfw = pd.read_sql(sql="select title,is_nsfw from reddit_data.hot_posts where random()<0.5 and  is_nsfw limit 20000;",con=engine)

In [6]:
df = pd.concat([nsfw,not_nsfw]).reset_index(drop=True)

In [7]:
df = df.sample(frac=1).reset_index(drop=True)

In [8]:
df

Unnamed: 0,title,is_nsfw
0,22 [M4M] fit sexy Spanish soccerplayer [dom] l...,True
1,👅😈🍆,True
2,Redhead,True
3,What would be your initial reaction?,False
4,Gordon Jacob: Concerto for Horn and Strings,False
...,...,...
39995,"Women, what are some questions you’ve always w...",True
39996,Can you Separate the Art from the Artist?,False
39997,"{Comic} Canon Crossing (Art by Me, Derilect) (...",True
39998,Suck my toes or suck my ass?,True


### `Label Encoding`

False - 0
True - 1

In [9]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['is_nsfw'])

In [10]:
df

Unnamed: 0,title,is_nsfw,label
0,22 [M4M] fit sexy Spanish soccerplayer [dom] l...,True,1
1,👅😈🍆,True,1
2,Redhead,True,1
3,What would be your initial reaction?,False,0
4,Gordon Jacob: Concerto for Horn and Strings,False,0
...,...,...,...
39995,"Women, what are some questions you’ve always w...",True,1
39996,Can you Separate the Art from the Artist?,False,0
39997,"{Comic} Canon Crossing (Art by Me, Derilect) (...",True,1
39998,Suck my toes or suck my ass?,True,1


In [11]:
df.label.value_counts()

label
1    20000
0    20000
Name: count, dtype: int64

In [12]:
df.title= df.title.apply(lambda x:x.lower())

### `Preprocessing the text`

Now we will do multi level preprocessing on the text using different algorithms:

* Tokenize
* remove punctuations and stopwords
* stemmming

In [13]:
import nltk
from nltk.corpus import stopwords
import string
from nltk.stem.porter import PorterStemmer

In [14]:
### tokenize the sentence and remove punctuations

def tokenize(sent):
    sent = nltk.word_tokenize(sent)
    sent_mod = []
    for i in sent:
        if i.isalnum():
            sent_mod.append(i)
    return sent_mod

df.title = df.title.apply(tokenize)


In [15]:
df.head()

Unnamed: 0,title,is_nsfw,label
0,"[22, m4m, fit, sexy, spanish, soccerplayer, do...",True,1
1,[],True,1
2,[redhead],True,1
3,"[what, would, be, your, initial, reaction]",False,0
4,"[gordon, jacob, concerto, for, horn, and, stri...",False,0


In [16]:
### Removing Stopwords,Punctuation 

stopword = stopwords.words('english')

def remove_stopwords(sent):
    sent = [i for i in sent if i not in stopword and i not in string.punctuation]
    return sent

df.title = df.title.apply(remove_stopwords)

In [17]:
df.head()

Unnamed: 0,title,is_nsfw,label
0,"[22, m4m, fit, sexy, spanish, soccerplayer, do...",True,1
1,[],True,1
2,[redhead],True,1
3,"[would, initial, reaction]",False,0
4,"[gordon, jacob, concerto, horn, strings]",False,0


In [18]:
### Applying stemming - using Porter Stemmer

ps = PorterStemmer()

def stemmer_ps(sent):
    sent_mod = [ps.stem(i) for i in sent]
    return ' '.join(sent_mod)

df.title = df.title.apply(stemmer_ps)

In [19]:
df.head()

Unnamed: 0,title,is_nsfw,label
0,22 m4m fit sexi spanish soccerplay dom look su...,True,1
1,,True,1
2,redhead,True,1
3,would initi reaction,False,0
4,gordon jacob concerto horn string,False,0


### `Feature and Label`

In [20]:
X = df.title
y = df.label

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
tfidf  = TfidfVectorizer()
cv = CountVectorizer()
feature_tfidf = tfidf.fit_transform(X)
feature_cv = cv.fit_transform(X)

In [22]:
feature_tfidf.shape

(40000, 20092)

In [23]:
feature_cv

<40000x20092 sparse matrix of type '<class 'numpy.int64'>'
	with 201348 stored elements in Compressed Sparse Row format>

## `Bag Of words`

In [24]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(feature_cv,y,test_size=0.2)

In [25]:
from sklearn import svm
#from sklearn.model_selection import GridSearchCV
#tuned_parameters = {'kernel':['linear','rbf'],'gamma':[1e-3,1e-4], 'C':[1,10,100,1000]} 
#model = GridSearchCV(svm.SVC(),tuned_parameters)

model = svm.SVC()
model.fit(X_train, y_train)

In [26]:
import numpy as np

In [27]:
# training set score - this to check overfitting 
model.score(X_train,np.array(y_train))

0.9594375

In [28]:
y_pred = model.predict(X_test)

# test set score
model.score(X_test,np.array(y_test))

0.871125

## `TF-IDF`

In [29]:
X_train,X_test,y_train,y_test = train_test_split(feature_tfidf,y,test_size=0.2)
model_tf = svm.SVC()
model_tf.fit(X_train, y_train)

In [30]:
# training set score - this to check overfitting 
model_tf.score(X_train,np.array(y_train))

0.97665625

In [31]:
y_pred = model_tf.predict(X_test)

# test set score
model_tf.score(X_test,np.array(y_test))

0.889625