In [52]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from nltk import word_tokenize
from nltk.corpus import stopwords
import nltk
# nltk.download()

In [3]:
fml = pd.read_csv('data/fml.txt',sep="\n",header=None)
fml.columns=['text']
fml.head()

Unnamed: 0,text
0,I downloaded an application for my phone that ...
1,"My girlfriend said ""It's funny how every time ..."
2,My girlfriend told me she was bored. During sex.
3,"I have a condition that, when I pull my foresk..."
4,My boyfriend decided to give me a hickey. Too ...


In [4]:
tfln = pd.read_csv('data/tfln.onesent.txt',sep="\n",header=None,encoding='latin')
tfln.columns=['text']
tfln.head()

Unnamed: 0,text
0,she pulled the sheets over her head to blow me...
1,He just ordered a bottle of Beam at an Italian...
2,Well if my looks don't work with her I'll eat ...
3,"If you don't remember anything tomorrow, this ..."
4,We decided to cut you off after you insisted o...


In [5]:
usa = pd.read_csv('data/usaquotes.txt',sep="\n",header=None,error_bad_lines=False)
usa.columns = ['text']
usa.head()

b'Skipping line 580: expected 1 fields, saw 2\nSkipping line 627: expected 1 fields, saw 2\nSkipping line 639: expected 1 fields, saw 2\nSkipping line 646: expected 1 fields, saw 2\nSkipping line 650: expected 1 fields, saw 2\nSkipping line 651: expected 1 fields, saw 3\nSkipping line 733: expected 1 fields, saw 6\nSkipping line 790: expected 1 fields, saw 2\nSkipping line 837: expected 1 fields, saw 4\nSkipping line 851: expected 1 fields, saw 2\nSkipping line 920: expected 1 fields, saw 3\nSkipping line 977: expected 1 fields, saw 2\nSkipping line 989: expected 1 fields, saw 2\nSkipping line 1245: expected 1 fields, saw 2\nSkipping line 1626: expected 1 fields, saw 3\nSkipping line 1796: expected 1 fields, saw 2\nSkipping line 1831: expected 1 fields, saw 3\nSkipping line 2021: expected 1 fields, saw 2\nSkipping line 2125: expected 1 fields, saw 2\nSkipping line 2206: expected 1 fields, saw 2\nSkipping line 2211: expected 1 fields, saw 2\nSkipping line 2624: expected 1 fields, saw 2\

Unnamed: 0,text
0,"We have too many high sounding words, and too ..."
1,The reins of government have been so long slac...
2,Shall we be despised by foreign powers for hes...
3,Deliver me from your cold phlegmatic preachers...
4,I regret the narrow contracted education of th...


In [6]:
twss = pd.read_csv('data/twssstories.txt',sep="\n",header=None,encoding='latin')
twss.columns=['text']
twss.head()

Unnamed: 0,text
0,"Put it back in, it's still moist in the middle."
1,"Now if I can just get it in the hole, I'll sho..."
2,...a great penetration!
3,"That just kind of squirted out there, didn't it!"
4,"Oh it feels so good, you just insert here and ..."


In [7]:
print("fml : ",len(fml))
print("tfln : ",len(tfln))
print("usa : ",len(usa))
print("twss : ",len(twss))

fml :  1767
tfln :  11869
usa :  4066
twss :  2027


In [8]:
twss['target'] = [1]*2027
twss.head()

Unnamed: 0,text,target
0,"Put it back in, it's still moist in the middle.",1
1,"Now if I can just get it in the hole, I'll sho...",1
2,...a great penetration!,1
3,"That just kind of squirted out there, didn't it!",1
4,"Oh it feels so good, you just insert here and ...",1


In [13]:
usa = usa.loc[:666,:]
tfln = tfln.loc[:666,:]
fml = fml.loc[:666,:]

In [18]:
usa['target'] = [0]*667

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [19]:
tfln['target'] = [0]*667
fml['target'] = [0]*667

In [28]:
df = pd.concat([twss,usa,tfln,fml],ignore_index=True)

In [32]:
df

Unnamed: 0,text,target
0,"Put it back in, it's still moist in the middle.",1
1,"Now if I can just get it in the hole, I'll sho...",1
2,...a great penetration!,1
3,"That just kind of squirted out there, didn't it!",1
4,"Oh it feels so good, you just insert here and ...",1
...,...,...
4023,While my wife was watching me get undressed sh...,0
4024,My boyfriend came to my workplace and presente...,0
4025,"I was laying on the bed, naked, waiting for my...",0
4026,I hung out with the guy I've liked for the fir...,0


In [33]:
df = df.sample(frac=1).reset_index(drop=True)

In [34]:
df

Unnamed: 0,text,target
0,Deliver me from your cold phlegmatic preachers...,0
1,"Oh I don't want to do this one, it's too long.",1
2,There you go. Push it up into her.,1
3,"After all, it is not where one washes one’s ne...",0
4,her roommates boyfriend drunkenly walked in on...,0
...,...,...
4023,"Good Americans, when they die, go to Paris.",0
4024,That hit would have scored me so many points!,1
4025,I woke up in bed with no pillows. I think the ...,0
4026,You're gonna get in it and you're gonna like it.,1


In [39]:
y = df['target']
df.drop(['target'],inplace=True,axis=1)

In [42]:

xtrain,xvalid,ytrain,yvalid= train_test_split(df.text.values,y,stratify=y,random_state=42,test_size=0.1,shuffle=True)

In [45]:
print(xtrain.shape,xvalid.shape)

(3625,) (403,)


In [46]:
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)


In [49]:
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict(xvalid_tfv)

print(metrics.accuracy_score(predictions,yvalid))

0.8957816377171216


In [50]:
clf = MultinomialNB()
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict(xvalid_tfv)

print(metrics.accuracy_score(predictions,yvalid))

0.9156327543424317


In [53]:
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict(xvalid_svd_scl)

print(metrics.accuracy_score(predictions,yvalid))

0.8833746898263027


In [54]:
import xgboost

xgb = xgboost.XGBClassifier(n_thread=10)
xgb.fit(xtrain_tfv,ytrain)

predictions = xgb.predict(xvalid_tfv)

print(metrics.accuracy_score(predictions,yvalid))

ModuleNotFoundError: No module named 'xgboost'