In [1]:
import pandas as pd
import nltk
import numpy as np


In [2]:
from nltk.corpus import stopwords

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split lives in sklearn.model_selection. The fallback keeps
# the notebook importable on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score



In [4]:
# Read the tab-separated training file. Column names are supplied explicitly,
# so the first line of the file is read as data, not as a header.
# NOTE(review): with pandas' default quoting a stray '"' in the raw text can
# merge several lines into one row -- confirm the row count against the file.
traindata = pd.read_csv(
    "training.txt",
    sep='\t',
    names=['liked', 'text'],
)

In [5]:
# Read the tab-separated test file into a single 'text' column; the name is
# supplied explicitly, so the first line of the file is read as data.
testdata = pd.read_csv(
    "testdata.txt",
    sep='\t',
    names=['text'],
)

In [6]:
# Preview the first ten training rows (columns: 'liked', 'text').
traindata.head(10)

Unnamed: 0,liked,text
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...
5,1,that's not even an exaggeration ) and at midni...
6,1,"I loved the Da Vinci Code, but now I want some..."
7,1,"i thought da vinci code was great, same with k..."
8,1,The Da Vinci Code is actually a good movie...
9,1,I thought the Da Vinci Code was a pretty good ...


In [7]:
testdata.head(10) # testdata has only a 'text' column; no 0/1 labels are included

Unnamed: 0,text
0,"I don't care what anyone says, I like Hillary..."
1,"harvard is dumb, i mean they really have to be..."
2,I'm loving Shanghai > > > ^ _ ^.
3,harvard is for dumb people.
4,"As i stepped out of my beautiful Toyota, i hea..."
5,"Bodies being dismembered, blown apart, and mut..."
6,I love Harvard Square in the fall.
7,London = amazing...
8,I HATE LONDON!..
9,I love MIT so much...


In [8]:
# NLTK's English stopword list, held as a set; passed to TfidfVectorizer below.
stopset=set(stopwords.words('english'))

In [9]:
# Display the full stopword set for inspection.
stopset

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [10]:
# TF-IDF features over lower-cased, ASCII-folded text, with the NLTK English
# stopwords in `stopset` excluded from the vocabulary.
vectorizer = TfidfVectorizer(
    use_idf=True,
    lowercase=True,
    strip_accents='ascii',
    stop_words=stopset,
)

In [11]:
# Show the vectorizer's full parameter set (including defaults).
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words={'his', 'a', "weren't", 'me', 'had', "mustn't", 'very', 'y', "shan't", 'by', 'll', 'those', 'hadn', 'ma', 'own', 'should', 'it', 'be', 'up', 'over', "she's", 'both', "needn't", 'to', 'but', 'being', 'only', 'haven', "shouldn't", 'your', 'the', 'are', 'needn', 'there', 'other', 'during', '...', "wasn't", 'between', 'didn', 'below', 'our', 'which', 'd', 'for', 'off', 'such', 'does', 'after'},
        strip_accents='ascii', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [12]:
# Target labels: the 'liked' column of the training frame.
y = traindata['liked']

In [13]:
# Learn the vocabulary and IDF weights from every training text and build the
# feature matrix in one step.
# NOTE(review): this fit happens before the train/test split further down, so
# the IDF statistics also see the rows that are later held out for scoring.
X = vectorizer.fit_transform(traindata['text'])

In [14]:
# (rows, features) of the TF-IDF matrix.
print(X.shape)

(6918, 2011)


In [15]:
# The label vector should have one entry per row of X.
print(y.shape)

(6918,)


In [16]:
# Hold out part of the data for evaluation. The split proportion is left at
# the library default; random_state pins the shuffle so reruns match.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [17]:
# Multinomial Naive Bayes classifier with default hyper-parameters.
clf=naive_bayes.MultinomialNB()

In [18]:
# Show the classifier's parameters.
clf

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [19]:
# Fit the classifier on the training portion of the split.
clf.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
# ROC AUC on the held-out split, scored with column 1 of predict_proba
# (the predicted probability of the second class).
roc_auc_score(
    y_test,
    clf.predict_proba(X_test)[:, 1],
)

0.9979292333245913

In [21]:
# Number of whitespace-separated tokens per text.
# split() with no argument splits on any run of whitespace, so repeated,
# leading or trailing spaces (and tabs) do not produce empty "words" the way
# split(" ") does. This also matches the tokenisation used by avg_word.
traindata['word_count'] = traindata['text'].apply(lambda text: len(str(text).split()))
traindata[['text','word_count']].head()

Unnamed: 0,text,word_count
0,The Da Vinci Code book is just awesome.,8
1,this was the first clive cussler i've ever rea...,23
2,i liked the Da Vinci Code a lot.,8
3,i liked the Da Vinci Code a lot.,8
4,I liked the Da Vinci Code but it ultimatly did...,15


In [22]:
# Number of characters in each text
traindata['char_count'] = traindata['text'].str.len() ## the length includes spaces
traindata[['text','char_count']].head()

Unnamed: 0,text,char_count
0,The Da Vinci Code book is just awesome.,39
1,this was the first clive cussler i've ever rea...,124
2,i liked the Da Vinci Code a lot.,32
3,i liked the Da Vinci Code a lot.,32
4,I liked the Da Vinci Code but it ultimatly did...,72


In [23]:
# Average word length: the summed length of all words divided by the number
# of words (not by the length of the text).
def avg_word(sentence):
    """Return the mean length, in characters, of the words in ``sentence``.

    Words are the whitespace-separated tokens produced by ``str.split()``, so
    punctuation attached to a word counts towards its length.

    Parameters
    ----------
    sentence : str
        Text to measure.

    Returns
    -------
    float
        Mean token length, or ``0.0`` when the text contains no tokens
        (empty or whitespace-only), which would otherwise divide by zero.
    """
    words = sentence.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

# Store the per-row mean word length; avg_word is passed directly because it
# already takes a single text argument.
traindata['avg_word'] = traindata['text'].apply(avg_word)
traindata[['text', 'avg_word']].head()

Unnamed: 0,text,avg_word
0,The Da Vinci Code book is just awesome.,4.0
1,this was the first clive cussler i've ever rea...,4.434783
2,i liked the Da Vinci Code a lot.,3.125
3,i liked the Da Vinci Code a lot.,3.125
4,I liked the Da Vinci Code but it ultimatly did...,3.866667


In [24]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
# `stop` is a list, so `token in stop` scans it for every token; a set gives
# the same answers with constant-time lookups.
stop_lookup = set(stop)

# Count the tokens that exactly match an entry of the stopword list. The match
# is case-sensitive, so a capitalised token such as "The" is not counted.
traindata['stopwords'] = traindata['text'].apply(
    lambda text: sum(1 for token in text.split() if token in stop_lookup)
)
traindata[['text','stopwords']].head()

Unnamed: 0,text,stopwords
0,The Da Vinci Code book is just awesome.,2
1,this was the first clive cussler i've ever rea...,8
2,i liked the Da Vinci Code a lot.,3
3,i liked the Da Vinci Code a lot.,3
4,I liked the Da Vinci Code but it ultimatly did...,6


In [25]:
# Count the tokens in each text that begin with '#'. Nothing is removed here.
traindata['hastags'] = traindata['text'].apply(
    lambda text: sum(1 for token in text.split() if token.startswith('#'))
)
traindata[['text', 'hastags']].head(6)

Unnamed: 0,text,hastags
0,The Da Vinci Code book is just awesome.,0
1,this was the first clive cussler i've ever rea...,0
2,i liked the Da Vinci Code a lot.,0
3,i liked the Da Vinci Code a lot.,0
4,I liked the Da Vinci Code but it ultimatly did...,0
5,that's not even an exaggeration ) and at midni...,0


In [26]:
# Count the tokens in each text that consist solely of digits.
traindata['numerics'] = traindata['text'].apply(
    lambda text: sum(1 for token in text.split() if token.isdigit())
)
traindata[['text', 'numerics']].head()

Unnamed: 0,text,numerics
0,The Da Vinci Code book is just awesome.,0
1,this was the first clive cussler i've ever rea...,0
2,i liked the Da Vinci Code a lot.,0
3,i liked the Da Vinci Code a lot.,0
4,I liked the Da Vinci Code but it ultimatly did...,0


In [27]:
# Count the tokens for which str.isupper() is true (this includes a lone "I").
traindata['upper'] = traindata['text'].apply(
    lambda text: sum(1 for token in text.split() if token.isupper())
)
traindata[['text', 'upper']].head()

Unnamed: 0,text,upper
0,The Da Vinci Code book is just awesome.,0
1,this was the first clive cussler i've ever rea...,0
2,i liked the Da Vinci Code a lot.,0
3,i liked the Da Vinci Code a lot.,0
4,I liked the Da Vinci Code but it ultimatly did...,1


In [28]:
# First pre-processing step: lower-case every token so that, for example,
# 'Analytics' and 'analytics' are counted as the same word.
# Tokens are re-joined with single spaces, so runs of whitespace collapse.
# Note: this overwrites traindata['text'] in place.
traindata['text'] = traindata['text'].apply(
    lambda text: " ".join(map(str.lower, text.split()))
)
traindata['text'].head()

0              the da vinci code book is just awesome.
1    this was the first clive cussler i've ever rea...
2                     i liked the da vinci code a lot.
3                     i liked the da vinci code a lot.
4    i liked the da vinci code but it ultimatly did...
Name: text, dtype: object

In [None]:
# removing the stopwords

In [29]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
# `stop` is a list, so `token not in stop` scans it for every token; a set
# gives the same answers with constant-time lookups.
stop_lookup = set(stop)

# Drop every token that exactly matches a stopword and re-join the remainder
# with single spaces. Note: this overwrites traindata['text'] in place.
traindata['text'] = traindata['text'].apply(
    lambda text: " ".join(token for token in text.split() if token not in stop_lookup)
)
traindata['text'].head()

0                          da vinci code book awesome.
1    first clive cussler i've ever read, even books...
2                             liked da vinci code lot.
3                             liked da vinci code lot.
4         liked da vinci code ultimatly seem hold own.
Name: text, dtype: object

In [30]:
# The ten most frequent whitespace-separated tokens across the whole corpus.
all_tokens = ' '.join(traindata['text']).split()
freq = pd.Series(all_tokens).value_counts().head(10)
freq

harry         2088
vinci         2001
da            1998
brokeback     1996
code          1628
love          1540
mountain      1208
mission       1089
potter         968
impossible     906
dtype: int64

In [31]:
# The last ten entries of the frequency table, i.e. the rarest tokens.
all_tokens = ' '.join(traindata['text']).split()
freq = pd.Series(all_tokens).value_counts().tail(10)
freq

backward      1
phone         1
dvds          1
afraid        1
really,       1
queens..      1
south         1
ass!.-sean    1
nerd          1
more!         1
dtype: int64

In [32]:
!pip install textblob
from textblob import TextBlob
traindata['text'][:5].apply(lambda x: str(TextBlob(x).correct()))



You are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


0                          da since code book awesome.
1    first live cutler i've ever read, even books l...
2                             liked da since code lot.
3                             liked da since code lot.
4        liked da since code ultimately seem hold own.
Name: text, dtype: object

In [33]:
# Tokenise the text stored under index label 1 with TextBlob and show its words.
TextBlob(traindata.loc[1, 'text']).words

WordList(['first', 'clive', 'cussler', 'i', "'ve", 'ever', 'read', 'even', 'books', 'like', 'relic', 'da', 'vinci', 'code', 'plausible', 'this'])

In [34]:
from nltk.stem import PorterStemmer
st = PorterStemmer()
# Preview Porter stemming on the first five rows; the result is displayed,
# not written back to traindata.
traindata['text'].head(5).apply(
    lambda text: " ".join(st.stem(token) for token in text.split())
)

0                          da vinci code book awesome.
1    first clive cussler i'v ever read, even book l...
2                              like da vinci code lot.
3                              like da vinci code lot.
4          like da vinci code ultimatli seem hold own.
Name: text, dtype: object

In [35]:
from textblob import Word
# Replace each token with its TextBlob lemma and re-join with single spaces.
# Note: this overwrites traindata['text'] in place.
traindata['text'] = traindata['text'].apply(
    lambda text: " ".join(Word(token).lemmatize() for token in text.split())
)
traindata['text'].head()

0                          da vinci code book awesome.
1    first clive cussler i've ever read, even book ...
2                             liked da vinci code lot.
3                             liked da vinci code lot.
4         liked da vinci code ultimatly seem hold own.
Name: text, dtype: object