In [1]:
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer



# Data Preprocessing

In [2]:
train=pd.read_csv('D:/Projects/Stride.Ai/enron_train.txt',delimiter="\t",header=None, names=["Intent", "Sentence"])
test=pd.read_csv('D:/Projects/Stride.Ai/enron_test.txt',delimiter="\t",header=None, names=["Intent", "Sentence"])

In [3]:
train.head()

Unnamed: 0,Intent,Sentence
0,No,>>> [1]Contact Me Now to Make $100 Today!$LINK
1,No,Act now to keep your life on the go!
2,No,Choose between $500 and $10000 dollars with up...
3,No,Click above to earn today.
4,No,Click here to receive your first $10 today:


In [4]:
test.shape

(992, 2)

In [5]:
train.shape

(3657, 2)

In [6]:
test.shape

(992, 2)

In [7]:
import matplotlib.pyplot as plt


In [8]:
def avg_no_of_words_in_sentences(df):
    #print(x.split(' '))
    
    no_of_words=[len(x.split(' ')) for x in df.Sentence ]
    
    return np.median(no_of_words)

In [9]:
avg_no_of_words_in_sentences(train)

14.0

In [10]:
y_train=train['Intent']
x_train=train.drop('Intent',axis=1)
y_test=test['Intent']
x_test=test.drop('Intent',axis=1)

In [11]:
type(x_train)

pandas.core.frame.DataFrame

In [12]:
x_train.head()

Unnamed: 0,Sentence
0,>>> [1]Contact Me Now to Make $100 Today!$LINK
1,Act now to keep your life on the go!
2,Choose between $500 and $10000 dollars with up...
3,Click above to earn today.
4,Click here to receive your first $10 today:


# Make Lower case:

In [13]:
s_list1=[]
for s in x_train['Sentence']:
    s=s.lower()
    s_list1.append(s)



In [14]:
x_train.drop('Sentence',axis=1)
x_train['Sentence']=s_list1

In [15]:
x_train.head()

Unnamed: 0,Sentence
0,>>> [1]contact me now to make $100 today!$link
1,act now to keep your life on the go!
2,choose between $500 and $10000 dollars with up...
3,click above to earn today.
4,click here to receive your first $10 today:


In [16]:
s_list2=[]
for s in x_test['Sentence']:
    s=s.lower()
    s_list2.append(s)
x_test.drop('Sentence',axis=1)
x_test['Sentence']=s_list2

In [17]:
x_test.head()

Unnamed: 0,Sentence
0,i look forward to meeting you and learning abo...
1,we look forward to seeing you next week!
2,a quick question before our meeting.
3,after sunning and drinking all day we feasted ...
4,"also tuesday, kyle and eric 1/2 hour."


# Cleaning the Data

* Stemming
* Stop words removal
* Lowering
* Tokenization
* Pruning (numbers and punctuation)

In [18]:
from nltk.corpus import stopwords
from string import punctuation
punc = set(punctuation)
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
from nltk.tokenize import wordpunct_tokenize

For the training data

In [19]:
tokenized1 = []
for s in x_train['Sentence']:
    tokens = nltk.word_tokenize(s)
    tokenized1.append(tokens)


For the test data

In [20]:
tokenized2 = []
for s in x_test['Sentence']:
    tokens = nltk.word_tokenize(s)
    tokenized2.append(tokens)

# Stop words

In [21]:
stopwords_json = {"en":["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","known","knows","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two","u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero"]}


In [22]:
stopwords_json_en = set(stopwords_json['en'])
stopwords_nltk_en = set(stopwords.words('english'))
stopwords_punct = set(punctuation)
spe = ["''","``"]
spe = set(spe)
# Combine the stopwords. Its a lot longer so I'm not printing it out...
stop_pun = set.union(stopwords_json_en, stopwords_nltk_en, stopwords_punct,spe)

# Removing Stop words

In [23]:
for sent1 in tokenized1:
    for word1 in sent1:
        if word1 in stop_pun:
            sent1.remove(word1)
        if type(word1)==type(5) or type(word1)==type(5.0): # if word in sentence is an 'integer or float then also remove it'
            #print(i)
            sent1.remove(word1)

In [24]:
for sent2 in tokenized2:
    for word2 in sent2:
        if word2 in stop_pun:
            sent2.remove(word2)
        if type(word2)==type(5) or type(word2)==type(5.0): # if word in sentence is an 'integer or float then also remove it'
            #print(i)
            sent2.remove(word)

In [25]:
x_train['Sentence']=tokenized1
x_train.head()

Unnamed: 0,Sentence
0,"[>, [, 1, contact, now, make, 100, today, $, l..."
1,"[act, to, your, life, the, !]"
2,"[choose, $, 500, $, 10000, dollars, up, 5, yea..."
3,"[click, to, earn, today]"
4,"[click, to, receive, first, 10, today]"


In [26]:
x_train.shape

(3657, 1)

In [27]:
y_train.shape

(3657,)

In [28]:
x_test['Sentence']=tokenized2
x_test.head()

Unnamed: 0,Sentence
0,"[look, forward, meeting, and, learning, your, ..."
1,"[look, forward, seeing, next, week]"
2,"[quick, question, our, meeting]"
3,"[sunning, drinking, day, feasted, a, fleisch, ..."
4,"[tuesday, kyle, eric, 1/2, hour]"


In [112]:
x_test.shape

(992, 1)

In [113]:
y_test.shape

(992,)

In [114]:
def list_to_str(x):
    y=str(x)
    y1=y.replace('[','')
    y2=y1.replace(']','')
    y3=y2.replace(",","")
    y4=y3.replace("'","")
    return y4

In [115]:
x_train['Sentence1']=x_train[['Sentence']].applymap(lambda x :list_to_str(x))
x_test['Sentence1']=x_test[['Sentence']].applymap(lambda x :list_to_str(x))

In [116]:
x_train1=x_train.drop('Sentence',axis=1)
x_test1=x_test.drop('Sentence',axis=1)


In [117]:
x_test1.head()

Unnamed: 0,Sentence1
0,look forward meeting and learning your success...
1,look forward seeing next week
2,quick question our meeting
3,sunning drinking day feasted a fleisch dinner ...
4,tuesday kyle eric 1/2 hour


In [118]:
train_data=x_train1['Sentence1']
test_data=x_test1['Sentence1']

In [119]:
count_vect = CountVectorizer()
x_train_final=count_vect.fit_transform(train_data)  
x_test_final=count_vect.transform(test_data)

In [120]:
tfidf_transformer=TfidfTransformer()

x_train_tfidf=tfidf_transformer.fit_transform(x_train_final)
x_test_tfidf=tfidf_transformer.fit_transform(x_test_final)

In [121]:
type(x_train_tfidf)

scipy.sparse.csr.csr_matrix

# Applying Multinomial Naïve Bayes learning method

In [122]:
prediction=dict()
from sklearn.naive_bayes import MultinomialNB

model=MultinomialNB().fit(x_train_tfidf, y_train)
prediction['Multinomial']=model.predict(x_test_tfidf)

In [123]:
len(prediction['Multinomial'])

992

In [124]:
from sklearn.metrics import accuracy_score

print('Accuracy = {}'.format(accuracy_score(prediction['Multinomial'], y_test) * 100))


Accuracy = 67.43951612903226


# Applying SVM

In [125]:
from sklearn.svm import SVC
model=SVC()
model.fit(x_train_tfidf,y_train)

y_pred=model.predict(x_test_tfidf)

accuracy=accuracy_score(y_pred,y_test)*100
accuracy

68.8508064516129

# Logistic Regression

In [126]:
from sklearn.linear_model import LogisticRegression
lrc=LogisticRegression()
lrc.fit(x_train_tfidf,y_train)

y_pred=lrc.predict(x_test_tfidf)

accuracy=accuracy_score(y_pred,y_test)*100
accuracy

76.61290322580645

# Random Forest

In [127]:
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier(n_estimators=500,min_samples_leaf=5)
model.fit(x_train_tfidf,y_train)

y_pred=model.predict(x_test_tfidf)

accuracy=accuracy_score(y_pred,y_test)*100
accuracy

74.79838709677419

# MLP Classifier

In [128]:
from sklearn.neural_network import MLPClassifier
model=MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
model.fit(x_train_tfidf,y_train)

y_pred=model.predict(x_test_tfidf)

accuracy=accuracy_score(y_pred,y_test)*100
accuracy

67.13709677419355