In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv(r"D:\email-dataset.csv",encoding="ISO-8859-1")

In [3]:
df=df[['v1','v2']]

In [4]:
df.rename(columns={'v1':'target','v2':'text'},inplace=True)
df.text=df.text.astype(str)
df.sample(5)

Unnamed: 0,target,text
5285,0,"Subject: re : copper curve tani , no problem..."
889,1,"Subject: perfect logo charset = koi 8 - r "" > ..."
2563,0,Subject: from the enron india newsdesk - may 5...
3014,0,"Subject: colleagues , i will be leaving enron..."
1138,1,Subject: failed mail your message to mxo 0 . ...


In [5]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

In [6]:
df.head()

Unnamed: 0,target,text
0,3,Subject: naturally irresistible your corporate...
1,3,Subject: the stock trading gunslinger fanny i...
2,3,Subject: unbelievable new homes made easy im ...
3,3,Subject: color printing special request addi...
4,3,"Subject: do not have money , get software cds ..."


In [7]:
df = df.drop_duplicates(keep='first')

In [8]:
df.duplicated().sum()

0

In [9]:
df.shape

(5698, 2)

In [10]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Anandhan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
import string
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
    
    text = y[:]
    y.clear()
    
    for i in text:
        if i not in nltk.corpus.stopwords.words('english') and i not in string.punctuation:
            y.append(i)
            
    text = y[:]
    y.clear()
    
    for word in text:
        y.append(ps.stem(word))
    
            
    return " ".join(y)

In [14]:
transform_text("I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.")

'gon na home soon want talk stuff anymor tonight k cri enough today'

In [13]:
df['transformed_text'] = df['text'].apply(transform_text)

In [15]:
df['transformed_text']

0       subject natur irresist corpor ident lt realli ...
1       subject stock trade gunsling fanni merril muzo...
2       subject unbeliev new home made easi im want sh...
3       subject color print special request addit info...
4       subject money get softwar cd softwar compat gr...
                              ...                        
5729    subject research develop charg gpg forward shi...
5730    subject receipt visit jim thank invit visit ls...
5731    subject enron case studi updat wow day super t...
5732    subject interest david pleas call shirley cren...
5733    subject news aurora 5 2 updat aurora version 5...
Name: transformed_text, Length: 5698, dtype: object

In [16]:
df

Unnamed: 0,target,text,transformed_text
0,3,Subject: naturally irresistible your corporate...,subject natur irresist corpor ident lt realli ...
1,3,Subject: the stock trading gunslinger fanny i...,subject stock trade gunsling fanni merril muzo...
2,3,Subject: unbelievable new homes made easy im ...,subject unbeliev new home made easi im want sh...
3,3,Subject: color printing special request addi...,subject color print special request addit info...
4,3,"Subject: do not have money , get software cds ...",subject money get softwar cd softwar compat gr...
...,...,...,...
5729,2,Subject: re : research and development charges...,subject research develop charg gpg forward shi...
5730,2,"Subject: re : receipts from visit jim , than...",subject receipt visit jim thank invit visit ls...
5731,2,Subject: re : enron case study update wow ! a...,subject enron case studi updat wow day super t...
5732,2,"Subject: re : interest david , please , call...",subject interest david pleas call shirley cren...


In [17]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=3000)

In [18]:
X = tfidf.fit_transform(df['transformed_text']).toarray()

In [19]:
X.shape

(5698, 3000)

In [20]:
y = df['target'].values

In [21]:
from sklearn.model_selection import train_test_split

In [23]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

In [25]:
svc = svm.SVC(kernel='sigmoid', gamma=1.0)

In [26]:
knc = KNeighborsClassifier()
lrc = LogisticRegression(solver='liblinear', penalty='l1')
dtc = DecisionTreeClassifier(max_depth=5)
mnb = MultinomialNB()

In [29]:
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
def train_classifier(clf,X_train,y_train,X_test,y_test):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred,pos_label=2,average='micro')
    
    return accuracy,precision

In [30]:
train_classifier(lrc,X_train,y_train,X_test,y_test)



(0.9710526315789474, 0.9710526315789474)

In [31]:
train_classifier(knc,X_train,y_train,X_test,y_test)



(0.974561403508772, 0.974561403508772)

In [29]:
train_classifier(dtc,X_train,y_train,X_test,y_test)

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [32]:
train_classifier(mnb,X_train,y_train,X_test,y_test)



(0.9859649122807017, 0.9859649122807017)

In [33]:
import pickle
pickle.dump(tfidf,open('vectorizer.pkl','wb'))
pickle.dump(mnb,open('model.pkl','wb'))