In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
import os

In [2]:
# Root folder of the message corpus, resolved against the current working
# directory. load_data() treats every entry directly under it as one class
# label and every entry inside that as one message file.
DATA_PATH = r"data/"

In [3]:
def read_file(path):
    """Return the body of the text file at ``path``.

    The file is decoded as ISO-8859-1. Everything up to and including the
    first blank line (two consecutive newlines) is dropped; when the text
    has no blank line at all, the whole text is kept. Leading and trailing
    whitespace is stripped from the result.
    """
    with open(path, encoding='iso-8859-1') as stream:
        text = stream.read()

    # partition() reports whether the separator was found, which lets the
    # "no blank line" case fall back to the full text explicitly.
    _, separator, remainder = text.partition('\n\n')
    body = remainder if separator else text
    return body.strip()

In [4]:
def load_data(path):
    """Load a labelled message corpus from a directory tree.

    Every sub-directory directly under ``path`` is treated as one category,
    and every regular file inside it as one message, read with
    ``read_file``.

    Entries are visited in sorted name order so the row order of the result
    is the same on every run and platform (``os.listdir`` itself returns
    names in arbitrary order). Anything under ``path`` that is not a
    directory, and anything inside a category that is not a regular file
    (stray OS metadata files, nested folders), is skipped instead of
    raising.

    Parameters
    ----------
    path : str
        Root directory of the corpus.

    Returns
    -------
    pandas.DataFrame
        One row per message file with columns ``'Message'`` (text returned
        by ``read_file``) and ``'Class'`` (name of the category directory).
        Both columns are present even when no file is found.
    """
    dictionary = {
        'Message': [],
        'Class': []
    }

    for category in sorted(os.listdir(path)):
        category_path = os.path.join(path, category)
        if not os.path.isdir(category_path):
            # e.g. a README or .DS_Store next to the category folders
            continue

        for file in sorted(os.listdir(category_path)):
            file_path = os.path.join(category_path, file)
            if not os.path.isfile(file_path):
                # nested directories cannot be opened as a message
                continue

            dictionary['Message'].append(read_file(file_path))
            dictionary['Class'].append(category)

    return pd.DataFrame(dictionary)

In [5]:
# Build the corpus frame: one row per file found under DATA_PATH, with the
# text returned by read_file() in 'Message' and the name of the file's parent
# folder in 'Class'.
df = load_data(DATA_PATH)
df.head()  # preview the first rows

Unnamed: 0,Message,Class
0,"Date: Wed, 21 Aug 2002 10:54:46 -0500\n...",normal
1,"Martin A posted:\nTassos Papadopoulos, the Gre...",normal
2,Man Threatens Explosion In Moscow \n\nThursday...,normal
3,Klez: The Virus That Won't Die\n \nAlready the...,normal
4,"> in adding cream to spaghetti carbonara, whi...",normal


In [6]:
# Binary label for the classifier: 1 for rows whose class is 'spam',
# 0 for every other class.
is_spam = df['Class'].eq('spam')
df['Target'] = is_spam.astype(int)
df.tail()

Unnamed: 0,Message,Class,Target
2995,<html>\n<head>\n<title>Toy</title>\n</head>\n<...,spam,1
2996,<html>\n<head>\n<title>Untitled Document</titl...,spam,1
2997,This is an HTML email message. If you see thi...,spam,1
2998,"<html>\n<head>\n</head>\n <body background=""h...",spam,1
2999,"<STYLE type=""text/css"">\n<!--\nP{\n font-size...",spam,1


In [7]:
# Model inputs (raw message text) and the binary labels derived from 'Class'.
X, y = df['Message'], df['Target']

In [8]:
# Learn the vocabulary and build the document-term count matrix in one pass.
# The text is read from df['Message'] rather than from X so that this cell can
# be re-run safely: fitting on X and then rebinding X to the resulting sparse
# matrix would make a second run try to fit the vectorizer on a matrix
# instead of on text.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Message'])

In [9]:
# Fix the seed so the fitted tree is reproducible. Per the scikit-learn
# documentation, features are randomly permuted at each split, so when several
# splits improve the criterion equally the chosen one varies between runs
# unless random_state is set. 42 is an arbitrary but fixed seed.
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(X, y)

DecisionTreeClassifier()

In [10]:
# Hand-written probe messages used to spot-check the fitted classifier.
# The strings are model inputs and are kept exactly as written.
sentences = [
    'Hello dear friend, I think you would like this new product.',
    'Congratulation my classmate! I am very happy to see your new marks',
    """Have you ever tried to do things automatically? 
    watch this new unveil of the new offer for our company with its new participant right now 
    our goal is to make our dear customers comfortable & happy
    you can use bellow discount-codes (specially for you)"""
]

# Encode the probes with the already-fitted vectorizer (transform only, no
# refit) so they are encoded with the vocabulary learned from the corpus.
test_data = vectorizer.transform(sentences)

In [11]:
# Predicted 'Target' for each probe sentence, in the order they were listed:
# 1 means the 'spam' class, 0 means any other class.
classifier.predict(test_data)

array([1, 0, 1])