In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
# List the entries of the input directory to confirm the dataset file is present.
print(os.listdir("../input"))


In [2]:
# Load the raw spam dataset; the file is read with an explicit ISO-8859-1 encoding.
spam_csv_path = "../input/spam.csv"
df = pd.read_csv(spam_csv_path, encoding="ISO-8859-1")

In [3]:
# Column names, dtypes and non-null counts of the raw frame.
df.info()

In [4]:
# Summary statistics per column.
df.describe()

In [5]:
# Peek at the first rows of the raw data.
df.head()

In [6]:
# Keep only v1 (used later as the classification target) and v2 (the message text).
# .copy() makes df an independent frame, so the column assignments in the
# following cells cannot trigger SettingWithCopyWarning or write into a
# temporary slice of the original frame.
df = df[["v1", "v2"]].copy()

In [7]:
# Confirm that only the v1 and v2 columns remain.
df.head()

**Workflow**

* All small letters
* Delete punctuation
* Delete stopwords
* Translate words into numbers
    * List of all words
    * Translate texts into a countlist of words
* Train/Test split
* Train a classifier (ham vs. spam) on these features
* Predict on test-set

In [8]:
# All small letters, so that later word matching is case-insensitive.
# The vectorised .str accessor replaces the per-row lambda and passes missing
# values through as NaN instead of raising AttributeError on a non-string cell.
df["v2"] = df["v2"].str.lower()
df.head()

In [9]:
# delete punctuation
import string

def del_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed.

    Only the ASCII punctuation characters in ``string.punctuation`` are
    dropped; all other characters, including whitespace, are kept in order.
    """
    return "".join(char for char in text if char not in string.punctuation)

# Strip punctuation from every message, then preview the result.
df["v2"] = df["v2"].apply(del_punctuation)
df.head()

In [10]:
# Now delete stopwords
from nltk.corpus import stopwords

def del_stopwords(text, stopword_set=None):
    """Return ``text`` with stopwords removed and words re-joined by single spaces.

    Parameters
    ----------
    text : str
        Message to clean. It is split on whitespace.
    stopword_set : collection of str, optional
        Words to drop. When omitted, the NLTK English stopword list is used.

    Returns
    -------
    str
        The remaining words joined with one space each ("" if none remain).
    """
    if stopword_set is None:
        # stopwords.words() builds a fresh list on every call, so calling it
        # once per word (as before) dominated the runtime. Load it once, keep
        # it on the function object, and use a frozenset for O(1) lookups.
        stopword_set = getattr(del_stopwords, "_english_stopwords", None)
        if stopword_set is None:
            stopword_set = frozenset(stopwords.words("english"))
            del_stopwords._english_stopwords = stopword_set
    return " ".join(word for word in text.split() if word not in stopword_set)

# Remove stopwords from every message, then preview the result.
df["v2"] = df["v2"].apply(del_stopwords)
df.head()

Limitations of the cleaning so far:
* Single letters are not deleted
* Numbers are not deleted
* Colloquial language is not normalised (e.g. "u dun" = "you don't")

In [11]:
"""#Translate words into numbers
#    * List of all words
#    * Translate texts into a countlist of words
list_of_words=[]
for item in df["v2"]:
    for word in item.split():
        if word not in list_of_words:
            list_of_words.append(word)
print(len(list_of_words))
print(list_of_words[0:4])"""

# This is being done by a sklearn feature (see below)

9431 distinct words in total across all messages.
Now let's translate each message into a vector of dimension 1x9431 of word counts (0 if a certain word in the corpus is not in the message, 1 if it occurs once in the message, 2 if it occurs twice, etc.).

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary of the cleaned messages.
vectorizer = CountVectorizer()
vectorizer.fit(df["v2"])

In [13]:
# vocabulary_ maps each token to its column index in the count matrix.
# Show only a small sample: displaying the whole dict prints thousands of
# entries and buries the rest of the notebook.
dict(list(vectorizer.vocabulary_.items())[:10])

In [14]:
# Number of distinct tokens the vectorizer learned.
len(vectorizer.vocabulary_)

In [15]:
# The vocabulary size differs from the hand-counted number above because
# CountVectorizer's default tokenizer ignores one-character tokens.
# transform() expects an iterable of documents, so the message is wrapped in a
# list; passing message.split() would vectorise every word as its own document
# and return one row per word instead of one row for the message.
test_message = vectorizer.transform([df["v2"].iloc[0]])
print(test_message)

In [16]:
# Construct the document-term count matrix with the 9376 features:
# one row per message, one column per vocabulary token.
X=vectorizer.transform(df["v2"])
X

In [17]:
# Non-zero word counts of the first message.
print(X[0])

In [18]:
# (number of messages, number of vocabulary tokens)
X.shape

In [19]:
# How many non-zero items are in the 5572 x 9376 matrix (about 52 million entries)?
# Roughly 0.09% - indeed a very sparse matrix.
X.nnz

In [20]:
# Absolute word counts are re-weighted into relative terms: each count within a
# message is measured in relation to the word's frequency across the whole corpus.
# TF-IDF = term frequency - inverse document frequency.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X)
X_relative = tfidf_transformer.transform(X)
X_relative

In [21]:
# TF-IDF weights of the first message.
print(X_relative[0])

In [22]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the TF-IDF features, with v1 as the target.
spam_model = MultinomialNB()
spam_model.fit(X_relative, df["v1"])

In [23]:
# Predict on the same rows the model was fitted on - a smoke test, not an evaluation.
spam_model.predict(X_relative)

So far there is no train/test split or cross-validation, no comparison of different models, no feature engineering, etc.
The data was also not explored or analyzed.

But the full how-to for a basic text classification is accomplished.

In [27]:
# Now let's do a proper train/test split to see if the model predicts accurately.
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
# random_state makes the split (and therefore the reported scores) reproducible;
# stratify keeps the ham/spam ratio identical in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    df["v2"], df["v1"], test_size=0.3, random_state=42, stratify=df["v1"]
)

# A Pipeline stores a sequence of processing steps that can be applied to different datasets
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ("bow", CountVectorizer()),          # Creates sparse matrix counting the words
    ("tfidf", TfidfTransformer()),       # Relative weights
    ("classifier", MultinomialNB()),     # Fit the model
])
pipeline.fit(X_train, y_train)
# fit() returns the fitted pipeline object

In [28]:
# Run the held-out messages through the fitted pipeline.
predictions=pipeline.predict(X_test)

In [30]:
from sklearn.metrics import classification_report
# Compare the predicted labels with the true test labels.
print(classification_report(y_test,predictions))