Simple example of creating a classification of spam or ham 
sms with zero data cleaning

In [80]:
import pandas as pd

In [81]:
data = pd.read_csv('smsspamcollection/SMSSpamCollection',sep="\t", header=None, names=['class','text'])

In [82]:
data.head()

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [83]:
data.isna().sum()

class    0
text     0
dtype: int64

In [84]:
# use numeric labels becuase compouters know nums better than text

data.replace(['ham','spam'],[0, 1])

Unnamed: 0,class,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [85]:
X = pd.DataFrame(data['text'])
y = pd.DataFrame(data['class'])

In [86]:
X.head()

Unnamed: 0,text
0,"Go until jurong point, crazy.. Available only ..."
1,Ok lar... Joking wif u oni...
2,Free entry in 2 a wkly comp to win FA Cup fina...
3,U dun say so early hor... U c already then say...
4,"Nah I don't think he goes to usf, he lives aro..."


In [87]:
y.head()

Unnamed: 0,class
0,ham
1,ham
2,spam
3,ham
4,ham


In [88]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
import category_encoders as ce

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=23)

In [89]:
data['class'].head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: class, dtype: object

In [90]:
sets = [X_train, X_test, y_train, y_test]

In [91]:
for d in sets:
    print(d.shape)

(4457, 1)
(1115, 1)
(4457, 1)
(1115, 1)


In [92]:
dtc = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True),
    DecisionTreeClassifier(random_state=42),
)
dtc.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('onehotencoder',
                 OneHotEncoder(cols=['text'], drop_invariant=False,
                               handle_missing='value', handle_unknown='value',
                               return_df=True, use_cat_names=True, verbose=0)),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features=None, max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        presort='deprecated', random_state=42,
                                        splitter='best'))],
         verbose=False)

In [93]:
from sklearn.metrics import accuracy_score

In [94]:
print ("Accuracy : %0.5f \n\n" % accuracy_score(y_test, dtc.predict(X_test)))

Accuracy : 0.88789 




In [95]:
X_test.head()

Unnamed: 0,text
637,"Sweetheart, hope you are not having that kind ..."
5352,Good morning princess! Have a great day!
1764,Joy's father is John. Then John is the NAME of...
3639,Customer service announcement. We recently tri...
744,Ok. There may be a free gym about.


In [96]:
y_pred = dtc.predict(X_test)

In [97]:
sub = X_test[['text']]

In [98]:
sub['predict'] = y_pred
sub.head()

Unnamed: 0,text,predict
637,"Sweetheart, hope you are not having that kind ...",ham
5352,Good morning princess! Have a great day!,ham
1764,Joy's father is John. Then John is the NAME of...,ham
3639,Customer service announcement. We recently tri...,ham
744,Ok. There may be a free gym about.,ham
