In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import string
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score,f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import tree

%matplotlib inline

# Load  data

In [31]:
df = pd.read_csv('../datasets/spam.csv', encoding='latin-1')
df = df.loc[:,['v1','v2']]
df.tail()

Unnamed: 0,v1,v2
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [32]:
d={'spam':1,'ham':0}
df.v1 = list(map(lambda x:d[x],df.v1))

In [None]:
import nltk
nltk.download("punkt")

# From Text To Features

In [6]:
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

class stemmed_tfidf():
    def __init__(self,max_features=5000):
        self.ps = PorterStemmer()
        self.vc = TfidfVectorizer(analyzer='word',#{‘word’, ‘char’}  Whether the feature should be made of word or character n-grams
                             stop_words = 'english',
                             max_features = max_features)
    def tfidf(self,ListStr):
        '''
        return: sklearn.feature_extraction.text.TfidfVectorizer
        '''
        table = self.vc.fit_transform([self.stem_string(s) for s in ListStr])
        return table
    def stem_string(self,s):
        '''
        s:str, e.g. s = "Get strings with string. With. Punctuation?"
        ps: stemmer from nltk module
        return: bag of words.e.g. 'get string with string with punctuat'
        '''    
        s = re.sub(r'[^\w\s]',' ',s)# remove punctuation.
        tokens = word_tokenize(s) # list of words.
        #a = [w for w in tokens if not w in stopwords.words('english')]# remove common no meaning words
        return ' '.join([self.ps.stem(w) for w in tokens])# e.g. 'desks'->'desk'

In [51]:
stf = stemmed_tfidf(max_features=10000)
feature = stf.tfidf(df.v2) # this will be a sparse matrix of size (n,5000)

In [52]:
Xtrain, Xtest, ytrain, ytest = train_test_split(feature, df.v1, test_size=0.2, random_state=1)
print(Xtest[4])
# print(ytest.to_list())

def write_to_file(x, y, instance_num=10, file_path="../datasets/test/test.data1"):
    assert(len(x) >= instance_num and len(x) == len(y))
    x_slice, y_slice = x[:instance_num], y[:instance_num]
    print(x_slice)
    print(f"x dimension: {len(x[0])}")
    print(y_slice)
    with open(file_path, "w") as f:
        # write header
        features = list(range(len(x[0])))
        feature_name = ",".join([str(c) for c in features])
        f.write(f"label, {feature_name}\n")

        # write data
        for i in range(instance_num):
            data_features = ",".join([str(c) for c in x_slice[i]])
            f.write(f"{y_slice[i]},{data_features}\n")
    f.close()

write_to_file(Xtest.toarray(), ytest.to_list())

  (0, 1796)	0.30434679674817583
  (0, 6694)	0.29053191490659164
  (0, 5747)	0.28073010025130146
  (0, 5841)	0.2531003365681329
  (0, 4457)	0.16689339663532266
  (0, 5975)	0.3163271775444133
  (0, 1241)	0.17740612183932283
  (0, 6810)	0.20143581358576368
  (0, 5468)	0.23143112388098888
  (0, 6730)	0.4423225846361562
  (0, 4214)	0.16659320358399735
  (0, 3240)	0.15141136571437888
  (0, 1891)	0.18321007649041401
  (0, 350)	0.18225024707750948
  (0, 6238)	0.2918822636605647
  (0, 3071)	0.16812133821189043
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
x dimension: 7110
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]


# Metrics

In [30]:
Acc = {}
F1score = {}
confusion_mat={}
predictions = {}

# Select training parameter

In [45]:
val_scores = []
for i in range(2,21):
    DT = DecisionTreeClassifier(min_samples_split=i, max_depth=5, random_state=1,class_weight='balanced')
    scores = cross_val_score(DT, Xtrain, ytrain,scoring='f1')
    val_scores.append([np.mean(scores),i])
print(val_scores.index(max(val_scores)))
val_scores = np.array(val_scores)
print('The best scores happens on:',val_scores[val_scores[:,0]==max(val_scores[:,0]),1:],
      ', where F1 =',val_scores[val_scores[:,0]==max(val_scores[:,0]),0])

18
The best scores happens on: [[ 2.]
 [ 3.]
 [ 4.]
 [ 5.]
 [ 6.]
 [ 7.]
 [ 8.]
 [ 9.]
 [10.]
 [11.]
 [12.]
 [13.]
 [14.]
 [15.]
 [16.]
 [17.]
 [18.]
 [19.]
 [20.]] , where F1 = [0.68825422 0.68825422 0.68825422 0.68825422 0.68825422 0.68825422
 0.68825422 0.68825422 0.68825422 0.68825422 0.68825422 0.68825422
 0.68825422 0.68825422 0.68825422 0.68825422 0.68825422 0.68825422
 0.68825422]


In [36]:
name = 'DT'
DT = DecisionTreeClassifier(min_samples_split=2, max_depth=5, random_state=1,class_weight='balanced')
DT.fit(Xtrain,ytrain)
pred = DT.predict(Xtest.toarray())
F1score[name]= f1_score(ytest,pred)
Acc[name] = accuracy_score(ytest,pred)
confusion_mat[name] = confusion_matrix(ytest,pred)
predictions[name]=pred
print(name+': Accuracy=%1.3f, F1=%1.3f'%(Acc[name],F1score[name]))

DT: Accuracy=0.935, F1=0.714


# Visualize the decision tree model

In [37]:
output_file = open("dt.dot", 'w')
dot_data = tree.export_graphviz(
    DT, class_names=['0', '1'], filled=True, rounded=True, out_file=output_file)