In [1]:
import pandas as pd
%matplotlib inline

In [2]:
# Load the tab-separated SMS corpus. The file has no header row, so the columns
# are addressed positionally (0 = label, 1 = message text).
# NOTE(review): with pandas' default quote handling, a message containing an
# unbalanced '"' may get merged with the following line -- confirm the row
# count against the dataset's documentation (quoting=csv.QUOTE_NONE may be needed).
df = pd.read_csv(
    './Datasets/SMSSpamCollection',
    sep='\t',
    header=None,
)
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Class balance: column 0 holds the label, so summing each boolean mask counts
# the rows that carry that label.
print('Number of spam messages:', df[0].eq('spam').sum())
print('Number of ham messages:', df[0].eq('ham').sum())

Number of spam messages: 747
Number of ham messages: 4825


In [4]:
# Third-party imports for the spam classifier.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Import from the public packages: the private `sklearn.linear_model.logistic`
# module and the old `sklearn.cross_validation` package have been removed from
# scikit-learn; the paths below expose the same names.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

In [5]:
from sklearn import preprocessing

# Hold out part of the corpus for evaluation. A fixed random_state makes the
# split (and every score computed from it) reproducible across kernel restarts.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df[1], df[0], random_state=42
)

# Encode the string labels as a 0/1 vector. The binarizer is fitted on the
# training labels only and then reused on the test labels, so both splits are
# guaranteed to share the same label -> integer mapping.
lb = preprocessing.LabelBinarizer()
y_train = lb.fit_transform(y_train)[:, 0]
y_test = lb.transform(y_test)[:, 0]

In [6]:
# Turn the raw messages into TF-IDF feature matrices. The tuple is evaluated
# left to right, so the vectorizer is fitted on the training messages before
# it is applied to the held-out ones.
vectorizer = TfidfVectorizer()
X_train, X_test = (
    vectorizer.fit_transform(X_train_raw),
    vectorizer.transform(X_test_raw),
)



In [7]:
# Train a logistic-regression model with the library's default settings on the
# TF-IDF features; fit() returns the estimator itself.
classifier = LogisticRegression().fit(X_train, y_train)
classifier  # last expression, so the cell still displays the fitted estimator

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
# `classifier` was already fitted on (X_train, y_train) in the previous cell;
# building and fitting a second, identical model here only repeated that work.
# Predict the 0/1 labels of the held-out messages.
y_pred = classifier.predict(X_test)

In [9]:
# 5-fold cross-validation on the training split using the estimator's default
# scorer; report the mean over the folds.
scores = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=5)
print(scores.mean())

0.952621837664


In [10]:
from sklearn.metrics import accuracy_score

# Fraction of held-out messages whose predicted label equals the true label.
print('Accuracy:', accuracy_score(y_true=y_test, y_pred=y_pred))




Accuracy: 0.968413496052


# Binary classification performance metrics

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [None]:
# Toy ground truth and predictions used to illustrate the confusion matrix:
# five negatives followed by five positives.
y_Temp_test = [0] * 5 + [1] * 5
y_Temp_pred = [0, 1, 0, 0, 0] + [0, 0, 1, 1, 1]

In [None]:
confusion_matrix = confusion_matrix(y_Temp_test, y_Temp_pred)
print(confusion_matrix)

In [None]:
plt.matshow(confusion_matrix)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

## Precision and recall

In [None]:
# Mean precision over 5 cross-validation folds of the training split.
precisions = cross_val_score(
    estimator=classifier, X=X_train, y=y_train, scoring='precision', cv=5
)
print('Precision', precisions.mean())

In [None]:
# Mean recall over 5 cross-validation folds of the training split.
recalls = cross_val_score(
    estimator=classifier, X=X_train, y=y_train, scoring='recall', cv=5
)
print('Recalls', recalls.mean())

## Calculating the F1 measure

In [None]:
# Mean F1 score over 5 cross-validation folds of the training split.
f1s = cross_val_score(
    estimator=classifier, X=X_train, y=y_train, scoring='f1', cv=5
)
print('F1=', f1s.mean())

## ROC AUC

In [None]:
from sklearn.metrics import auc
from sklearn.metrics import roc_curve

# Per-class probability estimates for every held-out message (one column per class).
predictions = classifier.predict_proba(X_test)

In [None]:
# ROC points computed from the scores in column 1 of `predictions`.
false_positive_rate, recall, thresholds = roc_curve(
    y_true=y_test,
    y_score=predictions[:, 1],
)

In [None]:
# Area under the ROC curve; shown in the legend entry of the curve.
roc_auc = auc(false_positive_rate, recall)

# Draw on the current axes (created on demand, as the pyplot calls would do).
ax = plt.gca()
ax.plot(false_positive_rate, recall, 'b', label='AUC = %0.2f' % roc_auc)
ax.plot([0, 1], [0, 1], 'r--')  # unlabeled diagonal, so it stays out of the legend
ax.legend(loc='lower right')
ax.set(
    title='Receiver Operating Characteristic',
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.0],
    xlabel='Fall-out',
    ylabel='Recall',
)
plt.show()