## Load data

In [1]:
# Path of the input data file. A raw string keeps the Windows backslashes
# literal instead of relying on "\D" and "\s" being unrecognised (and
# therefore deprecated) escape sequences.
filename_multi = r"C:\Data\some_data.txt"

In [None]:
import pandas as pd

# Read the tab-separated file into a DataFrame; double quotes delimit
# quoted fields.
data_multi = pd.read_csv(
    filename_multi,
    sep='\t',
    quotechar='"',
)

In [None]:
# Peek at the first four entries of the Label column.
data_multi['Label'][:4]

In [None]:
# Peek at the first four entries of the Sentence column.
data_multi['Sentence'][:4]

## Vectorise text data into numerical vectors

### Using `TfidfVectorizer`

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF features over word tokens, with English stop words removed
# and terms present in more than 90% of the sentences ignored.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 1),
    min_df=1,
    max_df=0.9,
)
tfidf_vectors = tfidf_vectorizer.fit_transform(data_multi['Sentence'])

In [None]:
# Display the learned vocabulary: a mapping from each term to its column index.
tfidf_vectorizer.vocabulary_

In [None]:
# Display the matrix dimensions (rows are sentences, columns are vocabulary terms).
tfidf_vectors.shape

In [None]:
# Display the flat array of values stored in the sparse TF-IDF matrix.
tfidf_vectors.data

In [None]:
# Sparsity summary of the TF-IDF matrix.
_n_rows, _n_cols = tfidf_vectors.shape
print(tfidf_vectors.nnz)  # stored (non-zero) entries
print(tfidf_vectors.nnz / float(_n_rows))  # average stored entries per sample
print(tfidf_vectors.nnz / float(_n_rows * _n_cols) * 100)  # share of the whole matrix, in %

### Using `CountVectorizer`

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigram raw term counts over word tokens, with English stop words removed
# and terms present in more than 90% of the sentences ignored.
tf_vectorizer = CountVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 1),
    min_df=1,
    max_df=0.9,
)
tf_vectors = tf_vectorizer.fit_transform(data_multi['Sentence'])

In [None]:
# Display the learned vocabulary: a mapping from each term to its column index.
tf_vectorizer.vocabulary_

In [None]:
# Display the matrix dimensions (rows are sentences, columns are vocabulary terms).
tf_vectors.shape

In [None]:
# Display the flat array of counts stored in the sparse term-frequency matrix.
tf_vectors.data

In [None]:
# Sparsity summary of the term-frequency matrix.
_n_rows, _n_cols = tf_vectors.shape
print(tf_vectors.nnz)  # stored (non-zero) entries
print(tf_vectors.nnz / float(_n_rows))  # average stored entries per sample
print(tf_vectors.nnz / float(_n_rows * _n_cols) * 100)  # share of the whole matrix, in %

## Data Preprocessing: 
### Attributes and Target

`X` is data, `y` is target. `X` has the size of `n_samples` x `n_features`, `y` has `n_samples` x 1 labels

In [None]:
# Alias the two feature matrices under the conventional X_* names.
X_tfidf, X_tf = tfidf_vectors, tf_vectors

### Convert categorical labels into numerical labels

In [None]:
from sklearn import preprocessing

In [None]:
# Learn the mapping from label strings to integer codes.
le = preprocessing.LabelEncoder().fit(data_multi['Label'])
list(le.classes_)  # distinct labels, ordered by their integer code

In [None]:
# Encode every label, then map the first five codes back as a sanity check.
y = le.transform(data_multi['Label'])
print(y[:5])
list(le.inverse_transform(y[:5]))

## Data Visualization: preliminary feature space inspection

### Using PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Project the term-frequency features onto their first two principal
# components; the sparse matrix is converted to a dense array before fitting.
pca = PCA(n_components=2)
X_tf_proj = pca.fit_transform(X_tf.toarray())
print(X_tf.shape)  # shape before the projection
print(X_tf_proj.shape)  # shape after the projection

In [None]:
# Inspect the PCA most recently fitted into `pca`:
# the variance captured by each kept component ...
print(pca.explained_variance_)
# ... and the component directions expressed in the original feature space.
print(pca.components_)

In [None]:
# Project the TF-IDF features onto their first two principal components;
# the sparse matrix is converted to a dense array before fitting.
# Note that this rebinds `pca` to a new estimator.
pca = PCA(n_components=2)
X_tfidf_proj = pca.fit_transform(X_tfidf.toarray())
print(X_tfidf.shape)  # shape before the projection
print(X_tfidf_proj.shape)  # shape after the projection

In [None]:
# Inspect the PCA most recently fitted into `pca`:
# the variance captured by each kept component ...
print(pca.explained_variance_)
# ... and the component directions expressed in the original feature space.
print(pca.components_)

### Plot features space of reduced dimension

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.scatter(X_tf_proj[:, 0], X_tf_proj[:, 1], c=y, edgecolor='none', alpha=0.5, cmap=plt.cm.get_cmap('nipy_spectral', 4))
plt.colorbar();

**Note:** By visual inspection, there does not appear to be a natural separation of the 4 classes.

In [None]:
# Scatter the 2-D PCA projection of the TF-IDF features, coloured by encoded
# class. `plt.get_cmap` replaces `plt.cm.get_cmap`, which was deprecated in
# Matplotlib 3.7 and removed in 3.9; its second argument resamples the
# colormap to 4 discrete colours.
plt.scatter(X_tfidf_proj[:, 0], X_tfidf_proj[:, 1], c=y, edgecolor='none', alpha=0.5,
            cmap=plt.get_cmap('nipy_spectral', 4))
plt.colorbar();  # trailing semicolon suppresses the echoed return value

## Data splitting

**Either:** Split data into train and test set. **Or:** load readily split data.

In [None]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# lives in `sklearn.model_selection` since 0.18. Fall back to the old module
# only on installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [None]:
# Split both feature matrices and the labels in ONE call so that every output
# shares the same row permutation. Two separate unseeded calls shuffle
# differently, and the second call's y_train/y_test would silently replace the
# first's, leaving X_tf_train/X_tf_test misaligned with the labels.
# Pass random_state=<int> (e.g. random_state=8) for a reproducible split.
X_tf_train, X_tf_test, X_tfidf_train, X_tfidf_test, y_train, y_test = train_test_split(
    X_tf, X_tfidf, y, test_size=0.33)

In [None]:
# Sanity-check the split sizes: term-frequency train/test matrices,
# followed by the train/test label vectors.
print(X_tf_train.shape)
print(X_tf_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Sanity-check the split sizes: TF-IDF train/test matrices,
# followed by the train/test label vectors.
print(X_tfidf_train.shape)
print(X_tfidf_test.shape)
print(y_train.shape)
print(y_test.shape)

## Multinomial Naive Bayesian:
### Model Fitting

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [None]:
# Multinomial naive Bayes on the term-frequency features; alpha is the
# additive smoothing parameter. fit() returns the estimator itself.
clf_tf = MultinomialNB(alpha=0.1).fit(X_tf_train, y_train)
clf_tf  # echo the fitted estimator

In [None]:
# Multinomial naive Bayes on the TF-IDF features; alpha is the additive
# smoothing parameter. The model is fitted on X_tfidf_train (not X_tf_train)
# so that it is trained on the same representation as the X_tfidf_test
# matrix it is later asked to predict.
clf_tfidf = MultinomialNB(alpha=0.1)
clf_tfidf.fit(X_tfidf_train, y_train)

In [None]:
# Per-class sample counts seen while fitting, followed by the class labels.
print(clf_tf.class_count_, clf_tf.classes_)

In [None]:
# Per-class sample counts seen while fitting, followed by the class labels.
print(clf_tfidf.class_count_, clf_tfidf.classes_)

In [None]:
# Log-probability of each feature given each class, log P(feature | class):
# one row per class, one column per vocabulary term. `feature_log_prob_` is
# printed instead of `coef_`, which MultinomialNB deprecated in scikit-learn
# 0.24 and removed in 1.1; for more than two classes both held this matrix.
print(clf_tf.feature_log_prob_)

In [None]:
# Log-probability of each feature given each class, log P(feature | class):
# one row per class, one column per vocabulary term. `feature_log_prob_` is
# printed instead of `coef_`, which MultinomialNB deprecated in scikit-learn
# 0.24 and removed in 1.1; for more than two classes both held this matrix.
print(clf_tfidf.feature_log_prob_)

In [None]:
# Display the accumulated per-class feature totals gathered during fitting
# (rows: classes, columns: features).
clf_tf.feature_count_

In [None]:
# Display the accumulated per-class feature totals gathered during fitting
# (rows: classes, columns: features).
clf_tfidf.feature_count_

In [None]:
# Empirical log-probability of each feature given each class
# (rows: classes, columns: features).
clf_tf.feature_log_prob_ # how to interpret? Same as clf_tf.coef_

In [None]:
# Empirical log-probability of each feature given each class
# (rows: classes, columns: features).
clf_tfidf.feature_log_prob_ # how to interpret? Same as clf_tfidf.coef_

## Multinomial Naive Bayesian:
### Classification

**Term Frequency**

**Note:** Not too impressive.

In [None]:
# Predict encoded class labels for the held-out term-frequency samples.
y_tf_pred = clf_tf.predict(X_tf_test)

In [None]:
 # Text summary of per-class metrics for the term-frequency predictions.
 # NOTE(review): this line starts with a stray space; IPython appears to tolerate it, but a plain .py export would not — consider removing.
 print(metrics.classification_report(y_test, y_tf_pred))

In [None]:
# Confusion matrix: true labels (first argument) against the term-frequency predictions.
print(metrics.confusion_matrix(y_test, y_tf_pred))

**Term Frequency - Inverse Document Frequency**

**Note:** Not too impressive.

In [None]:
# Predict encoded class labels for the held-out TF-IDF samples.
y_tfidf_pred = clf_tfidf.predict(X_tfidf_test)

In [None]:
 # Text summary of per-class metrics for the TF-IDF naive Bayes predictions.
 # NOTE(review): this line starts with a stray space; IPython appears to tolerate it, but a plain .py export would not — consider removing.
 print(metrics.classification_report(y_test, y_tfidf_pred))

In [None]:
# Confusion matrix: true labels (first argument) against the TF-IDF naive Bayes predictions.
print(metrics.confusion_matrix(y_test, y_tfidf_pred))

## SVM: 
### 1 of 2: linear kernel

When classes are imbalanced, set `class_weight` to `'balanced'`.

In [None]:
from sklearn.svm import SVC  # "Support Vector Classifier"

# Linear-kernel SVM on the TF-IDF features with balanced class weights.
# fit() returns the estimator itself.
clf_svm_linear = SVC(class_weight='balanced', kernel='linear').fit(X_tfidf_train, y_train)
clf_svm_linear  # echo the fitted estimator

In [None]:
# Predict encoded class labels for the held-out TF-IDF samples with the linear SVM.
y_svm_linear_pred = clf_svm_linear.predict(X_tfidf_test)

In [None]:
 # Text summary of per-class metrics for the linear-SVM predictions.
 # NOTE(review): this line starts with a stray space; IPython appears to tolerate it, but a plain .py export would not — consider removing.
 print(metrics.classification_report(y_test, y_svm_linear_pred))

In [None]:
# Confusion matrix: true labels (first argument) against the linear-SVM predictions.
print(metrics.confusion_matrix(y_test, y_svm_linear_pred))

### 2 of 2: Radial Basis Function Kernel

In [None]:
# RBF-kernel SVM on the TF-IDF features with balanced class weights.
# fit() returns the estimator itself.
clf_svm_rbf = SVC(class_weight='balanced', kernel='rbf').fit(X_tfidf_train, y_train)
clf_svm_rbf  # echo the fitted estimator

In [None]:
# Predict encoded class labels for the held-out TF-IDF samples with the RBF SVM.
y_svm_rbf_pred = clf_svm_rbf.predict(X_tfidf_test)

In [None]:
 # Text summary of per-class metrics for the RBF-SVM predictions.
 # NOTE(review): this line starts with a stray space; IPython appears to tolerate it, but a plain .py export would not — consider removing.
 print(metrics.classification_report(y_test, y_svm_rbf_pred))

In [None]:
# Confusion matrix: true labels (first argument) against the RBF-SVM predictions.
print(metrics.confusion_matrix(y_test, y_svm_rbf_pred))

**Note:** The above metrics look suspicious.

## k-NN:

In [None]:
from sklearn.neighbors import KNeighborsClassifier

`k = 3, weights = 'uniform'` 

In [None]:
# 3 nearest neighbours, each with an equal vote, on the TF-IDF features.
knn = KNeighborsClassifier(weights='uniform', n_neighbors=3)
clf_knn = knn.fit(X_tfidf_train, y_train)  # fit() hands back the estimator
y_knn_pred = clf_knn.predict(X_tfidf_test)  # labels for the held-out samples

In [None]:
 # Text summary of per-class metrics for the current k-NN predictions.
 # NOTE(review): this line starts with a stray space; IPython appears to tolerate it, but a plain .py export would not — consider removing.
 print(metrics.classification_report(y_test, y_knn_pred))

In [None]:
# Confusion matrix: true labels (first argument) against the current k-NN predictions.
print(metrics.confusion_matrix(y_test, y_knn_pred))

`k = 3, weights = 'distance'` 

In [None]:
# 3 nearest neighbours with distance-based vote weighting, on the TF-IDF features.
knn = KNeighborsClassifier(weights='distance', n_neighbors=3)
clf_knn = knn.fit(X_tfidf_train, y_train)  # fit() hands back the estimator
y_knn_pred = clf_knn.predict(X_tfidf_test)  # labels for the held-out samples

In [None]:
 # Text summary of per-class metrics for the current k-NN predictions.
 # NOTE(review): this line starts with a stray space; IPython appears to tolerate it, but a plain .py export would not — consider removing.
 print(metrics.classification_report(y_test, y_knn_pred))

In [None]:
# Confusion matrix: true labels (first argument) against the current k-NN predictions.
print(metrics.confusion_matrix(y_test, y_knn_pred))

`k = 5, weights = 'distance'` 

In [None]:
# 5 nearest neighbours with distance-based vote weighting, on the TF-IDF features.
knn = KNeighborsClassifier(weights='distance', n_neighbors=5)
clf_knn = knn.fit(X_tfidf_train, y_train)  # fit() hands back the estimator
y_knn_pred = clf_knn.predict(X_tfidf_test)  # labels for the held-out samples

In [None]:
 # Text summary of per-class metrics for the current k-NN predictions.
 # NOTE(review): this line starts with a stray space; IPython appears to tolerate it, but a plain .py export would not — consider removing.
 print(metrics.classification_report(y_test, y_knn_pred))

In [None]:
# Confusion matrix: true labels (first argument) against the current k-NN predictions.
print(metrics.confusion_matrix(y_test, y_knn_pred))

`k = 7, weights = 'distance'` 

In [None]:
# 7 nearest neighbours with distance-based vote weighting, on the TF-IDF features.
knn = KNeighborsClassifier(weights='distance', n_neighbors=7)
clf_knn = knn.fit(X_tfidf_train, y_train)  # fit() hands back the estimator
y_knn_pred = clf_knn.predict(X_tfidf_test)  # labels for the held-out samples

In [None]:
 # Text summary of per-class metrics for the current k-NN predictions.
 # NOTE(review): this line starts with a stray space; IPython appears to tolerate it, but a plain .py export would not — consider removing.
 print(metrics.classification_report(y_test, y_knn_pred))

In [None]:
# Confusion matrix: true labels (first argument) against the current k-NN predictions.
print(metrics.confusion_matrix(y_test, y_knn_pred))