## Load data

In [None]:
# Raw strings keep the Windows backslashes literal; in a normal string "\D" and
# "\s" are invalid escape sequences that newer Python versions warn about.
# NOTE(review): both names point at the same file, so the model would be
# evaluated on its own training data -- confirm these are placeholders.
file_test = r'C:\Data\someData.tsv'
file_train = r'C:\Data\someData.tsv'

In [None]:
import pandas as pd
# Both files are tab-separated with no header row, so the two columns are named explicitly.
test = pd.read_csv(file_test, quotechar='"', delimiter='\t', header=None, names=['Label', 'Text'])
train = pd.read_csv(file_train, quotechar='"', delimiter='\t', header=None, names=['Label', 'Text'])
# Preview the first rows of both frames. `head` has to be called -- the bare
# attribute only shows a bound-method repr -- and only the last expression of a
# cell is rendered, so the two previews are stacked into a single output.
pd.concat([test.head(), train.head()], keys=['test', 'train'])

## Vectorise text data into numerical vectors

### Using `CountVectorizer`

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Unigram word counts with English stop words removed; terms that occur in more
# than 90% or fewer than 1% of the documents are dropped from the vocabulary.
tf_vectorizer = CountVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 1),
    max_df=0.9,
    min_df=0.01,
)
# Learn the vocabulary from the training text and build its document-term matrix.
train_tf_vectors = tf_vectorizer.fit_transform(train['Text'])

**Note:** Numbers are not removed from the text, so numeric tokens end up in the vocabulary. Consider filtering them out.

`tf_vectorizer.vocabulary_` is a `dict` that maps each term to its column index in the feature matrix.

In [None]:
# Show the vocabulary learned by the fitted vectorizer (term -> column index).
tf_vectorizer.vocabulary_

In [None]:
# Dimensions of the training document-term matrix: (documents, vocabulary terms).
train_tf_vectors.shape

In [None]:
# Array of the explicitly stored entries of the sparse matrix (the term counts).
train_tf_vectors.data

In [None]:
# Sparsity summary of the training document-term matrix.
n_rows, n_cols = train_tf_vectors.shape
stored = train_tf_vectors.nnz
print(stored)  # number of stored (non-zero) entries
print(stored / float(n_rows))  # average non-zero entries per sample
print(stored / float(n_rows * n_cols) * 100)  # share of non-zero cells in the feature space, in %

## Data Preprocessing: 
### Attributes and Target

### Convert categorical labels into numerical labels

In [None]:
from sklearn import preprocessing

In [None]:
# Fit the encoder on the training labels (fit() returns the encoder itself).
le = preprocessing.LabelEncoder().fit(train['Label'])
# Show the distinct classes the encoder learned.
list(le.classes_)

In [None]:
# Encode the categorical labels of both splits with the encoder fitted on train.
y_train = le.transform(train['Label'])
y_test = le.transform(test['Label'])

# Sanity check: the first five encoded labels and the labels they decode back to.
head_codes = y_train[:5]
print(head_codes)
print(le.inverse_transform(head_codes))

`X` is data, `y` is target. `X` has the size of `n_samples` x `n_features`, `y` has `n_samples` x 1 labels

In [None]:
# Reuse the document-term matrix built during vectorisation as the training features.
X_train = train_tf_vectors

In [None]:
# Only transform (no refit): the test text is projected onto the vocabulary
# that was learned from the training text.
X_test = tf_vectorizer.transform(test['Text'])

In [None]:
# Dimensions of the test document-term matrix: (documents, vocabulary terms).
X_test.shape

In [None]:
# Array of the explicitly stored entries of the sparse test matrix (the term counts).
X_test.data

## Logistic Regression

In [None]:
from sklearn import linear_model
# C is the inverse regularisation strength, so 1e5 means very weak regularisation;
# class_weight='balanced' reweights classes inversely to their frequency.
clf = linear_model.LogisticRegression(
    C=1e5,
    class_weight='balanced',
)
clf.fit(X_train, y_train)

In [None]:
# Predicted (encoded) class label for every test document.
y_pred = clf.predict(X_test)

In [None]:
from sklearn import metrics
# Per-class precision, recall, F1 and support on the test set.
report = metrics.classification_report(y_test, y_pred)
print(report)

In [None]:
# Rows are the true classes, columns the predicted classes.
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(conf_matrix)

### Coefficients of the decision function

In [None]:
# Full coefficient array of the fitted model.
clf.coef_

In [None]:
# Number of coefficients in one row, i.e. the width of the coefficient matrix.
clf.coef_.shape[1]

Get the index of the coefficient with the maximum value.

In [None]:
# Position of the largest coefficient; argmax() without an axis indexes the flattened array.
clf.coef_.argmax()

In [None]:
# argmax() without an axis returns a position in the *flattened* coefficient
# array. Reduce it to a column index so it is a valid feature index even when
# coef_ has more than one row; with a single row the value is unchanged.
m = clf.coef_.argmax() % clf.coef_.shape[1]
# Invert the term -> column mapping once instead of building two lists and
# scanning them linearly.
index_to_term = {index: term for term, index in tf_vectorizer.vocabulary_.items()}
print(index_to_term[m])

Get the indices of the N highest coefficient values (N = 20 in the code below).

**Note:** `clf.coef_` is a 2-D NumPy `ndarray`, so its first row is selected with `clf.coef_[0]`; see below:

In [None]:
# First row of the coefficient matrix.
clf.coef_[0]

In [None]:
# First two coefficients of the first row.
first_row = clf.coef_[0]
print(first_row[0])
print(first_row[1])

Get top N words

In [None]:
import numpy as np
# Sort the feature indices by coefficient (ascending), reverse them so the
# largest coefficient comes first, and keep the first 20.
ascending = np.argsort(clf.coef_[0])
topN = ascending[::-1][:20]
print(topN)

In [None]:
# Invert the term -> column mapping once, rather than rebuilding two lists and
# scanning them linearly for every index.
index_to_term = {index: term for term, index in tf_vectorizer.vocabulary_.items()}
for v in topN:
    print(index_to_term[v])


Get bottom N words

In [None]:
# argsort is ascending, so the first 20 indices belong to the lowest coefficients.
ascending = np.argsort(clf.coef_[0])
bottomN = ascending[:20]
print(bottomN)

In [None]:
# Invert the term -> column mapping once, rather than rebuilding two lists and
# scanning them linearly for every index.
index_to_term = {index: term for term, index in tf_vectorizer.vocabulary_.items()}
for v in bottomN:
    print(index_to_term[v])
