In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the cleaned training split and the cleaned validation split; the
# validation split is what the rest of the notebook refers to as "test".
training_data, testing_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/cleaned/training_data_cleaned.csv",
        "./data/cleaned/validation_data_cleaned.csv",
    )
)

In [3]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the logarithmic loss metric.

    :param actual: Either a 1-D array-like of integer class indices (one per
        sample) or a 2-D indicator (one-hot) matrix shaped like ``predicted``.
        Plain Python lists are accepted as well as NumPy arrays.
    :param predicted: 2-D array-like with one row per sample and one
        probability per class in each row.
    :param eps: Probabilities are clipped to ``[eps, 1 - eps]`` before the
        logarithm is taken, so a predicted 0 or 1 never produces ``inf``.
    :return: Negative mean log-likelihood of the true classes (lower is
        better).
    """
    # Coerce up front so lists and other array-likes work the same as ndarrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    # Convert class indices to a one-hot matrix if 'actual' is not one already.
    if actual.ndim == 1:
        n_labels = actual.shape[0]
        one_hot = np.zeros((n_labels, predicted.shape[1]))
        # One indexed assignment sets the column of the true class in every
        # row, replacing the per-sample Python loop.
        one_hot[np.arange(n_labels), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    n_samples = actual.shape[0]
    # Only the log-probability of the true class survives the element-wise
    # product with the indicator matrix.
    log_likelihood = np.sum(actual * np.log(clipped))
    return -1.0 / n_samples * log_likelihood

In [4]:
# Pull the text column (model input) and the Y column (target) out of each
# split as arrays.
xtrain, ytrain = training_data["text"].values, training_data["Y"].values
xtest, ytest = testing_data["text"].values, testing_data["Y"].values

In [5]:
# Sanity check: number of documents in the training and validation splits,
# one shape per line.
print(xtrain.shape, xtest.shape, sep="\n")

(45000,)
(15000,)


In [6]:
# Always start with these features. They work (almost) every time!
# Word-level TF-IDF over unigrams, bigrams and trigrams with English stop
# words removed. The boolean options are passed as real booleans because
# recent scikit-learn releases validate parameter types and reject ints.
tfv = TfidfVectorizer(
    min_df=3,                  # drop terms that occur in fewer than 3 documents
    max_features=None,         # no cap on vocabulary size
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',   # keep single-character tokens as well
    ngram_range=(1, 3),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,         # use 1 + log(tf) instead of the raw term count
    stop_words='english',
)

In [7]:
# Learn the TF-IDF vocabulary and IDF weights from the training and validation
# texts together (labels are not involved), then vectorise each split.
# NOTE(review): because the validation texts take part in the fit, the
# validation scores computed from these features may be slightly optimistic;
# confirm this is intended, otherwise fit on xtrain only.
tfv.fit([*xtrain, *xtest])
xtrain_tfv = tfv.transform(xtrain)
xtest_tfv = tfv.transform(xtest)

In [8]:
# Baseline model: logistic regression trained on the TF-IDF features and
# scored by log-loss on the validation split.
# NOTE(review): multiclass_logloss uses each label value as a column index
# into the probability matrix, so this presumably relies on the labels being
# 0..K-1 in the same order as clf.classes_ — verify against the data.
clf = LogisticRegression(max_iter=250).fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xtest_tfv)
print("logloss: %0.3f " % multiclass_logloss(ytest, predictions))

logloss: 0.687 


In [9]:
# Accuracy of the fitted classifier on the validation split, as a percentage.
validation_accuracy_pct = clf.score(xtest_tfv, ytest) * 100
print(f"Validation Accuracy of Logsitic Regression Classifier is: {validation_accuracy_pct:.2f}%")

Validation Accuracy of Logsitic Regression Classifier is: 69.55%
