# Baseline Model


- [x] Logistic Regression + CountVectorizer
- [x] Logistic Regression + TFIDF
- [x] Naive Bayes

In [1]:
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /home/kurianbenoy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
df = pd.read_csv("../data/all_news.csv")
df.head()

Unnamed: 0,text,labels
0,ഇംഗ്ലീഷ് പ്രീമിയര്‍ ലീഗില്‍ ചെല്‍സിക്ക് ഗംഭീര ...,Sports
1,മുന്നണി വിപുലീകരണം സംബന്ധിച്ച് വ്യാഴാഴ്ച ചേരുന...,Kerala
2,തിരുവമ്പാടി കൃഷ്ണനെ സാക്ഷിയാക്കി നടി ഭാവനയ്ക്ക...,Kerala
3,അടുത്തവര്‍ഷത്തോടെ ഇന്ത്യയിലെ മുഴുവന്‍ ഗ്രാമങ്ങ...,Business
4,പൂര്‍ണമായും ഡോക്ടര്‍മാരുടെ ജീവിതം പ്രമേയമാക്കി...,Entertainment


In [4]:
df["labels"].value_counts()

Kerala           3847
Entertainment    1968
Sports           1061
Gulf             1034
India             881
Business          572
Name: labels, dtype: int64

## Word Tokenizer & logistic regression

In [5]:
count_vec = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)

In [6]:
%%time
count_vec.fit(df.text)

CPU times: user 15.3 s, sys: 196 ms, total: 15.5 s
Wall time: 15.5 s


In [7]:
%%time
xtrain = count_vec.transform(df.text)
model = linear_model.LogisticRegression()
model.fit(xtrain, df.labels)

CPU times: user 4min 27s, sys: 7.5 s, total: 4min 35s
Wall time: 1min 2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
data = {
    "text": [
        "സ്കൂബാഡൈവിങ്ങ്, സ്നോർക്കേലിങ്ങ്, സ്പീഡ്ബോട്ടിങ്ങ്, സർഫിങ്ങ് തുടങ്ങിയ കടൽവിനോദങ്ങൾക്കു പേരുകേട്ട ബാലിയിൽ പോയിട്ടും ഇതൊന്നും പരീക്ഷിച്ചില്ല.ധൈര്യം വരാത്തതുകൊണ്ടാണ്. ഇപ്പോൾ ആലോചിക്കുമ്പോൾ ഒരുകൈ നോക്കാമായിരുന്നെന്നുോന്നുന്നു. സാരമില്ല, ബാക്കിവെച്ച ആഗ്രഹങ്ങളാണല്ലോ മുന്നോട്ടുനീങ്ങാനുള്ള പ്രേരണ. അവസരങ്ങൾ ഇനിയുമുണ്ടാകുമെന്ന് കരുതുന്നു."
    ],
    "labels": ["sport"],
}
data

{'text': ['സ്കൂബാഡൈവിങ്ങ്, സ്നോർക്കേലിങ്ങ്, സ്പീഡ്ബോട്ടിങ്ങ്, സർഫിങ്ങ് തുടങ്ങിയ കടൽവിനോദങ്ങൾക്കു പേരുകേട്ട ബാലിയിൽ പോയിട്ടും ഇതൊന്നും പരീക്ഷിച്ചില്ല.ധൈര്യം വരാത്തതുകൊണ്ടാണ്. ഇപ്പോൾ ആലോചിക്കുമ്പോൾ ഒരുകൈ നോക്കാമായിരുന്നെന്നുോന്നുന്നു. സാരമില്ല, ബാക്കിവെച്ച ആഗ്രഹങ്ങളാണല്ലോ മുന്നോട്ടുനീങ്ങാനുള്ള പ്രേരണ. അവസരങ്ങൾ ഇനിയുമുണ്ടാകുമെന്ന് കരുതുന്നു.'],
 'labels': ['sport']}

In [9]:
test_df = pd.DataFrame(data)
test_df.head()

Unnamed: 0,text,labels
0,"സ്കൂബാഡൈവിങ്ങ്, സ്നോർക്കേലിങ്ങ്, സ്പീഡ്ബോട്ടിങ...",sport


In [10]:
test_df = df.sample(frac=0.1, random_state=1)
test_df.shape

(936, 2)

In [11]:
xtest = count_vec.transform(test_df.text)

In [12]:
preds = model.predict(xtest)
accuracy = metrics.accuracy_score(test_df.labels, preds)
accuracy

0.9989316239316239

In [13]:
def vectorize_evaluate_loop(train_df, test_df):
    count_vec = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
    count_vec.fit(train_df.text)
    dependent_train = count_vec.transform(train_df.text)
    model = linear_model.LogisticRegression()
    model.fit(dependent_train, train_df.labels)
    dependent_test = count_vec.transform(test_df.text)
    predictions = model.predict(dependent_test)
    return metrics.accuracy_score(test_df.labels, predictions)

In [14]:
%%time
vectorize_evaluate_loop(df, test_df)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


CPU times: user 4min 37s, sys: 6.69 s, total: 4min 43s
Wall time: 1min 17s


0.9989316239316239

#### K - fold validation

The function to create k-folds for testing training_validation accuracy. It's very important to create [good validation sets](https://www.fast.ai/2017/11/13/validation-sets/).

In [15]:
df["kfold"] = -1
df = df.sample(frac=1).reset_index(drop=True)

In [16]:
df.shape

(9363, 3)

In [17]:
Y_value = df.labels.values
kf = model_selection.StratifiedKFold(n_splits=5)

for fold, (text_, value_) in enumerate(kf.split(X=df, y=Y_value)):
    df.loc[value_, "kfold"] = fold

In [18]:
for fold_ in range(5):
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)
    print(f"Fold value: {fold_}")
    print(f"Accuracy: {vectorize_evaluate_loop(train_df, test_df)}")

Fold value: 0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.8873465029364656
Fold value: 1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.8777362520021356
Fold value: 2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.8916177255739456
Fold value: 3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.875
Fold value: 4


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.8669871794871795


## Naive Bayes Classifier

In [19]:
def naive_bayes_evaluate_loop(train_df, test_df):
    count_vec = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
    count_vec.fit(train_df.text)
    dependent_train = count_vec.transform(train_df.text)
    model = naive_bayes.MultinomialNB()
    model.fit(dependent_train, train_df.labels)
    dependent_test = count_vec.transform(test_df.text)
    predictions = model.predict(dependent_test)
    return metrics.accuracy_score(test_df.labels, predictions)

In [20]:
for fold_ in range(5):
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)
    print(f"Fold value: {fold_}")
    print(f"Accuracy: {naive_bayes_evaluate_loop(train_df, test_df)}")

Fold value: 0
Accuracy: 0.8441003737319808
Fold value: 1
Accuracy: 0.8227442605445809
Fold value: 2
Accuracy: 0.8296849973304858
Fold value: 3
Accuracy: 0.8269230769230769
Fold value: 4
Accuracy: 0.8183760683760684


## Tfidf Vectorizer

In [21]:
# tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
# tfv.fit(df.text)
# corpus_transformed = tfv.transform(corpus)

In [22]:
def tf_idf_evaluate_loop(train_df, test_df):
    count_vec = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
    count_vec.fit(train_df.text)
    dependent_train = count_vec.transform(train_df.text)
    model = linear_model.LogisticRegression()
    model.fit(dependent_train, train_df.labels)
    dependent_test = count_vec.transform(test_df.text)
    predictions = model.predict(dependent_test)
    return metrics.accuracy_score(test_df.labels, predictions)

In [23]:
for fold_ in range(5):
    train_df = df[df.kfold != fold_].reset_index(drop=True)
    test_df = df[df.kfold == fold_].reset_index(drop=True)
    print(f"Fold value: {fold_}")
    print(f"Accuracy: {tf_idf_evaluate_loop(train_df, test_df)}")

Fold value: 0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.8478376935397758
Fold value: 1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.8344901227976508
Fold value: 2
Accuracy: 0.8489054991991457
Fold value: 3
Accuracy: 0.8290598290598291
Fold value: 4


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.8253205128205128


## Results

I am genuinely surprised by the following:

1. A simple linear regression on this text classification class get's close to 87-89% accuracy
2. The state of art model claims to have got 92% accuracy based on training only on validation dataset, like we have done in sklearn it seems.

![image](https://user-images.githubusercontent.com/24592806/171474436-2e9a25d5-5e62-41a0-b249-e65a2df57585.png)

[Source](https://github.com/adamshamsudeen/Vaaku2Vec/blob/master/train_classifier.ipynb)

3. We haven't done any complex fine tuning or even label_encoding at the moment. Based on improving with some more tweaks, I suspect we maybe able to improve the score even further.