# Natural Language Processing (dataset: 20 Newsgroups)
- Stem word: `speak` for speaking, spoken, speaks, etc. The ML model should treat all of these words as the same word.
    - Stemming: Crop suffixes from a word, e.g. remove `ing` from `speaking` to get `speak`.
    - Lemmatization: Find the dictionary form (lemma) of each word.
- `n_jobs = -1`: Use parallel processing on all available CPU cores.

In [None]:
import nltk
from sklearn.datasets import fetch_20newsgroups             # 20 categories of news
import numpy as np
from collections import defaultdict                         # Does not throw KeyError, unlike dict = {}
from nltk.stem import WordNetLemmatizer
from nltk.corpus import names                               # 5001 female and 2943 male names
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import timeit

In [None]:
# Fetch the NLTK corpora used below: `names` supplies the first names that
# are filtered out of the posts, `wordnet` backs WordNetLemmatizer.
nltk.download('names')
nltk.download('wordnet')                                         # Graph of words

# NOTE(review): `groups` is not referenced by any cell visible here; it is
# kept in case code elsewhere relies on it.
groups = fetch_20newsgroups()
data_train = fetch_20newsgroups(subset='train', random_state=21) # This subset contains the training data.
train_label = data_train.target                                  # Labels of the training posts
data_test = fetch_20newsgroups(subset='test', random_state=21)   # This subset contains the testing data.
test_label = data_test.target                                    # Labels of the testing posts

# Sanity checks on the loaded data. They are printed explicitly because a
# notebook only echoes the value of the last expression in a cell, so a bare
# expression in the middle of the cell would be silently discarded.
print(len(data_train.data), len(data_test.data), len(test_label))
print(np.unique(test_label))

all_names = names.words()       # First names to drop from the posts
WNL = WordNetLemmatizer()       # Shared lemmatizer instance

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
def clean(data, excluded_names=None, lemmatize=None):
    """Normalise raw documents into space-separated, lemmatized tokens.

    Each document is split on whitespace. A token is kept only if it is
    purely alphabetic and is not one of the excluded names; kept tokens are
    lower-cased and then lemmatized. The name check runs on the token as
    written, before lower-casing, so it is case-sensitive.

    Args:
        data: Iterable of document strings.
        excluded_names: Iterable of tokens to drop. Defaults to the
            module-level ``all_names``.
        lemmatize: Callable mapping a lower-cased word to its lemma.
            Defaults to ``WNL.lemmatize``.

    Returns:
        A list with one cleaned string per input document, in input order.
        A document with no surviving tokens yields an empty string.
    """
    if excluded_names is None:
        excluded_names = all_names
    if lemmatize is None:
        lemmatize = WNL.lemmatize

    # Build the lookup once per call: membership in a set is O(1), whereas
    # scanning a list of names for every token is O(len(names)).
    excluded = set(excluded_names)

    cleaned = []
    for document in data:
        kept = [
            lemmatize(token.lower())
            for token in document.split()
            if token.isalpha() and token not in excluded
        ]
        cleaned.append(' '.join(kept))
    return cleaned

# Normalise the raw posts of both subsets (training first, then testing).
x_train, x_test = (clean(subset.data) for subset in (data_train, data_test))

# TF-IDF features restricted to 1000 terms, with English stop words removed.
# The vectorizer is fitted on the training text only and then re-used
# unchanged on the test text.
tf = TfidfVectorizer(max_features=1000, stop_words='english')
X_train = tf.fit_transform(x_train)
X_test = tf.transform(x_test)
X_train.shape, X_test.shape

((11314, 1000), (7532, 1000))

In [None]:
# Linear-kernel SVM; the grid search tries four candidate values of C with
# 3-fold cross-validation, running the fits in parallel (n_jobs=-1).
svc_lib = SVC(kernel='linear')
parameters = dict(C=(0.5, 1.0, 10, 100))
grid_search1 = GridSearchCV(
    estimator=svc_lib,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
)

# Time the complete search; `final` holds the elapsed time.
start_time = timeit.default_timer()
grid_search1.fit(X_train, train_label)
final = timeit.default_timer() - start_time

In [None]:
# Report how long the search took, then the winning parameters and their
# cross-validation score (one value per line).
print("Execution Time : ", final)
print(grid_search1.best_params_, grid_search1.best_score_, sep="\n")

# Score the best estimator found by the search on the held-out test set.
grid_search_best1 = grid_search1.best_estimator_
accuracy = grid_search_best1.score(X_test, test_label)
print(accuracy)

Execution Time :  240.55116530200007
{'C': 1.0}
0.7209652808886706
0.6274561869357408
