# Dataset
The dataset used as a sample in this notebook is the [Twenty Newsgroups Dataset](https://archive.ics.uci.edu/ml/datasets/Twenty+Newsgroups) from the open-source UCI Machine Learning Repository. For the purpose of this exercise, a cleaned version of this dataset is imported through the [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html?highlight=newsgroup#sklearn.datasets.fetch_20newsgroups) API.

In [None]:
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
import pandas as pd

In [None]:
# Load every post (train + test) without shuffling. `remove` strips the
# metadata blocks so that later text processing sees only the message bodies.
# The third entry must be spelled 'quotes': a misspelled name is not rejected,
# it is simply ignored, which leaves quoted replies in the text.
# random_state only influences shuffling, so it has no effect while shuffle=False.
dataset = fetch_20newsgroups(
    subset='all',
    shuffle=False,
    random_state=32,
    remove=('headers', 'footers', 'quotes'),
)

In [None]:
# List the fields available on the loaded dataset object.
dataset.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [None]:
# Preview only the first few raw documents; displaying the full list would
# dump every post in the corpus into the cell output.
dataset['data'][:3]

In [None]:
# Pretty-print the newsgroup names; an integer label is used as an index into this list.
pprint(dataset.target_names)

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [None]:
# Number of distinct newsgroups (classes).
len(dataset.target_names)

20

In [None]:
# Shape of the label array: one integer label per document.
dataset.target.shape

(18846,)

In [None]:
# Shape of the filenames array: one entry per document, matching the label array.
dataset.filenames.shape

(18846,)

In [None]:
# Tabular view of the corpus: raw post text alongside its integer class label.
dataset_df = pd.DataFrame(
    dict(
        News=dataset.data,
        Label=dataset.target,
    )
)

In [None]:
# Display the frame (pandas elides the middle rows in the rendered output).
dataset_df

Unnamed: 0,News,Label
0,gajarsky@pilot.njin.net writes:\n\nmorgan and ...,9
1,"Well, I just got my Centris 610 yesterday. It...",4
2,Archive-name: cryptography-faq/part10\nLast-mo...,11
3,> ATTENTION: Mac Quadra owners: Many storage i...,4
4,bobbe@vice.ICO.TEK.COM (Robert Beauchaine) wri...,0
...,...,...
18841,\nWhy are circuit boards green? The material ...,12
18842,In article <1r941o$3tu@menudo.uh.edu> inde7wv@...,8
18843,We were told that the resolution on the 5FGe c...,4
18844,CAD Setup For Sale:\n\nG486PLB Local Bus Mothe...,6


In [None]:
# Translate each integer label into its human-readable newsgroup name.
dataset_df['Label_name'] = dataset_df['Label'].map(
    lambda label_id: dataset.target_names[label_id]
)

In [None]:
# Display the frame to check the newly added Label_name column.
dataset_df

Unnamed: 0,News,Label,Label_name
0,gajarsky@pilot.njin.net writes:\n\nmorgan and ...,9,rec.sport.baseball
1,"Well, I just got my Centris 610 yesterday. It...",4,comp.sys.mac.hardware
2,Archive-name: cryptography-faq/part10\nLast-mo...,11,sci.crypt
3,> ATTENTION: Mac Quadra owners: Many storage i...,4,comp.sys.mac.hardware
4,bobbe@vice.ICO.TEK.COM (Robert Beauchaine) wri...,0,alt.atheism
...,...,...,...
18841,\nWhy are circuit boards green? The material ...,12,sci.electronics
18842,In article <1r941o$3tu@menudo.uh.edu> inde7wv@...,8,rec.motorcycles
18843,We were told that the resolution on the 5FGe c...,4,comp.sys.mac.hardware
18844,CAD Setup For Sale:\n\nG486PLB Local Bus Mothe...,6,misc.forsale


# Dataset preprocessing

In [None]:
%%capture
!pip install -U gensim

In [None]:
from gensim.utils import tokenize
from gensim.parsing.preprocessing import preprocess_string,strip_tags,strip_punctuation,strip_numeric,remove_stopwords,strip_short
from gensim.corpora.dictionary import Dictionary
from gensim import models

In [None]:
# Show the docstring of preprocess_string, including its default filter list.
help(preprocess_string)

Help on function preprocess_string in module gensim.parsing.preprocessing:

preprocess_string(s, filters=[<function <lambda> at 0x7f6163fb0680>, <function strip_tags at 0x7f6163fb0290>, <function strip_punctuation at 0x7f6163fb0200>, <function strip_multiple_whitespaces at 0x7f6163fb04d0>, <function strip_numeric at 0x7f6163fb03b0>, <function remove_stopwords at 0x7f6163fb0170>, <function strip_short at 0x7f6163fb0320>, <function stem_text at 0x7f6163fb05f0>])
    Apply list of chosen filters to `s`.
    
    Default list of filters:
    
    * :func:`~gensim.parsing.preprocessing.strip_tags`,
    * :func:`~gensim.parsing.preprocessing.strip_punctuation`,
    * :func:`~gensim.parsing.preprocessing.strip_multiple_whitespaces`,
    * :func:`~gensim.parsing.preprocessing.strip_numeric`,
    * :func:`~gensim.parsing.preprocessing.remove_stopwords`,
    * :func:`~gensim.parsing.preprocessing.strip_short`,
    * :func:`~gensim.parsing.preprocessing.stem_text`.
    
    Parameters
    -------

In [None]:
# Tokenise every post with gensim's default filter chain (which includes stemming).
dataset_df['Clean_news'] = dataset_df['News'].map(preprocess_string)

In [None]:
# Display the frame to check the stemmed tokens in Clean_news.
dataset_df

Unnamed: 0,News,Label,Label_name,Clean_news
0,gajarsky@pilot.njin.net writes:\n\nmorgan and ...,9,rec.sport.baseball,"[gajarski, pilot, njin, net, write, morgan, gu..."
1,"Well, I just got my Centris 610 yesterday. It...",4,comp.sys.mac.hardware,"[got, centri, yesterdai, took, week, place, or..."
2,Archive-name: cryptography-faq/part10\nLast-mo...,11,sci.crypt,"[archiv, cryptographi, faq, modifi, faq, sci, ..."
3,> ATTENTION: Mac Quadra owners: Many storage i...,4,comp.sys.mac.hardware,"[attent, mac, quadra, owner, storag, industri,..."
4,bobbe@vice.ICO.TEK.COM (Robert Beauchaine) wri...,0,alt.atheism,"[bobb, vice, ico, tek, com, robert, beauchain,..."
...,...,...,...,...
18841,\nWhy are circuit boards green? The material ...,12,sci.electronics,"[circuit, board, green, materi, goe, name, cir..."
18842,In article <1r941o$3tu@menudo.uh.edu> inde7wv@...,8,rec.motorcycles,"[articl, indewv, rosi, edu, write, bike, lucki..."
18843,We were told that the resolution on the 5FGe c...,4,comp.sys.mac.hardware,"[told, resolut, fge, anybodi, tri, run, higher..."
18844,CAD Setup For Sale:\n\nG486PLB Local Bus Mothe...,6,misc.forsale,"[cad, setup, sale, gplb, local, bu, motherboar..."


In [None]:
# Custom filter chain: lowercase, then strip tags, punctuation, digits,
# stopwords and short tokens. No stemming step, so tokens stay as whole words.
filters = [
    str.lower,
    strip_tags,
    strip_punctuation,
    strip_numeric,
    remove_stopwords,
    strip_short,
]
dataset_df['Clean_news1'] = dataset_df['News'].map(
    lambda doc: preprocess_string(doc, filters)
)

In [None]:
# Display the frame to compare stemmed (Clean_news) and unstemmed (Clean_news1) tokens.
dataset_df

Unnamed: 0,News,Label,Label_name,Clean_news,Clean_news1
0,gajarsky@pilot.njin.net writes:\n\nmorgan and ...,9,rec.sport.baseball,"[gajarski, pilot, njin, net, write, morgan, gu...","[gajarsky, pilot, njin, net, writes, morgan, g..."
1,"Well, I just got my Centris 610 yesterday. It...",4,comp.sys.mac.hardware,"[got, centri, yesterdai, took, week, place, or...","[got, centris, yesterday, took, weeks, placing..."
2,Archive-name: cryptography-faq/part10\nLast-mo...,11,sci.crypt,"[archiv, cryptographi, faq, modifi, faq, sci, ...","[archive, cryptography, faq, modified, faq, sc..."
3,> ATTENTION: Mac Quadra owners: Many storage i...,4,comp.sys.mac.hardware,"[attent, mac, quadra, owner, storag, industri,...","[attention, mac, quadra, owners, storage, indu..."
4,bobbe@vice.ICO.TEK.COM (Robert Beauchaine) wri...,0,alt.atheism,"[bobb, vice, ico, tek, com, robert, beauchain,...","[bobbe, vice, ico, tek, com, robert, beauchain..."
...,...,...,...,...,...
18841,\nWhy are circuit boards green? The material ...,12,sci.electronics,"[circuit, board, green, materi, goe, name, cir...","[circuit, boards, green, material, goes, names..."
18842,In article <1r941o$3tu@menudo.uh.edu> inde7wv@...,8,rec.motorcycles,"[articl, indewv, rosi, edu, write, bike, lucki...","[article, indewv, rosie, edu, writes, bike, lu..."
18843,We were told that the resolution on the 5FGe c...,4,comp.sys.mac.hardware,"[told, resolut, fge, anybodi, tri, run, higher...","[told, resolution, fge, anybody, tried, runnin..."
18844,CAD Setup For Sale:\n\nG486PLB Local Bus Mothe...,6,misc.forsale,"[cad, setup, sale, gplb, local, bu, motherboar...","[cad, setup, sale, gplb, local, bus, motherboa..."


In [None]:
# Build the token <-> integer id vocabulary from the unstemmed token lists.
dataset_dictionary = Dictionary(documents=dataset_df['Clean_news1'])

In [None]:
# Vocabulary size: number of entries in the dictionary.
len(dataset_dictionary)

96460

In [None]:
# Preview a handful of token -> id pairs; printing the whole mapping would
# flood the cell output with one entry per vocabulary token.
print(list(dataset_dictionary.token2id.items())[:10])



In [None]:
# Bag-of-words corpus: each document becomes a list of (token_id, count) pairs.
dataset_corpus_bow = list(map(dataset_dictionary.doc2bow, dataset_df['Clean_news1']))

In [None]:
# Sanity check: one bag-of-words vector per document.
len(dataset_corpus_bow)

18846

In [None]:
# Inspect the second document as (token_id, count) pairs.
print(dataset_corpus_bow[1])

[(22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 2), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 2), (44, 3), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 2), (52, 1), (53, 2), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 2), (66, 1), (67, 1), (68, 2), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 2), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 2), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1)]


In [None]:
# Fit TF-IDF statistics on the bag-of-words corpus, then re-weight that same
# corpus by indexing the fitted model with it.
tfidf = models.TfidfModel(corpus=dataset_corpus_bow)
dataset_corpus_tfidf = tfidf[dataset_corpus_bow]

In [None]:
# Sanity check: the TF-IDF corpus has the same number of documents as the bag-of-words corpus.
len(dataset_corpus_tfidf)

18846

In [None]:
# Inspect the second document as (token_id, tf-idf weight) pairs.
print(dataset_corpus_tfidf[1])

[(22, 0.12794312043780054), (23, 0.1032933602529823), (24, 0.1437906445046912), (25, 0.19446130648981633), (26, 0.09972437101248886), (27, 0.19446130648981633), (28, 0.056593976938038), (29, 0.09712742378308543), (30, 0.11391287593244794), (31, 0.08578843198010519), (32, 0.19446130648981633), (33, 0.10142837384435295), (34, 0.09687548915310225), (35, 0.10125120507490903), (36, 0.09133742977605598), (37, 0.061307837508357034), (38, 0.10891119725810347), (39, 0.06606330836365701), (40, 0.08855334717656915), (41, 0.07391023272465086), (42, 0.19446130648981633), (43, 0.11381832135728932), (44, 0.3110653713924378), (45, 0.11566217615948503), (46, 0.0969847936661337), (47, 0.0694228263237942), (48, 0.09383842074796034), (49, 0.09300779209433964), (50, 0.0455655162646588), (51, 0.10391838009790301), (52, 0.07696272067182856), (53, 0.10763740852872361), (54, 0.05222277834588352), (55, 0.05433687654103351), (56, 0.05404373403325169), (57, 0.15338363279035241), (58, 0.08979736303341844), (59, 0.