# Load Packages

In [1]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import fetch_20newsgroups
import nltk
nltk.download('wordnet')
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/co

# Load Data

In [2]:
# Load the dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

In [3]:
df = pd.DataFrame({
    'Text': newsgroups.data,
    'Category': newsgroups.target
})
df['Category Name'] = df['Category'].apply(lambda x: newsgroups.target_names[x])
df

Unnamed: 0,Text,Category,Category Name
0,\n\nI am sure some bashers of Pens fans are pr...,10,rec.sport.hockey
1,My brother is in the market for a high-perform...,3,comp.sys.ibm.pc.hardware
2,\n\n\n\n\tFinally you said what you dream abou...,17,talk.politics.mideast
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3,comp.sys.ibm.pc.hardware
4,1) I have an old Jasmine drive which I cann...,4,comp.sys.mac.hardware
...,...,...,...
18841,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,13,sci.med
18842,\nNot in isolated ground recepticles (usually ...,12,sci.electronics
18843,I just installed a DX2-66 CPU in a clone mothe...,3,comp.sys.ibm.pc.hardware
18844,\nWouldn't this require a hyper-sphere. In 3-...,1,comp.graphics


# Preprocessing

In [5]:
# Lowercasing
df['Text'] = df['Text'].str.lower()

# Remove Punctuation and special characters
df['Text'] = df['Text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

# Tokenization
df['Tokens'] = df['Text'].apply(word_tokenize)

# Removing Stopwords
stop_words = set(stopwords.words('english'))
df['Tokens'] = df['Tokens'].apply(lambda tokens: [word for word in tokens if word not in stop_words])

# Lemmatization
lemmatizer = WordNetLemmatizer()
df['Tokens'] = df['Tokens'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

In [6]:
df

Unnamed: 0,Text,Category,Category Name,Tokens
0,\n\ni am sure some bashers of pens fans are pr...,10,rec.sport.hockey,"[sure, bashers, pen, fan, pretty, confused, la..."
1,my brother is in the market for a highperforma...,3,comp.sys.ibm.pc.hardware,"[brother, market, highperformance, video, card..."
2,\n\n\n\n\tfinally you said what you dream abou...,17,talk.politics.mideast,"[finally, said, dream, mediterranean, new, are..."
3,\nthink\n\nits the scsi card doing the dma tra...,3,comp.sys.ibm.pc.hardware,"[think, scsi, card, dma, transfer, disk, scsi,..."
4,i have an old jasmine drive which i cannot...,4,comp.sys.mac.hardware,"[old, jasmine, drive, use, new, system, unders..."
...,...,...,...,...
18841,dn from nyedacnsvaxuwecedu david nye\ndn a neu...,13,sci.med,"[dn, nyedacnsvaxuwecedu, david, nye, dn, neuro..."
18842,\nnot in isolated ground recepticles usually a...,12,sci.electronics,"[isolated, ground, recepticles, usually, unusu..."
18843,i just installed a dx cpu in a clone motherboa...,3,comp.sys.ibm.pc.hardware,"[installed, dx, cpu, clone, motherboard, tried..."
18844,\nwouldnt this require a hypersphere in space...,1,comp.graphics,"[wouldnt, require, hypersphere, space, point, ..."


# Effect of embeddings on the model performance

**Examine how different embeddings trained on specialised domain affect model performance.**

With a tf-idf baseline, campare the performance of various embedding models:
1. GloVe (General)
2. FastText (Wiki, web crawl, news)
3. Google News Word2Vec (Google News dataset)
4. BERT (Contextual Embeddings)

# LSTM vs CNN for Text Classification (+EDA of data) [Anushka]

**Compare the performances of LSTM vs CNN. LSTM capture long-range dependencies, may out perform CNN. CNNs could be more efficient if the corpus is shorter.**

Look at distribution of classes, text length, word cloud.

# Bidirectional LSTM vs Unidirectional LSTM [Vivien]

**Bidirectional LSTMs provide better context understanding and perform better than unidirectional LSTMs**

# Examine the effect of Dropout Rate/Regularization/Batch Norm/Data Augmentation(back translation, random word insertion/deletion) on Model Generalization [Breandon]

**Train models with varying dropout rates and with and without regularization observe how the model performance is affected on test set. Dropout is typically used to prevent overfitting.**

# Compare effectivness of combining CNN and LSTM Architectures [Leon]

**Hybrid CNN-LSTM architecture may outperform individual CNN/LSTM models**

# Effect of Attention Mechanism on Model Performance [YC]

**Attention model might improve model performance espeically for longer sequences, by helping the model focus on important parts of the corpus.**