## Preliminaries

#### Imports

In [2]:
import string
import os
import pickle
from collections import Counter 

import numpy as np
import pandas as pd

## used to tokenized the words
from nltk.tokenize import word_tokenize
## stem the words
from nltk.stem.porter import PorterStemmer
## stopwords
from nltk.corpus import stopwords
##  tfidf 
from sklearn.feature_extraction.text import TfidfVectorizer

import sys 
sys.path.append("../..")
from E4525_ML import text # you must have saved the file text.py into the E4525_ML directory

#### Data Directories

In [3]:
raw_data_dir=r"../../raw/C50/C50train" # original data set used for training
data_dir    =r"../../data/C50/"  # directory to save intermediate results

#### Convenience Function Definitions

A few functions carried over from the Text_Features notebook that we will need during this exercise.

In [4]:
def process_text(filename,stop): 
    '''
    @filename: the name of text file to read
    @stop: stop words 
    read text, stem the vocabularies, remove the stop words
    '''
    porter_stemmer = PorterStemmer()
    file=open(filename)
    lines=file.readlines()
    # use space to seperate lines and all change into lower letter
    text_str=" ".join(lines).replace("\n"," ").lower()
    stem_list=text.stem_tokenizer(text_str)
    # refine the list by taking off the stop words
    used_list=[token for token in stem_list if token not in stop]
    return used_list

In [5]:
def text_2_set(filename,stop_words):
    # return the set of stem words - each vocabulary has value 0/1 in each corpus
    stems=process_text(filename,stop_words)
    return set(stems)

In [6]:
def text_2_counts(filename,stop_words):
    # return the count of stem words
    stems=process_text(filename,stop_words)
    return Counter(stems)

In [7]:
def corpus_word_counts(documents,stop):
    '''
    @documents: a list of documents that we are interested in
    @stop: stop words
    return how many documents have each of the vocabularies 
    '''
    counts=Counter()
    for filename in documents["filename"]:   
        print("processing...",filename)
        bag=text_2_set(filename,stop)
        for word in bag:
            counts[word]+=1
    return pd.DataFrame.from_dict(counts,orient="index")

## Environment Preparation

<div class="alert alert-block alert-info"> Problem 0 </div>

1. Download the  [Reuters 50](https://archive.ics.uci.edu/ml/datasets/Reuter_50_50) collection of texts. Save it on the `raw` data directory.

    You should end up with this directory structure structure:
    
    raw/
        C50/
            C50train/
            C50test/
            
1. Run to completion the [Text Feature Extraction](./Text_Features.ipynb) notebook. This will generate the document lists, and word count statistics. Make sure to run any of the sections are are meant to be run only once.
1. Save the text.py python module into the `E4525_ML` directory.

## Implement TF-IDF document Distance with Sublinear Growth 

<div class="alert alert-block alert-info"> Problem 1.1 </div>

Read the list of documents in the file `C50_documents.csv`  from the data directory `data_dir` into a `documents` variable

In [8]:
def author_labels(directory):
    doc_labels=[]
    for author in os.listdir(directory):
        for filename in os.listdir(directory+"/"+author):
            filename=directory+"/"+author+"/"+filename
            doc_labels.append([filename,author])
    data=pd.DataFrame(doc_labels,columns=["filename","label"])
    return data

In [9]:
## used to return list of name in directory
os.listdir(raw_data_dir)[:5]

['RobinSidel',
 'LynnleyBrowning',
 'KouroshKarimkhany',
 'MichaelConnor',
 'JoeOrtiz']

In [10]:
documents_filename=data_dir+"/C50_documents.csv"

documents=author_labels(raw_data_dir)
documents.to_csv(documents_filename,index_label="document_id")

documents=pd.read_csv(documents_filename,index_col="document_id")
documents.head()

Unnamed: 0_level_0,filename,label
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,../../raw/C50/C50train/RobinSidel/147604newsML...,RobinSidel
1,../../raw/C50/C50train/RobinSidel/196812newsML...,RobinSidel
2,../../raw/C50/C50train/RobinSidel/219316newsML...,RobinSidel
3,../../raw/C50/C50train/RobinSidel/251225newsML...,RobinSidel
4,../../raw/C50/C50train/RobinSidel/177958newsML...,RobinSidel


<div class="alert alert-block alert-info"> Problem 1.2 </div>

Create a list of stop works by calling the function `text.stop_words` from the `E4525.text` python module.

In [11]:
stop_words=list(text.stop_words())
stop_words[:5]

['can', ';', 'which', 'she', 'need']

<div class="alert alert-block alert-info"> Problem 1.3 </div>

Using pandas, read  the word count (term frequencies) file generated by the Text_Features notebook
The file is called "corpus_word_counts.csv"

In [12]:
word_counts_filename=data_dir+"corpus_word_counts.csv"

word_counts=corpus_word_counts(documents,stop_words)
word_counts = word_counts.rename(columns={'index':'word', 0:'count'})
word_counts.to_csv(word_counts_filename,index_label="word")

word_counts=pd.read_csv(word_counts_filename,index_col="word")
word_counts.describe()

processing... ../../raw/C50/C50train/RobinSidel/147604newsML.txt
processing... ../../raw/C50/C50train/RobinSidel/196812newsML.txt
processing... ../../raw/C50/C50train/RobinSidel/219316newsML.txt
processing... ../../raw/C50/C50train/RobinSidel/251225newsML.txt
processing... ../../raw/C50/C50train/RobinSidel/177958newsML.txt
processing... ../../raw/C50/C50train/RobinSidel/163815newsML.txt
processing... ../../raw/C50/C50train/RobinSidel/198999newsML.txt
processing... ../../raw/C50/C50train/RobinSidel/196990newsML.txt
processing... ../../raw/C50/C50train/RobinSidel/110986newsML.txt
processing... ../../raw/C50/C50train/RobinSidel/216335newsML.txt
processing... ../../raw/C50/C50train/RobinSidel/153574newsML.txt
processing... ../../raw/C50/C50train/RobinSidel/207287newsML.txt
processing... ../../raw/C50/C50train/RobinSidel/142837newsML.txt
processing... ../../raw/C50/C50train/RobinSidel/168806newsML.txt
processing... ../../raw/C50/C50train/RobinSidel/163749newsML.txt
processing... ../../raw/C

processing... ../../raw/C50/C50train/KouroshKarimkhany/307652newsML.txt
processing... ../../raw/C50/C50train/KouroshKarimkhany/104417newsML.txt
processing... ../../raw/C50/C50train/KouroshKarimkhany/121032newsML.txt
processing... ../../raw/C50/C50train/KouroshKarimkhany/117158newsML.txt
processing... ../../raw/C50/C50train/KouroshKarimkhany/160129newsML.txt
processing... ../../raw/C50/C50train/KouroshKarimkhany/272490newsML.txt
processing... ../../raw/C50/C50train/KouroshKarimkhany/287103newsML.txt
processing... ../../raw/C50/C50train/KouroshKarimkhany/218300newsML.txt
processing... ../../raw/C50/C50train/KouroshKarimkhany/146032newsML.txt
processing... ../../raw/C50/C50train/KouroshKarimkhany/311972newsML.txt
processing... ../../raw/C50/C50train/KouroshKarimkhany/101520newsML.txt
processing... ../../raw/C50/C50train/KouroshKarimkhany/331500newsML.txt
processing... ../../raw/C50/C50train/KouroshKarimkhany/167199newsML.txt
processing... ../../raw/C50/C50train/KouroshKarimkhany/183208new

processing... ../../raw/C50/C50train/EricAuchard/216338newsML.txt
processing... ../../raw/C50/C50train/EricAuchard/18223newsML.txt
processing... ../../raw/C50/C50train/EricAuchard/151452newsML.txt
processing... ../../raw/C50/C50train/EricAuchard/210069newsML.txt
processing... ../../raw/C50/C50train/EricAuchard/263257newsML.txt
processing... ../../raw/C50/C50train/EricAuchard/111437newsML.txt
processing... ../../raw/C50/C50train/EricAuchard/13068newsML.txt
processing... ../../raw/C50/C50train/EricAuchard/121521newsML.txt
processing... ../../raw/C50/C50train/EricAuchard/149671newsML.txt
processing... ../../raw/C50/C50train/EricAuchard/120484newsML.txt
processing... ../../raw/C50/C50train/EricAuchard/260570newsML.txt
processing... ../../raw/C50/C50train/EricAuchard/115060newsML.txt
processing... ../../raw/C50/C50train/EricAuchard/233319newsML.txt
processing... ../../raw/C50/C50train/EricAuchard/227670newsML.txt
processing... ../../raw/C50/C50train/EricAuchard/264271newsML.txt
processing..

processing... ../../raw/C50/C50train/SimonCowell/403689newsML.txt
processing... ../../raw/C50/C50train/SimonCowell/450839newsML.txt
processing... ../../raw/C50/C50train/SimonCowell/361250newsML.txt
processing... ../../raw/C50/C50train/SimonCowell/420566newsML.txt
processing... ../../raw/C50/C50train/SimonCowell/255730newsML.txt
processing... ../../raw/C50/C50train/SimonCowell/440535newsML.txt
processing... ../../raw/C50/C50train/SimonCowell/242983newsML.txt
processing... ../../raw/C50/C50train/SimonCowell/402074newsML.txt
processing... ../../raw/C50/C50train/SimonCowell/356525newsML.txt
processing... ../../raw/C50/C50train/SimonCowell/303707newsML.txt
processing... ../../raw/C50/C50train/SimonCowell/390568newsML.txt
processing... ../../raw/C50/C50train/SimonCowell/406879newsML.txt
processing... ../../raw/C50/C50train/SimonCowell/447824newsML.txt
processing... ../../raw/C50/C50train/SimonCowell/350608newsML.txt
processing... ../../raw/C50/C50train/SimonCowell/347888newsML.txt
processing

processing... ../../raw/C50/C50train/KevinMorrison/116890newsML.txt
processing... ../../raw/C50/C50train/KevinMorrison/256319newsML.txt
processing... ../../raw/C50/C50train/KevinMorrison/114198newsML.txt
processing... ../../raw/C50/C50train/KevinMorrison/198071newsML.txt
processing... ../../raw/C50/C50train/KevinMorrison/133461newsML.txt
processing... ../../raw/C50/C50train/KevinMorrison/241149newsML.txt
processing... ../../raw/C50/C50train/KevinMorrison/304259newsML.txt
processing... ../../raw/C50/C50train/KevinMorrison/218602newsML.txt
processing... ../../raw/C50/C50train/KevinMorrison/21536newsML.txt
processing... ../../raw/C50/C50train/KevinMorrison/184096newsML.txt
processing... ../../raw/C50/C50train/KevinMorrison/247636newsML.txt
processing... ../../raw/C50/C50train/KevinMorrison/235532newsML.txt
processing... ../../raw/C50/C50train/KevinMorrison/256350newsML.txt
processing... ../../raw/C50/C50train/KevinMorrison/278563newsML.txt
processing... ../../raw/C50/C50train/KevinMorriso

processing... ../../raw/C50/C50train/PatriciaCommins/123595newsML.txt
processing... ../../raw/C50/C50train/PatriciaCommins/142683newsML.txt
processing... ../../raw/C50/C50train/PatriciaCommins/270137newsML.txt
processing... ../../raw/C50/C50train/PatriciaCommins/108443newsML.txt
processing... ../../raw/C50/C50train/PatriciaCommins/123987newsML.txt
processing... ../../raw/C50/C50train/PatriciaCommins/251195newsML.txt
processing... ../../raw/C50/C50train/PatriciaCommins/288774newsML.txt
processing... ../../raw/C50/C50train/PatriciaCommins/102793newsML.txt
processing... ../../raw/C50/C50train/PatriciaCommins/264335newsML.txt
processing... ../../raw/C50/C50train/PatriciaCommins/143591newsML.txt
processing... ../../raw/C50/C50train/PatriciaCommins/178089newsML.txt
processing... ../../raw/C50/C50train/PatriciaCommins/140446newsML.txt
processing... ../../raw/C50/C50train/PatriciaCommins/120588newsML.txt
processing... ../../raw/C50/C50train/PatriciaCommins/251347newsML.txt
processing... ../../

processing... ../../raw/C50/C50train/KevinDrawbaugh/113464newsML.txt
processing... ../../raw/C50/C50train/KevinDrawbaugh/137481newsML.txt
processing... ../../raw/C50/C50train/KevinDrawbaugh/305957newsML.txt
processing... ../../raw/C50/C50train/KevinDrawbaugh/154378newsML.txt
processing... ../../raw/C50/C50train/KevinDrawbaugh/249404newsML.txt
processing... ../../raw/C50/C50train/KevinDrawbaugh/178086newsML.txt
processing... ../../raw/C50/C50train/KevinDrawbaugh/105399newsML.txt
processing... ../../raw/C50/C50train/KevinDrawbaugh/259009newsML.txt
processing... ../../raw/C50/C50train/KevinDrawbaugh/100163newsML.txt
processing... ../../raw/C50/C50train/KevinDrawbaugh/299416newsML.txt
processing... ../../raw/C50/C50train/KevinDrawbaugh/290996newsML.txt
processing... ../../raw/C50/C50train/KevinDrawbaugh/154388newsML.txt
processing... ../../raw/C50/C50train/KevinDrawbaugh/173045newsML.txt
processing... ../../raw/C50/C50train/KevinDrawbaugh/317846newsML.txt
processing... ../../raw/C50/C50tra

processing... ../../raw/C50/C50train/MartinWolk/158943newsML.txt
processing... ../../raw/C50/C50train/MartinWolk/202582newsML.txt
processing... ../../raw/C50/C50train/MartinWolk/2538newsML.txt
processing... ../../raw/C50/C50train/MartinWolk/183836newsML.txt
processing... ../../raw/C50/C50train/MartinWolk/157448newsML.txt
processing... ../../raw/C50/C50train/MartinWolk/132566newsML.txt
processing... ../../raw/C50/C50train/MartinWolk/127151newsML.txt
processing... ../../raw/C50/C50train/MartinWolk/15583newsML.txt
processing... ../../raw/C50/C50train/MartinWolk/130607newsML.txt
processing... ../../raw/C50/C50train/ScottHillis/253868newsML.txt
processing... ../../raw/C50/C50train/ScottHillis/208266newsML.txt
processing... ../../raw/C50/C50train/ScottHillis/140340newsML.txt
processing... ../../raw/C50/C50train/ScottHillis/310466newsML.txt
processing... ../../raw/C50/C50train/ScottHillis/318742newsML.txt
processing... ../../raw/C50/C50train/ScottHillis/199747newsML.txt
processing... ../../ra

processing... ../../raw/C50/C50train/FumikoFujisaki/326601newsML.txt
processing... ../../raw/C50/C50train/FumikoFujisaki/271169newsML.txt
processing... ../../raw/C50/C50train/FumikoFujisaki/103993newsML.txt
processing... ../../raw/C50/C50train/FumikoFujisaki/10028newsML.txt
processing... ../../raw/C50/C50train/FumikoFujisaki/184337newsML.txt
processing... ../../raw/C50/C50train/FumikoFujisaki/209817newsML.txt
processing... ../../raw/C50/C50train/FumikoFujisaki/123528newsML.txt
processing... ../../raw/C50/C50train/FumikoFujisaki/155075newsML.txt
processing... ../../raw/C50/C50train/FumikoFujisaki/208614newsML.txt
processing... ../../raw/C50/C50train/FumikoFujisaki/215310newsML.txt
processing... ../../raw/C50/C50train/FumikoFujisaki/221583newsML.txt
processing... ../../raw/C50/C50train/FumikoFujisaki/168224newsML.txt
processing... ../../raw/C50/C50train/FumikoFujisaki/348849newsML.txt
processing... ../../raw/C50/C50train/FumikoFujisaki/257487newsML.txt
processing... ../../raw/C50/C50trai

processing... ../../raw/C50/C50train/NickLouth/110904newsML.txt
processing... ../../raw/C50/C50train/NickLouth/120591newsML.txt
processing... ../../raw/C50/C50train/NickLouth/116176newsML.txt
processing... ../../raw/C50/C50train/NickLouth/180293newsML.txt
processing... ../../raw/C50/C50train/NickLouth/108449newsML.txt
processing... ../../raw/C50/C50train/NickLouth/123878newsML.txt
processing... ../../raw/C50/C50train/NickLouth/159095newsML.txt
processing... ../../raw/C50/C50train/NickLouth/18798newsML.txt
processing... ../../raw/C50/C50train/NickLouth/162173newsML.txt
processing... ../../raw/C50/C50train/NickLouth/162521newsML.txt
processing... ../../raw/C50/C50train/NickLouth/13930newsML.txt
processing... ../../raw/C50/C50train/NickLouth/16038newsML.txt
processing... ../../raw/C50/C50train/NickLouth/121030newsML.txt
processing... ../../raw/C50/C50train/DarrenSchuettler/174369newsML.txt
processing... ../../raw/C50/C50train/DarrenSchuettler/205458newsML.txt
processing... ../../raw/C50/C

processing... ../../raw/C50/C50train/TanEeLyn/41230newsML.txt
processing... ../../raw/C50/C50train/TanEeLyn/417677newsML.txt
processing... ../../raw/C50/C50train/TanEeLyn/123487newsML.txt
processing... ../../raw/C50/C50train/TanEeLyn/155113newsML.txt
processing... ../../raw/C50/C50train/TanEeLyn/253869newsML.txt
processing... ../../raw/C50/C50train/TanEeLyn/281155newsML.txt
processing... ../../raw/C50/C50train/TanEeLyn/426779newsML.txt
processing... ../../raw/C50/C50train/TanEeLyn/241238newsML.txt
processing... ../../raw/C50/C50train/TanEeLyn/186184newsML.txt
processing... ../../raw/C50/C50train/TanEeLyn/381228newsML.txt
processing... ../../raw/C50/C50train/TanEeLyn/283835newsML.txt
processing... ../../raw/C50/C50train/TanEeLyn/192410newsML.txt
processing... ../../raw/C50/C50train/TanEeLyn/108092newsML.txt
processing... ../../raw/C50/C50train/TanEeLyn/190418newsML.txt
processing... ../../raw/C50/C50train/TanEeLyn/169609newsML.txt
processing... ../../raw/C50/C50train/TanEeLyn/192393news

processing... ../../raw/C50/C50train/HeatherScoffield/191069newsML.txt
processing... ../../raw/C50/C50train/HeatherScoffield/165906newsML.txt
processing... ../../raw/C50/C50train/HeatherScoffield/298748newsML.txt
processing... ../../raw/C50/C50train/HeatherScoffield/248690newsML.txt
processing... ../../raw/C50/C50train/HeatherScoffield/129447newsML.txt
processing... ../../raw/C50/C50train/MureDickie/138551newsML.txt
processing... ../../raw/C50/C50train/MureDickie/186174newsML.txt
processing... ../../raw/C50/C50train/MureDickie/225075newsML.txt
processing... ../../raw/C50/C50train/MureDickie/105255newsML.txt
processing... ../../raw/C50/C50train/MureDickie/137508newsML.txt
processing... ../../raw/C50/C50train/MureDickie/140299newsML.txt
processing... ../../raw/C50/C50train/MureDickie/11265newsML.txt
processing... ../../raw/C50/C50train/MureDickie/187394newsML.txt
processing... ../../raw/C50/C50train/MureDickie/211093newsML.txt
processing... ../../raw/C50/C50train/MureDickie/204883newsML.

processing... ../../raw/C50/C50train/JimGilchrist/147557newsML.txt
processing... ../../raw/C50/C50train/JimGilchrist/133504newsML.txt
processing... ../../raw/C50/C50train/JimGilchrist/171185newsML.txt
processing... ../../raw/C50/C50train/JimGilchrist/147518newsML.txt
processing... ../../raw/C50/C50train/JimGilchrist/166456newsML.txt
processing... ../../raw/C50/C50train/JimGilchrist/103904newsML.txt
processing... ../../raw/C50/C50train/JimGilchrist/126622newsML.txt
processing... ../../raw/C50/C50train/JimGilchrist/180176newsML.txt
processing... ../../raw/C50/C50train/JimGilchrist/110715newsML.txt
processing... ../../raw/C50/C50train/JimGilchrist/143463newsML.txt
processing... ../../raw/C50/C50train/JimGilchrist/108091newsML.txt
processing... ../../raw/C50/C50train/JimGilchrist/159926newsML.txt
processing... ../../raw/C50/C50train/JimGilchrist/120244newsML.txt
processing... ../../raw/C50/C50train/JimGilchrist/126597newsML.txt
processing... ../../raw/C50/C50train/JimGilchrist/138486newsML

processing... ../../raw/C50/C50train/JonathanBirt/225509newsML.txt
processing... ../../raw/C50/C50train/JonathanBirt/169040newsML.txt
processing... ../../raw/C50/C50train/JonathanBirt/29057newsML.txt
processing... ../../raw/C50/C50train/JonathanBirt/250638newsML.txt
processing... ../../raw/C50/C50train/JonathanBirt/293044newsML.txt
processing... ../../raw/C50/C50train/JonathanBirt/329893newsML.txt
processing... ../../raw/C50/C50train/JonathanBirt/200679newsML.txt
processing... ../../raw/C50/C50train/JonathanBirt/338594newsML.txt
processing... ../../raw/C50/C50train/JonathanBirt/331672newsML.txt
processing... ../../raw/C50/C50train/JonathanBirt/340119newsML.txt
processing... ../../raw/C50/C50train/JonathanBirt/289165newsML.txt
processing... ../../raw/C50/C50train/JonathanBirt/172185newsML.txt
processing... ../../raw/C50/C50train/JonathanBirt/300439newsML.txt
processing... ../../raw/C50/C50train/JonathanBirt/164502newsML.txt
processing... ../../raw/C50/C50train/JonathanBirt/163135newsML.

processing... ../../raw/C50/C50train/TheresePoletti/151061newsML.txt
processing... ../../raw/C50/C50train/TheresePoletti/130612newsML.txt
processing... ../../raw/C50/C50train/TheresePoletti/309865newsML.txt
processing... ../../raw/C50/C50train/TheresePoletti/186457newsML.txt
processing... ../../raw/C50/C50train/TheresePoletti/187047newsML.txt
processing... ../../raw/C50/C50train/TheresePoletti/27551newsML.txt
processing... ../../raw/C50/C50train/TheresePoletti/101565newsML.txt
processing... ../../raw/C50/C50train/TheresePoletti/240007newsML.txt
processing... ../../raw/C50/C50train/TheresePoletti/18215newsML.txt
processing... ../../raw/C50/C50train/TheresePoletti/283077newsML.txt
processing... ../../raw/C50/C50train/TheresePoletti/246460newsML.txt
processing... ../../raw/C50/C50train/TheresePoletti/203028newsML.txt
processing... ../../raw/C50/C50train/TheresePoletti/301796newsML.txt
processing... ../../raw/C50/C50train/TheresePoletti/146012newsML.txt
processing... ../../raw/C50/C50train

processing... ../../raw/C50/C50train/MarkBendeich/164991newsML.txt
processing... ../../raw/C50/C50train/MarkBendeich/115420newsML.txt
processing... ../../raw/C50/C50train/MarkBendeich/118468newsML.txt
processing... ../../raw/C50/C50train/MarkBendeich/27264newsML.txt
processing... ../../raw/C50/C50train/MarkBendeich/135098newsML.txt
processing... ../../raw/C50/C50train/MarkBendeich/289652newsML.txt
processing... ../../raw/C50/C50train/MarkBendeich/370908newsML.txt
processing... ../../raw/C50/C50train/MarkBendeich/33308newsML.txt
processing... ../../raw/C50/C50train/MarkBendeich/364654newsML.txt
processing... ../../raw/C50/C50train/MarkBendeich/294969newsML.txt
processing... ../../raw/C50/C50train/MarkBendeich/30686newsML.txt
processing... ../../raw/C50/C50train/MarkBendeich/21528newsML.txt
processing... ../../raw/C50/C50train/MarkBendeich/358701newsML.txt
processing... ../../raw/C50/C50train/MarkBendeich/402409newsML.txt
processing... ../../raw/C50/C50train/MarkBendeich/294948newsML.txt

processing... ../../raw/C50/C50train/MatthewBunce/308257newsML.txt
processing... ../../raw/C50/C50train/MatthewBunce/165943newsML.txt
processing... ../../raw/C50/C50train/MatthewBunce/343415newsML.txt
processing... ../../raw/C50/C50train/MatthewBunce/280207newsML.txt
processing... ../../raw/C50/C50train/MatthewBunce/263657newsML.txt
processing... ../../raw/C50/C50train/MatthewBunce/293127newsML.txt
processing... ../../raw/C50/C50train/MatthewBunce/156243newsML.txt
processing... ../../raw/C50/C50train/MatthewBunce/169229newsML.txt
processing... ../../raw/C50/C50train/MatthewBunce/233655newsML.txt
processing... ../../raw/C50/C50train/MatthewBunce/318283newsML.txt
processing... ../../raw/C50/C50train/MatthewBunce/248714newsML.txt
processing... ../../raw/C50/C50train/MatthewBunce/362649newsML.txt
processing... ../../raw/C50/C50train/MatthewBunce/288284newsML.txt
processing... ../../raw/C50/C50train/MatthewBunce/302468newsML.txt
processing... ../../raw/C50/C50train/MatthewBunce/107670newsML

processing... ../../raw/C50/C50train/TimFarrand/159528newsML.txt
processing... ../../raw/C50/C50train/TimFarrand/152851newsML.txt
processing... ../../raw/C50/C50train/TimFarrand/143038newsML.txt
processing... ../../raw/C50/C50train/TimFarrand/234885newsML.txt
processing... ../../raw/C50/C50train/TimFarrand/165801newsML.txt
processing... ../../raw/C50/C50train/TimFarrand/225561newsML.txt
processing... ../../raw/C50/C50train/TimFarrand/164516newsML.txt
processing... ../../raw/C50/C50train/TimFarrand/164877newsML.txt
processing... ../../raw/C50/C50train/TimFarrand/154492newsML.txt
processing... ../../raw/C50/C50train/TimFarrand/229685newsML.txt
processing... ../../raw/C50/C50train/TimFarrand/201017newsML.txt
processing... ../../raw/C50/C50train/TimFarrand/160644newsML.txt
processing... ../../raw/C50/C50train/TimFarrand/167803newsML.txt
processing... ../../raw/C50/C50train/TimFarrand/242992newsML.txt
processing... ../../raw/C50/C50train/TimFarrand/238095newsML.txt
processing... ../../raw/C

processing... ../../raw/C50/C50train/GrahamEarnshaw/231076newsML.txt
processing... ../../raw/C50/C50train/GrahamEarnshaw/13495newsML.txt
processing... ../../raw/C50/C50train/GrahamEarnshaw/198148newsML.txt
processing... ../../raw/C50/C50train/GrahamEarnshaw/113102newsML.txt
processing... ../../raw/C50/C50train/GrahamEarnshaw/201228newsML.txt
processing... ../../raw/C50/C50train/GrahamEarnshaw/267059newsML.txt
processing... ../../raw/C50/C50train/BernardHickey/155005newsML.txt
processing... ../../raw/C50/C50train/BernardHickey/103816newsML.txt
processing... ../../raw/C50/C50train/BernardHickey/223856newsML.txt
processing... ../../raw/C50/C50train/BernardHickey/148778newsML.txt
processing... ../../raw/C50/C50train/BernardHickey/223842newsML.txt
processing... ../../raw/C50/C50train/BernardHickey/111971newsML.txt
processing... ../../raw/C50/C50train/BernardHickey/322996newsML.txt
processing... ../../raw/C50/C50train/BernardHickey/294933newsML.txt
processing... ../../raw/C50/C50train/Bernar

processing... ../../raw/C50/C50train/AlexanderSmith/134290newsML.txt
processing... ../../raw/C50/C50train/AlexanderSmith/110282newsML.txt
processing... ../../raw/C50/C50train/AlexanderSmith/239202newsML.txt
processing... ../../raw/C50/C50train/AlexanderSmith/164287newsML.txt
processing... ../../raw/C50/C50train/AlexanderSmith/237953newsML.txt
processing... ../../raw/C50/C50train/AlexanderSmith/107525newsML.txt
processing... ../../raw/C50/C50train/AlexanderSmith/307212newsML.txt
processing... ../../raw/C50/C50train/AlexanderSmith/141391newsML.txt
processing... ../../raw/C50/C50train/AlexanderSmith/185613newsML.txt
processing... ../../raw/C50/C50train/AlexanderSmith/238090newsML.txt
processing... ../../raw/C50/C50train/AlexanderSmith/162656newsML.txt
processing... ../../raw/C50/C50train/AlexanderSmith/141943newsML.txt
processing... ../../raw/C50/C50train/AlexanderSmith/223283newsML.txt
processing... ../../raw/C50/C50train/AlexanderSmith/188417newsML.txt
processing... ../../raw/C50/C50tra

Unnamed: 0,count
count,28131.0
mean,17.188937
std,73.939306
min,1.0
25%,1.0
50%,2.0
75%,6.0
max,2482.0


In [16]:
word_counts.head(5)

Unnamed: 0_level_0,count
word,Unnamed: 1_level_1
real,215
overlap,33
group,1058
bosshard,3
mani,626


In [14]:
type(word_counts)

pandas.core.frame.DataFrame

<div class="alert alert-block alert-info"> Problem 1.4 </div>
Create a variable $V$ with the vocabulary size  and a variable named $C$ with the total number of documents

In [15]:
C=len(documents)
V=len(word_counts)
print(C,V)

2500 28131


<div class="alert alert-block alert-info"> Problem 1.5 </div>
Compute the smoothed inverse document counts, defined as
$$
    \textrm{idf}_i =  \log\left( \frac{1+C}{1+\textrm{n}_i}\right) + 1
$$

where $n_i$ is the number of documents in corpus where word $i$ appears.

In [27]:
idfs=np.log((1+C)/(1+word_counts))+1
idfs.head()

Unnamed: 0_level_0,count
word,Unnamed: 1_level_1
real,3.449168
overlap,5.298085
group,1.859366
bosshard,7.438152
mani,2.383499


<div class="alert alert-block alert-success"> We set up a few documents for comparison</div>

[HINT] Code below assumes that the variable `documents`  is the list of documents you read in problem 1.1

In [23]:
# document indexes we will use for comparison
document1=0 
document2=1
document3=105

# document filenames
filename1=documents["filename"][document1]
filename2=documents["filename"][document2]
filename3=documents["filename"][document3] # this will be from a different author

<div class="alert alert-block alert-info"> Problem 1.6 </div>
    Compute the word counts for `documents1`,`document2` and `document3`, using the `text_2_count` function defined at the beginning of the notebook.

In [24]:
count1=text_2_counts(filename1, stop_words)
count2=text_2_counts(filename2, stop_words)
count3=text_2_counts(filename3, stop_words)

### Classical Tf-Idf

The function below computes the normalized product of  `tfidf`  vectors.
Where the `tfidf` vector is defined as follows
$$
    w_{k} = \textrm{idf_k} * c_{k}
$$
where  $c_{k}$ is the number of times that word $k$ appears in document.

In [None]:
def product_tfidf(count1,count2,idfs):
    sum1=0.0
    sum_cross=0.0
    ## for key in count1: 数字数
    for key in count1:
        if key not in idfs.index:
            idf=0
            print(f"key {key} not found")
        else:
            # idf=this key's idf count
            idf=idfs.loc[key]["count"]
        # w1: weight for this key in document1
        # w2: weight for this key in document2
        w1=idf*count1[key]
        w2=idf*count2[key]
        # at last, sum1 = w1_key1**2 + w1_key2**2 + ... + w1_keyn**2
        sum1+=(w1)**2
        # sum_cross = w1_key1 * w2_key1 + w1_key2 * w2_key2 + ...
        sum_cross+=w1*w2
    sum2=0.0
    for key in count2:
        if key not in idfs.index:
            idf=0
            print(f"key {key} not found")
        else:
            idf=idfs.loc[key]["count"]
        w2=idf*count2[key]
        sum2+=w2**2
    return sum_cross/np.sqrt(sum1*sum2)

### Sub-Linear Tf-Idf

It seems unlikely that 20 occurrences of a term in a document truly carry $20\times$ the significance of a single occurrence. And alternative (see the [Information Retrieval book](https://nlp.stanford.edu/IR-book/html/htmledition/sublinear-tf-scaling-1.html)) is to use a function
to *tamper* the growth of the word counts.

<div class="alert alert-block alert-info"> Problem 1.6 </div>
Create a function named `sublinear_product_tfidf`.
It should compute the normalized product of `tfidf` vectors as above but using a **`sublinear`** measure of  the word counts, defined as:
\begin{align}
    w_k  &= idf_k * (1+\log c_k)  &\textrm{if}\,\, c_k &>0 \\
    w_k  &= 0                    &\textrm{if}\,\, c_k &=0 \\
\end{align}
where $c_k$ is the raw word count for word $k$.

[HINT] Probably easiest to copy and modify slightly the function  `product_idf` above

In [30]:
def sublinear_product_tfidf(count1,count2,idfs):
    sum1=0.0
    sum_cross=0.0
    for key in count1:
        w1=0.0
        w2=0.0
        if key not in idfs.index:
            idf=0
            print(f"key {key} not found")
        else:
            idf=idfs.loc[key]["count"]
        if count1[key] != 0:
            w1=idf*(1+np.log(count1[key]))
        if count2[key] != 0:
            w2=idf*(1+np.log(count2[key]))
        sum1+=(w1)**2
        sum_cross+=w1*w2
    sum2=0.0
    for key in count2:
        w2=0.0
        if key not in idfs.index:
            idf=0
            print(f"key {key} not found")
        else:
            idf=idfs.loc[key]["count"]
        if count2[key]!=0:
            w2=idf*(1+np.log(count2[key]))
        sum2+=w2**2
    return sum_cross/np.sqrt(sum1*sum2)

<div class="alert alert-block alert-info"> Problem 1.7 </div>
Compute the sublinear normalized product (similarity) for `document1` with itself, verify that the product is 1

In [31]:
## count of document1, count of document2, idfs - idfs of the whole corpus 
sublinear_product_tfidf(count1, count1, idfs)

1.0

<div class="alert alert-block alert-info"> Problem 1.8 </div>
Compute the sublinear normalized products between 
1. `document1` and `document2`
2. `document1` and `document3`
3. `document2` and `document3`

In [32]:
r1=sublinear_product_tfidf(count1, count2, idfs)
r2=sublinear_product_tfidf(count1, count3, idfs)
r3=sublinear_product_tfidf(count2, count3, idfs)
print(f'The sublinear normalized products of document1 and document2 is {r1}')
print(f'The sublinear normalized products of document1 and document3 is {r2}')
print(f'The sublinear normalized products of document2 and document3 is {r3}')

The sublinear normalized products of document1 and document2 is0.12787626323711282
The sublinear normalized products of document1 and document3 is0.04982868267860495
The sublinear normalized products of document2 and document3 is0.0352982411561606


## Comparison to  `sklearn`

<div class="alert alert-block alert-info"> Problem 2.1 </div>
store the value of the function `text.stem_tokenizer` from the module `text.py` into variable named `tokenizer`.

In [34]:
tokenizer=text.stem_tokenizer

<div class="alert alert-block alert-info"> Problem 2.2 </div>

set up  an instance of [`sklearn.TfidfVectorizer`](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)  so that it generates `tfidf` vectors using sublinear growth.

[Hint] 
1. Read carefully the  long list of options on the constructor of `TfidfVectorizer`
2. Do not forget to set the `input`, `tokenizer` and `stop_word` arguments.
    

In [35]:
sub_=TfidfVectorizer(input='filename',sublinear_tf=True, stop_words=stop_words, tokenizer=tokenizer)

<div class="alert alert-block alert-info"> Problem 2.3 </div>
Generate the matrix $X$ of `tfidf` representations for each document in our corpus (this may take a bit of time)

In [37]:
X=sub_.fit_transform(documents["filename"])

  'stop_words.' % sorted(inconsistent))


<div class="alert alert-block alert-info"> Problem 2.4 </div>
Compute the dot product between `document1` and `document2` using their vector (`X`) representation. 

Compare to the result produced by the `sublinear_product_tfidf`
function you just wrote. They should be nearly identical.

In [38]:
np.dot(X[document1], X[document2].T) [0, 0]

array(0.12787626)

In [82]:
sublinear_product_tfidf(count1, count2, idfs)

0.12787626323711282

### Saving Trained models for Reuse

<div class="alert alert-block alert-info"> Problem 3.1 </div>
In the data directory `data_dir`:
1. Save vectorizer to a `pickle` called "tfidf_sublinear_vectorizer.p"
2. Save sublinear `tfidf1` features to a file called "tfidf_sublinear_features.p"

In [88]:
tfidf_vectorizer_filename=   data_dir+"/tfidf_vectorizer.p"
tfidf_features_filename=     data_dir+"/tfidf_sublinear_features.p"

pickle.dump(sub_, open( tfidf_vectorizer_filename, "wb" ) )
pickle.dump(X,              open( tfidf_features_filename, "wb" ) )

<div class="alert alert-block alert-info"> Problem 3.2 </div>
Make sure you can read those files again

In [90]:
tfidf_read=pickle.load(open(tfidf_vectorizer_filename, "rb" ))
X_read=pickle.load(open(tfidf_features_filename, "rb" ) )
print(tfidf_read, X_read)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='filename', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=['more', 'yourself', 'hi', 'we', 'into', 'd', '[',
                            '@', "'d", 'my', 'did', 'dure', 'with', 'been',
                            'yourselv', 'am', 'didn', 'through', '#', ';', 'by',
                            'could', 'about', 'wouldn', '?', 'be', 'some',
                            'would', 'over', 'from', ...],
                strip_accents=None, sublinear_tf=True,
                token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function stem_tokenizer at 0x1084591e0>,
                use_idf=True, vocabulary=None)   (0, 10182)	0.03165421189038851
  (0, 21776)	0.03038108597274079
  (0, 26590)	0.0362