## Lab 4.3. 

#### Setup your imports

In [252]:
import pandas as pd 
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report

import matplotlib.pyplot as plt

#### 1. Pull the training set from the newsgroup data
The data has 20 different categories. Try to shrink down to smaller number of groups according to the definition here:
http://scikit-learn.org/stable/datasets/twenty_newsgroups.html

In [2]:
newsgroups_train = fetch_20newsgroups(subset='train')

In [11]:
data_train = fetch_20newsgroups(subset='train',
                               shuffle=True, random_state=42,
                               remove=('headers', 'footers', 'quotes'))

In [14]:
len(data_test.data)

7532

In [108]:
y = data_train.target
x = data_train.data

In [16]:
list(newsgroups_train.target_names)

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [199]:
mappy = {
    0: [0],
    1: [1,2,3,4,5],
    2: [6],
    3: [7,8,9,10],
    4: [11,12,13,14],
    5: [15],
    6: [16,17,18,19]
}

def getkey(num):
    for x, y in mappy.items():
        if num in y:
            return x
y_new = [getkey(num) for num in y]
y = y_new
labels = ['alt','comp','misc','rec','sci','soc','talk']

In [18]:
np.shape(y)

(11314,)

#### 2. Create the vectorizer 

In [122]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
                        max_features=1000,
                        token_pattern='[a-zA-Z]{3,50}',
                        ngram_range=(1, 1), 
                        max_df=0.1, 
                        min_df=0.0001,
                        stop_words='english',
)

x_ = tfidf.fit_transform(x)
xtfidf = tfidf.get_feature_names()

# X_all = pd.DataFrame(x_.todense(), columns=tfidf.get_feature_names())

What are top 50 most powerful terms in deciding news groups? (hint: treat it as a classification problem)?

In [41]:
from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(f_classif, k=50)
selected_data = selector.fit_transform(X_all, y)
kbest_columns = X_all.columns[selector.get_support()]
Xbest = pd.DataFrame(selected_data, columns=kbest_columns)

In [42]:
kbest_columns

Index([u'asking', u'atheism', u'atheist', u'atheists', u'belief', u'bible',
       u'bike', u'car', u'card', u'catholic', u'christ', u'christian',
       u'christianity', u'christians', u'church', u'clipper', u'condition',
       u'dos', u'encryption', u'faith', u'file', u'game', u'god',
       u'government', u'graphics', u'gun', u'heaven', u'hockey', u'israel',
       u'israeli', u'jesus', u'league', u'manuals', u'offer', u'people',
       u'players', u'religion', u'resurrection', u'sale', u'scripture',
       u'season', u'sell', u'shipping', u'sin', u'software', u'space', u'team',
       u'thanks', u'using', u'windows'],
      dtype='object')

#### 3. Create the Truncated Singular Value Decomposition. 

In [274]:
v = TfidfVectorizer(
                        max_features=100,
                        token_pattern='[a-zA-Z]{3,50}',
                        ngram_range=(1, 1), 
                        max_df=0.1, 
                        min_df=0.0001,
                        stop_words='english')
X_V = v.fit_transform(x)

v_cols = v.get_feature_names()

# X_all = pd.DataFrame(x_.todense(), columns=tfidf.get_feature_names())

In [256]:
# pd.DataFrame(X_V.todense(), columns = v.get_feature_names())
# v_cols

In [275]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=72, n_iter=7, random_state=42)
svd.fit(X_V)

print svd.explained_variance_ratio_
svd.explained_variance_ratio_.sum()

[ 0.01329341  0.02474308  0.0218014   0.01956765  0.01624408  0.01593683
  0.01577902  0.01516742  0.01484366  0.0143997   0.01420579  0.01397389
  0.01347224  0.0129579   0.01283648  0.01274955  0.01262152  0.01247336
  0.01236118  0.01217708  0.01193953  0.01163752  0.01160214  0.01153023
  0.01129159  0.01121802  0.01117062  0.01104169  0.01092708  0.01073319
  0.01067812  0.01058831  0.01056545  0.01047611  0.0103895   0.01033147
  0.01030787  0.01022153  0.01012416  0.0100339   0.00993493  0.00988739
  0.00983315  0.00979431  0.00977031  0.00968884  0.00964164  0.00956819
  0.00955474  0.00946544  0.00940647  0.00934441  0.00927374  0.00920649
  0.00912334  0.009104    0.00908779  0.00904201  0.00895932  0.00895198
  0.008835    0.00879819  0.00870057  0.00863383  0.00853014  0.00836084
  0.0083096   0.00821943  0.00808298  0.00798678  0.00791606  0.00783853]


0.80726371651033679

In [269]:
df = pd.DataFrame(svd.components_).T

In [270]:
df.index = v_cols

In [271]:
df[0].sort_values(ascending=False)

did           0.187502
edu           0.184342
really        0.166639
problem       0.164431
god           0.151855
work          0.150915
said          0.149788
years         0.145207
going         0.140322
believe       0.140164
year          0.138929
got           0.132601
sure          0.130477
point         0.128790
using         0.126926
things        0.126024
better        0.124803
help          0.118180
thing         0.117927
let           0.117805
doesn         0.116295
question      0.114439
government    0.113466
long          0.113300
case          0.111921
probably      0.111649
didn          0.111414
read          0.110286
little        0.109906
mail          0.109052
                ...   
called        0.076540
data          0.076359
end           0.075945
line          0.074795
problems      0.074374
group         0.074318
support       0.073284
space         0.073225
key           0.072822
non           0.072774
card          0.072473
order         0.072078
team       

#### 4. Setup your k-means clustering

In [262]:
X_new = svd.transform(X_V)

from sklearn import cluster
k = 7
kmeans = cluster.KMeans(n_clusters=k)
kmeans.fit(X_new)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=7, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [263]:
labels = kmeans.labels_
labels

array([6, 6, 2, ..., 1, 6, 6], dtype=int32)

In [264]:
metrics.silhouette_score(X_new,labels, metric='euclidean')

0.058428812960829142

#### 5. Fit the vectorizer and SVD

What are top 50 most useful terms based on article itself? Are those terms similar to the top 50 from step 2? 

Plot the cummulative variace from the terms. x-axis: number of components; y-axis: cummulative variance. 
Based on the plot, decide how many principle components you need. 

#### 7. Fit the kmeans (Question: in this case, do you recommend running K-means without dimension reduction?)

Print out your centroids. Look at the value for each centroid. Does each centroid represent a news group as expected? 

#### 8. Check the performance of our kmeans

In [268]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels))
print("Completeness: %0.3f" % metrics.completeness_score(y, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(y, labels))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(y, labels))

Homogeneity: 0.114
Completeness: 0.138
V-measure: 0.124
Adjusted Rand-Index: 0.045


#### Classification Report

In [265]:
print classification_report(y,labels)

             precision    recall  f1-score   support

          0       0.19      0.17      0.18       480
          1       0.49      0.07      0.13      2936
          2       0.02      0.12      0.04       585
          3       0.01      0.00      0.01      2389
          4       0.15      0.04      0.06      2373
          5       0.00      0.00      0.00       599
          6       0.13      0.36      0.19      1952

avg / total       0.19      0.10      0.09     11314



#### Confusion Matrix. Hint: create a map to translate the label between k-means clustering and the original target (newsgroups_train.target). 

In [266]:
confusion_matrix(y, labels)

array([[  82,   11,  183,    2,   27,    2,  173],
       [   3,  219,  260,  560,   57,  331, 1506],
       [   1,    9,   68,   35,    4,   99,  369],
       [  16,   55,  765,    8,  165,  236, 1144],
       [   6,   92,  792,   30,   89,   27, 1337],
       [ 238,   16,  172,    0,   31,    0,  142],
       [  87,   43,  893,    5,  203,   13,  708]])

#### Note: Repeat the lab with:
- varying values of "k" 
- trying a different way to pick starting centroids ('k-means++' is the default method for centroids). For example, pick one point from each newsgroup. 

#### Learnt that feature engineering is more important. The scores for this lab are not great as the words for each news group is not selected. TFIDVECTORIZER creates features for all common words, but does not provide any weitage per news group. Distance between words for each news group is not obvious and hence the scores are bad. Due to lack of time, I will not spend more time on feature engineering for now. This Lab just helps in undertstanding the steps involved to create features from texts and then use trancated SVD to reduce dimensions, and then apply on Kme