# Oracle 10k report clustering exercise

In [1]:
import pandas as pd

In [6]:
df = pd.read_csv(r'../../data/oracle_10k.csv', encoding='latin1')
df.head()

Unnamed: 0,section_id,filename,section_name,section_text
0,1,oracle-corporation_annual_report_1994.docx,ORACLE SYSTEMS FORM 10-K,(Annual Report) Filed 07/27/94 for the Period ...
1,2,oracle-corporation_annual_report_1994.docx,"REDWOOD CITY, CA 94065",Telephone\t6505067000
2,3,oracle-corporation_annual_report_1994.docx,CIK\t0000777676,SIC Code\t7372 - Prepackaged Software Industry...
3,4,oracle-corporation_annual_report_1994.docx,ORACLE CORP /DE/ FORM 10-K,(Annual Report) Filed 7/27/1994 For Period End...
4,5,oracle-corporation_annual_report_1994.docx,SECURITIES AND EXCHANGE COMMISSION,"Washington, D.C. 20549"


### Set X as the section text and fill in any missing values

In [None]:
X = df['section_text']
X.fillna('', inplace=True)

### Use CountVectorizer to transform the section text into a sparse matrix

In [26]:
# import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# instantiate the vectorizer
vect = CountVectorizer()

# fit and transform the vectorizer on the section names
fit_vect = vect.fit_transform(X)

### Run a K Means clustering on the data

In [27]:
# import KMeans clustering
from sklearn.cluster import KMeans

# instantiate kmeans
km = KMeans()

# fit kmeans on the section names vectors
km.fit(fit_vect)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [28]:
# review the kmeans labels_
km.labels_

array([0, 0, 0, ..., 6, 6, 6])

In [29]:
# add the labels_ to the original dataframe as a new column named cluster_num
df['cluster_num'] = km.labels_

### Create a function to review the created clusters

1. iterate through every cluster_num (i.e. 1, 2, 3, ...)
2. print the name of the current cluster (i.e 'CLUSTER 1', 'CLUSTER 2')
3. filter and print the dataframe to only data from the selected cluster
4. add a empty print() statement to separate the results

In [38]:
def review_clusters(df):
    for num in set(df.cluster_num):
        print('CLUSTER: {}'.format(num))
        print(df[df.cluster_num == num].section_name[0:5])
        print()

In [None]:
# review the results
review_clusters(df)

### Repeat the process using a TfidfVectorizer and KMeans with n_clusters = 30

In [34]:
# import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# instantiate, fit, and transform the data with the vectorizer
vect = TfidfVectorizer()
fit_vect = vect.fit_transform(X)

In [35]:
# use kmeans to cluster the data
km = KMeans(n_clusters=30)
km.fit(fit_vect)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=30, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [36]:
# add the labels_ to the original dataframe as a new column
df['cluster_num'] = km.labels_

In [37]:
# review the results
review_clusters(df)

CLUSTER: 0
40                               OTHER INCOME (EXPENSE):
42                    NET INCOME AND EARNINGS PER SHARE:
60     SOFTWARE REVENUE RECOGNITION METHOD, NET EARNI...
143                   NET INCOME AND EARNINGS PER SHARE:
160    OPERATING  INCOME................................
Name: section_name, dtype: object

CLUSTER: 1
8                 PART I ITEM 1. BUSINESS
9                              BACKGROUND
12          COOPERATIVE SERVER TECHNOLOGY
13    COOPERATIVE DEVELOPMENT ENVIRONMENT
14         END USER APPLICATIONS PRODUCTS
Name: section_name, dtype: object

CLUSTER: 2
977                                   MARKETING AND SALES
990     ITEM 7.   MANAGEMENTS DISCUSSION AND ANALYSIS...
1008                      QUARTERLY RESULTS OF OPERATIONS
1015    YEAR ENDED MAY 31, (DOLLARS IN MILLIONS)\t2003...
1017    FACTORS THAT MAY AFFECT OUR FUTURE RESULTS OR ...
Name: section_name, dtype: object

CLUSTER: 3
1463    NEW SOFTWARE LICENSES:\tNEW SOFTWARE LICENSE R...
1466    

### Tune the hyperparameters for TfidfVectorizer and KMeans

1. For TfidfVectorizer - modify ngram_range, min_df, and stop_words
2. For KMeans - modify n_clusters

In [41]:
%%time 

# create the vectorizer
vect = TfidfVectorizer(ngram_range=(1,2), min_df=.01, stop_words='english')
fit_vect = vect.fit_transform(X)

# cluster the data
km = KMeans(n_clusters=20)
km.fit(fit_vect)

# add the clusters to the original data
df['cluster_num'] = km.labels_

# review the results
review_clusters(df)

CLUSTER: 0
1022    (DOLLARS IN MILLIONS) AMORTIZED PRINCIPAL AMOU...
1050                                             EXCHANGE
1051      AS OF MAY 31, (DOLLARS IN MILLIONS)\t2002\t2001
1052                               NOTES PAYABLE AND DEBT
1162    CASH FLOWS FROM FINANCING ACTIVITIES:\tWE INCU...
Name: section_name, dtype: object

CLUSTER: 1
28     PRODUCT AND SERVICES REVENUES
38                         REVENUES:
76               REVENUE RECOGNITION
127    PRODUCT AND SERVICES REVENUES
139                        REVENUES:
Name: section_name, dtype: object

CLUSTER: 2
182           ACQUISITION
757    ITEM 2. PROPERTIES
891    ITEM 2. PROPERTIES
937           ADVERTISING
939     RECLASSIFICATIONS
Name: section_name, dtype: object

CLUSTER: 3
716       SEGMENT INFORMATION
831       SEGMENT INFORMATION
950       SEGMENT INFORMATION
1063      SEGMENT INFORMATION
1121    PRODUCTS AND SERVICES
Name: section_name, dtype: object

CLUSTER: 4
6      SECURITIES REGISTERED PURSUANT TO SECTION 1

### Repeat the process using a DBSCAN clustering algorithm

In [None]:
from sklearn.cluster import  DBSCAN

In [52]:
%%time

# create a tfidfvectorizer
vect = TfidfVectorizer(ngram_range=(1,2), min_df=.01, stop_words='english')
fit_vect = vect.fit_transform(X)

# instantiate and fit a dbscan clustering algorithm
dbscan = DBSCAN(eps=0.5, min_samples=10)
dbscan.fit(fit_vect)

# add the clusters to the original data
df['cluster_num'] = dbscan.labels_

Wall time: 1.82 s


In [53]:
# review the results
review_clusters(df)

CLUSTER: 0
1      REDWOOD CITY, CA 94065
98     REDWOOD CITY, CA 94065
213    REDWOOD CITY, CA 94065
322    REDWOOD CITY, CA 94065
491    REDWOOD CITY, CA 94065
Name: section_name, dtype: object

CLUSTER: 1
2      CIK\t0000777676
99     CIK\t0000777676
214    CIK\t0000777676
323    CIK\t0000777676
492    CIK\t0000777676
Name: section_name, dtype: object

CLUSTER: 2
4                     SECURITIES AND EXCHANGE COMMISSION
7      ORACLE SYSTEMS CORPORATION 1994 FORM 10-K ANNU...
33     ITEM 4. SUBMISSION OF MATTERS TO A VOTE OF SEC...
46     ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCO...
104    SIGNATURES     ..................................
Name: section_name, dtype: object

CLUSTER: 3
45     ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY...
146    ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY...
253    ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY...
362    ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY...
532    ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY...
Name: section_na

### Repeat the process using AgglomerativeClustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [50]:
%%time

# create a tfidfvectorizer
vect = TfidfVectorizer(ngram_range=(1,2), min_df=.01, stop_words='english')
fit_vect = vect.fit_transform(X)

# instantiate and fit a dbscan clustering algorithm
ac = AgglomerativeClustering(n_clusters=20)
ac.fit(fit_vect.toarray())

# add the clusters to the original data
df['cluster_num'] = ac.labels_

Wall time: 19.8 s


In [51]:
# review the results
review_clusters(df)

CLUSTER: 0
47     PART III ITEM 10. DIRECTORS AND EXECUTIVE OFFI...
48                       ITEM 11. EXECUTIVE COMPENSATION
49     ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFIC...
50     ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRA...
102                 DOCUMENTS INCORPORATED BY REFERENCE:
Name: section_name, dtype: object

CLUSTER: 1
1                               REDWOOD CITY, CA 94065
2                                      CIK\t0000777676
4                   SECURITIES AND EXCHANGE COMMISSION
5    FORM 10-K [X] ANNUAL REPORT PURSUANT TO SECTIO...
7    ORACLE SYSTEMS CORPORATION 1994 FORM 10-K ANNU...
Name: section_name, dtype: object

CLUSTER: 2
32                            ITEM 3. LEGAL PROCEEDINGS
52    II. AMOUNTS RECEIVABLE FROM RELATED PARTIES AN...
87                                           LITIGATION
94    ORACLE SYSTEMS CORPORATION INDEX OF EXHIBITS A...
95                                    LIST OF ENTITIES:
Name: section_name, dtype: object

CLUSTER: 3
35    PART 