In [1]:
from os import listdir
from os.path import isfile, join
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np


In [2]:
# Helpers for locating the input text files in the 'data' sub-directory

# Directory that is scanned for input files; the relative path is resolved
# against the current working directory
DIRECTORY = 'data/'

# function to get a list of file names of interest
def getRelevantFiles(string, directory=None):
    """Return the names of the data files selected by ``string``.

    Parameters
    ----------
    string : str
        Selector for the files of interest:

        * ``'pk'`` or ``'dr'`` -- every file whose name starts with that prefix;
        * ``''`` -- every file in the directory;
        * anything else -- treated as a file stem; the result is the
          one-element list ``[string + '.txt']`` (existence is not checked).
    directory : str, optional
        Directory to scan. Defaults to the module-level ``DIRECTORY``.

    Returns
    -------
    list of str
        Bare file names (no directory part). Listings are sorted
        alphabetically so the result does not depend on the arbitrary order
        in which ``os.listdir`` reports directory entries.
    """
    if directory is None:
        directory = DIRECTORY

    # Regular files only (sub-directories are skipped). The directory is listed
    # for every selector, so a missing directory always raises here.
    all_files = sorted(f for f in listdir(directory) if isfile(join(directory, f)))

    if string in ('pk', 'dr'):
        # prefix selector
        return [f for f in all_files if f.startswith(string)]
    if string == "":
        # empty selector: everything
        return all_files
    # single-file selector
    return [string + '.txt']


In [4]:
def sort_dictionary( D ):
    """Return the (key, value) pairs of ``D`` as a list ordered by value, largest first.

    Pairs whose values compare equal keep the relative order they have in ``D``.
    """
    pairs = list(D.items()) # one tuple per key-value pair
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

def stripWord( w ):
    """Normalise a token: delete the characters . , ; : ' & and newline, then lower-case it.

    Any other character (for example ``!``, ``?`` or a dash) is left in place.
    """
    unwanted = ".,;:'&\n" # every character that is removed from the token
    for ch in unwanted:
        w = w.replace( ch, "" )

    return w.lower()

def build_dictionary(string):
    """Build a word -> relative-frequency table for the files selected by ``string``.

    ``string`` is handed to ``getRelevantFiles``; each returned file is opened
    inside ``DIRECTORY``. Every line is split on single spaces, each piece is
    normalised with ``stripWord``, and empty pieces and a lone em dash ('—')
    are ignored.

    Parameters
    ----------
    string : str
        File selector understood by ``getRelevantFiles``.

    Returns
    -------
    dict
        Maps each word to its share of all counted words, in percent.
        Empty when no words were counted.

    Raises
    ------
    OSError
        If one of the selected files cannot be opened.
    """
    counts = {}
    total_words = 0 # number of counted words across all selected files

    for file_name in getRelevantFiles(string):
        # 'with' closes the file even if reading raises part-way through
        with open( join(DIRECTORY, file_name), 'r' ) as myfile:
            for line in myfile: # read one line at a time
                for w in line.split(" "):
                    w = stripWord( w )
                    if len(w) > 0 and w != '—':
                        total_words += 1
                        counts[w] = counts.get(w, 0) + 1

    # Convert counts to percentages. With no words the comprehension body never
    # runs, so total_words == 0 cannot cause a division by zero.
    return {word: count * ( 100/total_words ) for word, count in counts.items()}

# NOTE(review): this repeats the sort_dictionary definition found earlier in this cell
def sort_dictionary( D ):
    """Order the entries of ``D`` from highest to lowest value.

    Returns a list of (key, value) tuples, one per entry of ``D``; entries with
    equal values stay in the order in which ``D`` yields them.
    """
    def entry_value( entry ):
        # the value half of a (key, value) tuple
        return entry[1]

    return sorted( D.items(), key=entry_value, reverse=True )


In [None]:
def build_dictionary_count(string):
    """Build a word -> occurrence-count table for the files selected by ``string``.

    Same tokenisation as ``build_dictionary`` (split each line on single
    spaces, normalise with ``stripWord``, ignore empty pieces and a lone em
    dash '—'), but the raw counts are returned instead of percentages.

    Parameters
    ----------
    string : str
        File selector understood by ``getRelevantFiles``.

    Returns
    -------
    dict
        Maps each word to the number of times it occurs across the selected
        files. Empty when no words were counted.

    Raises
    ------
    OSError
        If one of the selected files cannot be opened.
    """
    counts = {}

    for file_name in getRelevantFiles(string):
        # 'with' closes the file even if reading raises part-way through
        with open( join(DIRECTORY, file_name), 'r' ) as myfile:
            for line in myfile: # read one line at a time
                for w in line.split(" "):
                    w = stripWord( w )
                    if len(w) > 0 and w != '—':
                        counts[w] = counts.get(w, 0) + 1

    return counts

def print_dictionary( D ):
    """Print every entry of ``D`` as ``key value``, one per line, smallest value first."""
    ascending = list(D.items())
    ascending.sort(key=lambda entry: entry[1]) # order by value, lowest to highest
    for word, count in ascending:
        print(word, count)


In [None]:
# driver code: exercise each helper once

# getRelevantFiles -- prefix selector, "all files" selector, single-file selector
for selector in ('pk', '', 'dr2'):
    print(getRelevantFiles(selector))

# build_dictionary, then sort_dictionary (the sorted list is not kept)
D = build_dictionary("pk")
sort_dictionary(D)

# build_dictionary_count, then print_dictionary; D now holds raw counts
D = build_dictionary_count("pk")
print_dictionary(D)

In [None]:
# Collect the 25 most frequent words over every file in the 'data' sub-directory
# (expected to be the 18 pk and dr files)

D = build_dictionary("") # "" selects every file
top_25 = sort_dictionary(D)[:25] # (word, frequency) pairs, most frequent first
top_25_words = [pair[0] for pair in top_25] # keep only the words

print(top_25_words)

In [None]:
def populate_matrices(string):
    """Build a files x words frequency matrix for the files selected by ``string``.

    Row ``i`` belongs to the ``i``-th file returned by
    ``getRelevantFiles(string)`` and column ``j`` to ``top_25_words[j]`` (a
    module-level list that must already be defined). Each entry is the value
    ``build_dictionary`` reports for that word in that file; words missing
    from a file get 0.

    Parameters
    ----------
    string : str
        File selector understood by ``getRelevantFiles`` (e.g. ``'pk'``).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(number of files, len(top_25_words))``.
    """
    # Drop the last 4 characters (the extension) to get the stem that
    # build_dictionary is called with.
    # NOTE(review): assumes every selected file name ends in '.txt' -- confirm.
    file_stems = [name[:-4] for name in getRelevantFiles(string)]

    # Size the matrix from the actual inputs rather than a fixed 9 x 25, so any
    # number of files or words works.
    matrix = np.zeros((len(file_stems), len(top_25_words)))

    for row, stem in enumerate(file_stems):
        frequencies = build_dictionary(stem)
        for col, word in enumerate(top_25_words):
            # Store the value unrounded: casting to int would truncate every
            # fractional frequency (anything below 1 becomes 0).
            matrix[row, col] = frequencies.get(word, 0.0)
    return matrix


# Build the word-frequency matrices for the two file sets;
# rows follow the files returned by getRelevantFiles for each prefix.
F1 = populate_matrices('pk') # matrix for the 'pk'-prefixed files
F2 = populate_matrices('dr') # matrix for the 'dr'-prefixed files

In [None]:
# Display the matrix built from the 'pk' files
F1

In [None]:
# Display the matrix built from the 'dr' files
F2

In [None]:
# Dimensionality reduction and visualization:
# project each file's word-frequency row onto the first two principal components
F   = np.concatenate((F1,F2),axis=0) # F1 rows first, then F2 rows
pca = PCA(n_components=2)
Fp  = pca.fit(F).transform(F)

# Split the projected rows by where F1 ends instead of assuming 9 rows per set
n_pk = len(F1)

fig, ax = plt.subplots()
ax.scatter( Fp[:n_pk,0], Fp[:n_pk,1], color='b', label='pk files' )
ax.scatter( Fp[n_pk:,0], Fp[n_pk:,1], color='r', label='dr files' )
ax.set_xlabel('Principal component 1')
ax.set_ylabel('Principal component 2')
ax.set_title('Word-frequency features of pk and dr files (PCA, 2 components)')
ax.legend()
plt.show()
