# Riot URAP Text Analysis
# Word Feature Generation
### Author : Jaehyun Sim

Import Required libraries

In [43]:
import sys, getopt, os, csv, re
import numpy as np
import random
import shutil
from collections import Counter
from multiprocessing import Pool
from nltk.stem import *

Define the common (stop) words and the non-alphabet characters that will be stripped from the text

In [2]:
# Characters that get blanked out (turned into a space) before a text is split
# into words: ASCII punctuation and symbols, line breaks, the byte-order mark
# and a handful of typographic marks.
non_alphabet_char = [
    ",", ".", "\'", "\"", "\n", "\r", "?", "!",
    "[", "]", "(", ")", "{", "}", "-", "_",
    "#", "~", "`", "@", "$", "%", "^", "&", "*",
    "+", "=", "<", ">", "/", ":", ";", "|",
    "■", "•", "\\", "\ufeff", "“", "’", "®", "©", "—", "”",
]

# Very frequent English words that are dropped before stemming.
common_words_list = [
    'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am',
    'among', 'an', 'and', 'any', 'are', 'as', 'at',
    'be', 'because', 'been', 'but', 'by',
    'can', 'cannot', 'could',
    'dear', 'did', 'do', 'does',
    'either', 'else', 'ever', 'every',
    'for', 'from',
    'get', 'got',
    'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however',
    'i', 'if', 'in', 'into', 'is', 'it', 'its',
    'just',
    'least', 'let', 'like', 'likely',
    'may', 'me', 'might', 'most', 'must', 'my',
    'neither', 'no', 'nor', 'not',
    'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own',
    'rather',
    'said', 'say', 'says', 'she', 'should', 'since', 'so', 'some',
    'than', 'that', 'the', 'their', 'them', 'then', 'there', 'these', 'they',
    'this', 'tis', 'to', 'too', 'twas',
    'us',
    'wants', 'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while',
    'who', 'whom', 'why', 'will', 'with', 'would',
    'yet', 'you', 'your',
]

In [3]:
# To perform feature generation, follow these steps

# I) Create word features for each document
# for each document:
#     i) Break the whole text into words
#     ii) Change all words to lower case
#     iii) Remove all punctuation and non-alphabet characters
#     iv) Apply the Porter stemmer to each word

# II) Count, for each stem, the number of documents in the whole set in which it occurs
#     and discard stems that do not occur in at least 1.5% of the whole set of documents
#     and also discard stems that occur in over 90% of the whole set of documents

First, define the path for data

In [30]:
# Directory whose .txt files are the related documents
related_data_path = './data/related'
# Directory whose .txt files are the full pool of not-related documents
not_related_data_path = './data/not_related'
# Directory that receives the random sample copied out of the not-related pool
sampled_not_related_data_path = './data/sampled_not_related'

Define a few helper functions

In [5]:
def replacer(line, replacing):
    """Return ``line`` with every character contained in ``replacing`` swapped for a space.

    line      -- the text to clean
    replacing -- collection of single characters (list, set or string) to blank out

    Characters that are not in ``replacing`` are copied through unchanged, so
    the result always has the same length as ``line``.
    """
    # Build a hash-based lookup once instead of scanning the whole collection
    # for every single character of the text.
    try:
        targets = frozenset(replacing)
    except TypeError:
        # Unhashable items in the collection: fall back to plain membership tests
        targets = replacing
    return ''.join(" " if char in targets else char for char in line)

In [32]:
def file_counter(source_dir):
    """Count the entries of ``source_dir`` whose names end with ``.txt``.

    Returns 0 when ``source_dir`` is not an existing directory.
    """
    # Nothing to count when the path is not a directory
    if not os.path.isdir(source_dir):
        return 0

    # One tick for every directory entry that carries the .txt suffix
    return sum(1 for entry in os.listdir(source_dir) if entry.endswith(".txt"))

In [6]:
def feature_selector(source_dir_lst, min_word_len=4):
    """Count, for every word stem, the number of .txt documents it appears in.

    source_dir_lst -- list of directory paths; paths that are not existing
                      directories are skipped silently
    min_word_len   -- shortest word (in characters, measured before stemming)
                      that is kept; the default of 4 drops every word of
                      3 letters or fewer

    Each .txt file is lower-cased, stripped of the characters listed in
    non_alphabet_char, split on whitespace, filtered against
    common_words_list and min_word_len, and stemmed with the Porter stemmer.
    A stem is counted once per document, however often it occurs inside it.

    Returns a Counter mapping stem -> number of documents containing it.
    Files are opened with the platform's default text encoding.
    """
    # Create a counter class for the per-stem document counts
    word_occurence_counter = Counter()

    # Initiate a Porter Stemmer Class
    porter_stemmer = PorterStemmer()

    # Hash-based lookup so stop-word filtering does not scan the list per word
    stop_words = frozenset(common_words_list)

    # Iterate through all source directory in source_dir_lst
    for source_dir in source_dir_lst:

        # Skip anything that is not an existing directory
        if not os.path.isdir(source_dir):
            continue

        print("Selecting words for " + source_dir + " directory")

        # Sorted so the insertion order of the counter (and therefore the
        # order of the resulting feature list) does not depend on the
        # arbitrary order in which os.listdir reports entries
        for file_name in sorted(os.listdir(source_dir)):
            # Select files that are end with .txt
            if not file_name.endswith(".txt"):
                continue

            # The with-block closes the file even if reading fails
            with open(os.path.join(source_dir, file_name), 'r') as f:
                # Read the whole file and convert all letters to lowercase
                text = f.read().lower()

            # Replace non alphabet characters with spaces
            text = replacer(text, non_alphabet_char)
            # Split into words, dropping common words and words shorter
            # than min_word_len characters
            words = [word for word in text.split()
                     if word not in stop_words and len(word) >= min_word_len]
            # Apply Porter stemmer for all the words we have
            stems = [porter_stemmer.stem(word) for word in words]
            # Count each distinct stem once for this document; updating in
            # place avoids rebuilding the whole counter for every file, and
            # dict.fromkeys keeps the first-occurrence order of the stems
            word_occurence_counter.update(dict.fromkeys(stems, 1))

    return word_occurence_counter

In [85]:
def occurence_checker(curr_counter, source_dir_lst):
    """Return the words of ``curr_counter`` that are neither too rare nor too common.

    curr_counter   -- mapping word -> count (as produced by feature_selector)
    source_dir_lst -- list of directories whose .txt files make up the
                      document set; paths that are not directories add 0

    With N the total number of .txt files, a word is kept only when its count
    is strictly greater than round(N * 3 / 200) (1.5% of N) and strictly
    less than round(N * 9 / 10) (90% of N). Words whose count is exactly
    equal to either cut-off are discarded.
    """
    # Reuse the shared helper instead of repeating its directory walk here
    txt_file_count = sum(file_counter(source_dir) for source_dir in source_dir_lst)

    too_small_count = round((txt_file_count/200)*3)
    too_many_count = round((txt_file_count/10)*9)

    return [word for word, doc_count in curr_counter.items()
            if too_small_count < doc_count < too_many_count]

Since we have only 149 related documents while the number of unrelated documents is over 20000, we randomly select 500 unrelated documents to create the feature set.  
To do this, create a function that generates random indexes within a given range.

In [35]:
def random_index(numb_ind, start_range, end_range, seed=None):
    """Draw ``numb_ind`` distinct integers from ``range(start_range, end_range)``.

    numb_ind    -- how many indexes to draw
    start_range -- smallest index that may be drawn (inclusive)
    end_range   -- upper bound of the indexes (exclusive)
    seed        -- optional seed; when given, the same arguments always yield
                   the same list, which makes the sample reproducible. When
                   omitted, the module-level random generator is used.

    Returns a list of ``numb_ind`` unique integers in random order.
    Raises ValueError when ``numb_ind`` exceeds the size of the range.
    """
    population = range(start_range, end_range)
    if seed is None:
        return random.sample(population, numb_ind)
    # A private generator keeps the seeded draw from disturbing global state
    return random.Random(seed).sample(population, numb_ind)

Also, define a function that copies the sampled files into a new directory, given a list of indexes.

In [52]:
def make_new_unrelated_dir(unrelated_source_dir, new_source_dir, index_lst):
    """Copy the .txt files selected by ``index_lst`` into a fresh ``new_source_dir``.

    unrelated_source_dir -- directory holding the pool of .txt files
    new_source_dir       -- destination directory; it is deleted if it already
                            exists and then recreated empty
    index_lst            -- positions of the files to copy, counted over the
                            .txt files of the pool only (sorted by name),
                            starting at 0

    When ``unrelated_source_dir`` is not a directory, the destination is still
    recreated but nothing is copied.

    Raises ValueError when both paths point at the same directory, because
    recreating the destination would wipe the pool itself.
    """
    if os.path.realpath(unrelated_source_dir) == os.path.realpath(new_source_dir):
        raise ValueError("new_source_dir must differ from unrelated_source_dir")

    # Delete existing new source directory
    if os.path.exists(new_source_dir):
        shutil.rmtree(new_source_dir)

    # Create a new directory
    os.makedirs(new_source_dir)

    if not os.path.isdir(unrelated_source_dir):
        return

    # Set gives constant-time membership tests for the selected positions
    wanted_positions = set(index_lst)

    # Only .txt files take part in the numbering, so the positions line up
    # with indexes drawn from the number of .txt files even when the pool
    # contains other entries. Sorting makes the numbering independent of the
    # arbitrary order returned by os.listdir.
    txt_files = sorted(entry for entry in os.listdir(unrelated_source_dir)
                       if entry.endswith(".txt"))

    for position, file_name in enumerate(txt_files):
        if position in wanted_positions:
            shutil.copyfile(os.path.join(unrelated_source_dir, file_name),
                            os.path.join(new_source_dir, file_name))

Now create a new directory of not-related .txt files from the sampled indexes

In [76]:
# Number of not-related documents to sample for the feature set
not_related_sample_size = 500
# Size of the pool the indexes are drawn from
not_related_txt_count = file_counter(not_related_data_path)
# Cap the request at the pool size: asking random.sample for more items than
# the population holds raises ValueError
sample_index = random_index(min(not_related_sample_size, not_related_txt_count),
                            0, not_related_txt_count)
make_new_unrelated_dir(not_related_data_path, sampled_not_related_data_path, sample_index)

Finally, we generate the feature set

In [78]:
# First, perform step I): for every stem, count the number of documents
# (related plus sampled not-related) in which it appears
before_trimmed = feature_selector([related_data_path, sampled_not_related_data_path])

Selecting words for ./data/related directory
Selecting words for ./data/sampled_not_related directory


In [86]:
# Next, perform step II): keep only the stems whose document count passes
# the too-rare / too-common cut-offs applied by occurence_checker
after_trimmed = occurence_checker(before_trimmed, [related_data_path, sampled_not_related_data_path])

Check how large the feature set is. An acceptable size is between 2500 and 4000.

In [92]:
# Number of stems left in the feature set after trimming
len(after_trimmed)

3108

Now output the feature set as a CSV file

In [93]:
# Write the whole feature set as one fully-quoted CSV row.
# newline='' hands line-ending control to the csv module (otherwise an extra
# carriage return can be written on platforms that translate newlines), and
# the with-block closes the file even if writing fails.
with open("./features.csv", "w", newline='') as output_csv:
    writer = csv.writer(output_csv, quoting=csv.QUOTE_ALL)
    writer.writerow(after_trimmed)