# Riot URAP Text Analysis
# Feature Count Generation
### Author : Jaehyun Sim

Import the required libraries

In [1]:
import sys, getopt, os, csv, re
import numpy as np
import random
import shutil
from collections import Counter
from nltk.stem import *

Define the common (stop) words and the non-alphabetic characters to remove from the text

In [2]:
# Single characters that get blanked out (replaced by a space) before the text
# is split into words: ASCII punctuation, line breaks, and a few typographic
# symbols, including the byte-order mark "\ufeff".
non_alphabet_char = [
    # sentence punctuation, quotes and line breaks
    ",", ".", "\'", "\"", "\n", "\r", "?", "!",
    # brackets, dash and underscore
    "[", "]", "(", ")", "{", "}", "-", "_",
    # remaining ASCII symbols
    "#", "~", "`", "@", "$", "%", "^", "&", "*", "+", "=",
    "<", ">", "/", ":", ";", "|",
    # non-ASCII symbols, backslash and the byte-order mark
    "■", "•", "\\", "\ufeff", "“", "’", "®", "©", "—", "”",
]

# Very frequent English words that are dropped from every document before
# counting. Entries are lowercase and kept in alphabetical order.
common_words_list = [
    'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also',
    'am', 'among', 'an', 'and', 'any', 'are', 'as', 'at',
    'be', 'because', 'been', 'but', 'by',
    'can', 'cannot', 'could',
    'dear', 'did', 'do', 'does',
    'either', 'else', 'ever', 'every',
    'for', 'from',
    'get', 'got',
    'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how', 'however',
    'i', 'if', 'in', 'into', 'is', 'it', 'its',
    'just',
    'least', 'let', 'like', 'likely',
    'may', 'me', 'might', 'most', 'must', 'my',
    'neither', 'no', 'nor', 'not',
    'of', 'off', 'often', 'on', 'only', 'or', 'other', 'our', 'own',
    'rather',
    'said', 'say', 'says', 'she', 'should', 'since', 'so', 'some',
    'than', 'that', 'the', 'their', 'them', 'then', 'there',
    'these', 'they', 'this', 'tis', 'to', 'too', 'twas',
    'us',
    'wants', 'was', 'we', 'were', 'what', 'when', 'where', 'which',
    'while', 'who', 'whom', 'why', 'will', 'with', 'would',
    'yet', 'you', 'your',
]

Define the paths to the data

In [3]:
# Directories of documents labelled "related"
related_data_path = "./data/related"
related_train_data_path = "./data/related_train"
related_test_data_path = "./data/related_test"

# Directories of documents labelled "not related"
not_related_data_path = "./data/not_related"
not_related_train_data_path = "./data/not_related_train"
not_related_test_data_path = "./data/not_related_test"

# CSV file whose first row lists the features to count
features_csv = "./features.csv"

Define a few helper functions

In [4]:
def replacer(line, replacing):
    """Return `line` with every item that occurs in `replacing` turned into a space.

    `line` is walked item by item (character by character for a string).
    Items that are members of `replacing` become " "; all others are kept
    unchanged. The pieces are joined back into a single string.
    """
    pieces = []
    for item in line:
        if item in replacing:
            pieces.append(" ")
        else:
            pieces.append(item)
    return ''.join(pieces)

def file_counter(source_dir):
    """Count the entries of `source_dir` whose names end with ".txt".

    Returns 0 when `source_dir` is not an existing directory.
    """
    # Nothing to count unless the path is a directory
    if not os.path.isdir(source_dir):
        return 0

    # One tally per directory entry carrying the .txt suffix
    return sum(1 for entry in os.listdir(source_dir) if entry.endswith(".txt"))

Load features into workspace

In [5]:
# Load the feature list: it is the first row of the features CSV file.
# newline='' is the mode the csv module asks for when it is handed a file
# object. The `with` block closes the file on exit, so no explicit close()
# is needed afterwards.
with open(features_csv, 'r', newline='') as f:
    reader = csv.reader(f)
    # Only the first row is used, so read just that row instead of the whole file
    features = next(reader, None)

# Fail with a clear message instead of an opaque IndexError on an empty file
if features is None:
    raise ValueError("No feature row found in " + features_csv)

Define the feature occurrence counter function

In [8]:
def _label_for_directory(source_dir):
    """Return 1 for a "related" train/test directory and 0 for a "not related" one.

    Paths are compared after os.path.normpath so that spellings such as a
    trailing slash still match. Raises ValueError for any other directory,
    because no label can be derived for it.
    """
    normalized = os.path.normpath(source_dir)
    related_dirs = (os.path.normpath(related_train_data_path),
                    os.path.normpath(related_test_data_path))
    not_related_dirs = (os.path.normpath(not_related_train_data_path),
                        os.path.normpath(not_related_test_data_path))

    if normalized in not_related_dirs:
        return 0
    if normalized in related_dirs:
        return 1
    raise ValueError("Cannot determine the label for directory " + source_dir)


def _stemmed_tokens(text, stemmer, stop_words):
    """Turn raw document text into the list of stemmed words that get counted."""
    # Lowercase everything, then blank out punctuation and other symbols
    cleaned = replacer(text.lower(), non_alphabet_char)
    # Split on whitespace, drop common words and words of 3 letters or fewer,
    # and stem whatever is left
    return [stemmer.stem(token)
            for token in cleaned.split()
            if token not in stop_words and len(token) > 3]


def feature_counter(input_features, source_dir_lst, output_filename):
    """Write a CSV of per-document feature counts for the given directories.

    The output file is "./<output_filename>.csv". Its first row holds the
    feature names followed by "_label_". Every following row describes one
    ".txt" file: how often each feature occurs among the file's stemmed
    words, followed by the label of the directory the file came from
    (1 = related, 0 = not related).

    Parameters
    ----------
    input_features : list of str
        Features (stemmed words) to count, one output column each. If a
        feature is listed more than once, occurrences are credited to its
        first column.
    source_dir_lst : list of str
        Directories to scan. Entries that are not existing directories are
        skipped silently.
    output_filename : str
        Output file name without the ".csv" extension.

    Raises
    ------
    ValueError
        If an existing directory in `source_dir_lst` is not one of the
        related / not-related train or test directories, so that no label
        can be assigned to its documents.
    """
    # Initiate a Porter Stemmer Class
    porter_stemmer = PorterStemmer()

    # Set membership is O(1); the word list itself is left untouched
    stop_words = frozenset(common_words_list)

    # Map each feature to its output column. setdefault keeps the FIRST
    # column of a duplicated feature, which is what list.index() would give.
    feature_column = {}
    for column, feature in enumerate(input_features):
        feature_column.setdefault(feature, column)

    # newline='' is the mode the csv module asks for; the `with` block
    # guarantees the output file is closed even if a document fails to load
    with open("./" + output_filename + ".csv", "w", newline='') as output_csv:
        writer = csv.writer(output_csv, quoting=csv.QUOTE_ALL)
        writer.writerow(list(input_features) + ["_label_"])

        # Iterate through all source directories in source_dir_lst
        for source_dir in source_dir_lst:

            # Skip anything that is not an existing directory
            if not os.path.isdir(source_dir):
                continue

            print("Counting words for " + source_dir + " directory")

            # Resolve the label once per directory; an unknown directory is
            # an error rather than silently reusing the previous label
            curr_label = _label_for_directory(source_dir)

            # os.listdir returns entries in arbitrary order; sorting makes
            # the row order of the output reproducible
            for file_name in sorted(os.listdir(source_dir)):
                # Select files that end with .txt
                if not file_name.endswith(".txt"):
                    continue

                with open(source_dir + '/' + file_name, 'r') as f:
                    text = f.read()

                # Count the occurrences of words that are in the feature set
                curr_count = [0] * len(input_features)
                for word in _stemmed_tokens(text, porter_stemmer, stop_words):
                    column = feature_column.get(word)
                    if column is not None:
                        curr_count[column] += 1

                writer.writerow(curr_count + [curr_label])

Now, count the features for the training set

In [9]:
# Count the features in both training directories and write ./trainset.csv
feature_counter(
    input_features=features,
    source_dir_lst=[related_train_data_path, not_related_train_data_path],
    output_filename="trainset",
)

Counting words for ./data/related_train directory
Counting words for ./data/not_related_train directory


Now, count the features for the test set

In [10]:
# Count the features in both test directories and write ./testset.csv
feature_counter(
    input_features=features,
    source_dir_lst=[related_test_data_path, not_related_test_data_path],
    output_filename="testset",
)

Counting words for ./data/related_test directory
Counting words for ./data/not_related_test directory
