In [37]:
import csv
import re
import copy
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer
from collections import defaultdict as dd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

In [147]:
# Containers for the rows as read from disk and for the rows after filtering.
rawData = []
processedData = []

# Substring patterns used to spot city-name variants inside a token.
melb = re.compile(r'\w*melb\w*')
bris = re.compile(r'\w*bris\w*')
pert = re.compile(r'\w*pert\w*')
sydn = re.compile(r'\w*sydn\w*')

# NLTK helpers: English stemmer, English stop-word set, and a tweet-aware
# tokenizer that keeps @handles (strip_handles is False).
englishStemmer = SnowballStemmer("english")
stop_words = set(stopwords.words('english'))
tknzr = TweetTokenizer(strip_handles=False)

# Name of the split to process; used to build the input file path below.
dataset = 'train'

# Read every row of the tab-separated file. '"' is the quote character and a
# doubled quote is not treated as an escaped quote.
source_path = '../2019S1-proj2-data/{}-raw.tsv'.format(dataset)
with open(source_path, newline = '') as tsvfile:
    rawData.extend(
        csv.reader(tsvfile, delimiter="\t", quotechar='"', doublequote=False)
    )


# Clean the text column (index 2) of every row in rawData.
# Each row is modified in place: its text is replaced by the list of cleaned
# tokens, and the same row object is then appended to processedData.
for instance in rawData:
    # '{!r}' formats the text with repr(); every single quote (the repr
    # delimiters and any apostrophes) is then removed.
    # NOTE(review): when the text contains an apostrophe, repr() delimits with
    # double quotes instead, and those stay attached to the first and last
    # whitespace-separated words - confirm this is acceptable.
    text = '{!r}'.format(instance[2])
    text = re.sub(r'\'','',text)

    newList = []
    for word in text.split():
        # Skip stop words. The comparison is done on the lowercased word:
        # the previous check ran before lowercasing, so capitalised stop
        # words (e.g. "The") slipped through.
        if(word.lower() in stop_words):
            continue

        # Tokenize the word and keep only its first token; words that
        # produce no token at all are dropped.
        pieces = tknzr.tokenize(word)
        if(pieces == []):
            continue

        # convert all to lowercase
        token = pieces[0].lower()

        # Add a canonical city name whenever the token contains one of the
        # city fragments; the token itself is still processed below.
        # NOTE(review): these are plain substring matches, so e.g. a token
        # containing "pert" anywhere also adds "perth" - confirm intended.
        if(melb.search(token)):
            newList.append("melbourne")
        if(bris.search(token)):
            newList.append("brisbane")
        if(pert.search(token)):
            newList.append("perth")
        if(sydn.search(token)):
            newList.append("sydney")

        # throw website
        token = re.sub(r'^http.*','',token)
        # Strip ASCII punctuation/symbols. The leading '[' of the first range
        # is escaped: an unescaped '[[' at the start of a character class
        # raises "FutureWarning: Possible nested set". The matched characters
        # are unchanged.
        token = re.sub(r'[\[-`!-/:-@{-~]','',token)
        # Drop everything from the first run of five or more identical
        # characters to the end of the token.
        token = re.sub(r'(.)(\1{4,}).*','',token)
        # Stemming with englishStemmer is currently disabled:
#         token = englishStemmer.stem(token)
        # throw only numeric text
        token = re.sub(r'^\d{1,}$','',token)
        # Remove any run of word characters that contains a digit.
        token = re.sub(r'\w*\d\w*','',token)

        # Keep only tokens with at least two characters.
        if(len(token) > 1):
            newList.append(token)

    instance[2] = newList
    processedData.append(instance)

In [139]:
# Nested count table: word -> {instance_ID: occurrences of the word in that instance}.
word_count = {}
for row in processedData:
    row_id, tokens = row[0], row[2]
    for token in tokens:
        per_instance = word_count.get(token)
        if per_instance is None:
            # First sighting of this word: start an empty per-instance counter.
            per_instance = word_count[token] = dd(int)
        per_instance[row_id] += 1
        

In [140]:
# Keep only the words recorded for more than 10 distinct instance IDs
# (the length of a word's table is its number of instances, not its total
# number of occurrences). Each surviving table is deep-copied, so
# duplicate_word_count shares no objects with word_count.
duplicate_word_count = {
    token: copy.deepcopy(per_instance)
    for token, per_instance in word_count.items()
    if len(per_instance) > 10
}

In [141]:
# Inputs for DictVectorizer: one {word: count} mapping per processed row,
# with the row labels (column 1) collected in the same order.
label = [row[1] for row in processedData]

word_count_for_vec = []
for row in processedData:
    counts = dd(int)
    for token in row[2]:
        counts[token] += 1
    word_count_for_vec.append(counts)

# The same mappings restricted to the words present in duplicate_word_count.
# New dictionaries are built, so word_count_for_vec itself is left untouched.
final_word_vec = []
for counts in word_count_for_vec:
    kept = dd(int)
    kept.update(
        (token, n) for token, n in counts.items() if token in duplicate_word_count
    )
    final_word_vec.append(kept)

In [142]:
# Learn the feature-name -> column mapping from the filtered word counts,
# then encode every instance as one row of X.
v = DictVectorizer()
v.fit(final_word_vec)
X = v.transform(final_word_vec)

In [126]:
# Chi-squared feature selection at three sizes, fitted on X and label.
# Original intent: run this only for the train dataset (this cell does not
# check `dataset` itself).
# NOTE(review): the variable names say 10/50/100 but the selectors keep
# 34/185/700 features - confirm how these k values were chosen.
x10train = SelectKBest(score_func=chi2, k=34)
x50train = SelectKBest(score_func=chi2, k=185)
x100train = SelectKBest(score_func=chi2, k=700)

# Fit each selector and keep the reduced matrices, in the same order.
X10_train, X50_train, X100_train = (
    fitted.fit_transform(X, label)
    for fitted in (x10train, x50train, x100train)
)

In [127]:
# Output-file prefixes and the fitted selectors they correspond to (same order).
titles = ['top10','top50','top100']
selectors = [x10train, x50train, x100train]

# Resolve the vectorizer's feature names once instead of once per selected
# feature. DictVectorizer.get_feature_names() was removed in scikit-learn 1.2
# in favour of get_feature_names_out(); fall back to the old method on
# versions that do not have the new one.
if hasattr(v, "get_feature_names_out"):
    feature_names = v.get_feature_names_out()
else:
    feature_names = v.get_feature_names()

# For each selector, the names of the columns it kept, as plain strings.
featuresList = [
    [str(feature_names[feat_num]) for feat_num in selector.get_support(indices=True)]
    for selector in selectors
]

In [128]:
# Write one CSV per feature set to ../preprocessed/<title><dataset>.csv.
# No header row is written. Each line holds: the 1-based row number, the
# count of every selected feature for that instance (0 when the instance
# does not contain it), and finally the instance label.
for title, _selector, features in zip(titles, selectors, featuresList):
    out_path = '../preprocessed/{}{}.csv'.format(title,dataset)
    with open(out_path,"w+") as file:
        for row_number, instance in enumerate(final_word_vec, start=1):
            fields = [str(row_number)]
            for feature in features:
                fields.append(str(instance[feature]) if feature in instance else "0")
            fields.append(str(label[row_number - 1]))
            file.write(",".join(fields))
            file.write('\n')


In [143]:
# Display the selected feature names, one inner list per selector.
featuresList

[['annemarie',
  'balmainwpc',
  'barometer',
  'brisbane',
  'dwpcdevils',
  'franking',
  'freo',
  'freodockers',
  'gemmatognini',
  'hpa',
  'humidity',
  'km',
  'melbourne',
  'mm',
  'nsw',
  'ovoawl',
  'perth',
  'qld',
  'queensland',
  'rain',
  'sydney',
  'tax',
  'temperature',
  'vic',
  'victoria',
  'victraffic',
  'voodoo',
  'wa',
  'wales',
  'waterpoloa',
  'waterpoloaus',
  'waterpolosa',
  'western',
  'wind'],
 ['abcbrisbane',
  'advanceqld',
  'afdonnerwetter',
  'afl',
  'afleaglesfreo',
  'afleaglesgiants',
  'aflfantasy',
  'aflpieseagles',
  'aftersch',
  'alert',
  'alexmatthewsar',
  'ang',
  'annemarie',
  'anthonyqld',
  'assessed',
  'astrongeroz',
  'ausconservation',
  'auspol',
  'australia',
  'balmainwpc',
  'barometer',
  'belungerer',
  'bitdturtle',
  'bitw',
  'bluegroperdes',
  'bondi',
  'brisbane',
  'broncos',
  'brookeshields',
  'btstwt',
  'bud',
  'callmesky',
  'carodirusso',
  'cellarbrations',
  'chronicfinders',
  'closed',
  'com

In [144]:
# Show which dataset split name is currently set in the kernel.
# NOTE(review): the recorded output is 'test' while the preprocessing cell
# assigns 'train' - the value appears to have been changed between runs.
dataset

'test'

In [145]:
# for making dev and test
# Write one CSV per feature set to ../preprocessed/<title><dataset>.csv using
# whatever `dataset`, `final_word_vec` and `label` currently hold.
# NOTE(review): presumably the preprocessing cells are re-run with another
# `dataset` value while `featuresList` is kept from the train run - confirm.
# No header row is written. Each line holds: the 1-based row number, the
# count of every selected feature (0 when absent), and the instance label.
for title, _selector, features in zip(titles, selectors, featuresList):
    out_path = '../preprocessed/{}{}.csv'.format(title,dataset)
    with open(out_path,"w+") as file:
        for row_number, instance in enumerate(final_word_vec, start=1):
            fields = [str(row_number)]
            fields.extend(
                str(instance[feature]) if feature in instance else "0"
                for feature in features
            )
            fields.append(str(label[row_number - 1]))
            file.write(",".join(fields))
            file.write('\n')
