# perl_preprocess_dataset.pl
# Sample preprocessing code; change or update it according to your needs.
#
# Usage: perl perl_preprocess_dataset.pl [INPUT_FILE]
#
# Reads INPUT_FILE (default: dataset.txt) line by line, normalises each line
# with preprocess_line() and prints the result to STDOUT, one line per input
# line.
use strict;
use warnings;

# Input file used when no file name is given on the command line.
my $DEFAULT_INPUT = 'dataset.txt';

# Literal contraction rewrites, applied in this order to lower-cased text.
# Order matters: the generic "'s" rule must come after the specific
# "here's" / "it's" / "what's" rules, otherwise those would never match.
my @CONTRACTIONS = (
    [ q{aren't}    => 'are not'    ],
    [ q{can't}     => 'cannot'     ],
    [ q{didn't}    => 'did not'    ],
    [ q{doesn't}   => 'does not'   ],
    [ q{don't}     => 'do not'     ],
    [ q{haven't}   => 'have not'   ],
    [ q{here's}    => 'here is'    ],
    [ q{isn't}     => 'is not'     ],
    [ q{it's}      => 'it is'      ],
    [ q{i've}      => 'i have'     ],
    [ q{shouldn't} => 'should not' ],
    [ q{we're}     => 'we are'     ],
    [ q{weren't}   => 'were not'   ],
    [ q{we've}     => 'we have'    ],
    [ q{won't}     => 'will not'   ],
    [ q{what's}    => 'what is'    ],
    [ q{you'll}    => 'you will'   ],
    [ q{sync'd}    => 'synced'     ],
    [ q{i'd}       => 'i would'    ],
    [ q{t'est}     => ' '          ],
    [ q{'s}        => ' '          ],
    [ q{they're}   => 'they are'   ],
    [ q{isn'r}     => 'is not'     ],    # common typo of "isn't"
);

# preprocess_line($line) -> normalised string
#
# Lower-cases one line of text, strips HTML tags (plain and entity-escaped),
# removes quotes and ampersands, pads punctuation with spaces, drops
# hh:mm:ss times and n/n/n dates, expands the contractions listed in
# @CONTRACTIONS, and finally collapses runs of spaces and trims both ends.
# A trailing newline on the input is removed; undef yields ''.
sub preprocess_line {
    my ($text) = @_;
    return '' unless defined $text;

    chomp $text;
    $text = lc $text;

    # Decode the entities that must survive as characters before the
    # generic '&' removal below would turn them into junk tokens.
    $text =~ s/&nbsp;/ /g;
    $text =~ s/&(?:#39|apos);/'/g;

    # Strip literal tags. One pass is enough: afterwards no '<' is followed
    # by a '>' anywhere later on the line.
    $text =~ s/<[^>]*>/ /g;

    # Quotes and ampersands become spaces; the entity names left behind once
    # their '&' is gone ("quot;", "amp;") are removed as well.
    $text =~ s/"/ /g;
    $text =~ s/quot;/ /g;
    $text =~ s/&/ /g;
    $text =~ s/amp;/ /g;

    # Pad angle brackets and turn the leftover "gt;" / "lt;" into brackets,
    # then strip again so entity-escaped markup ("&lt;br /&gt;") is removed.
    $text =~ s/>/ > /g;
    $text =~ s/gt;/ > /g;
    $text =~ s/</ < /g;
    $text =~ s/lt;/ < /g;
    $text =~ s/<[^>]*>/ /g;

    # Drop an unterminated <img ...> through the end of the line. Every '<'
    # is followed by a space at this point, hence the \s*.
    $text =~ s/<\s*img\b.*//;

    # Put spaces around parentheses and between letters and colons.
    $text =~ s/\(/ ( /g;
    $text =~ s/\)/ ) /g;
    $text =~ s/([a-zA-Z]):/$1 :/g;
    $text =~ s/:([a-zA-Z])/: $1/g;

    # Remove hh:mm:ss style times and n/n/n style dates.
    $text =~ s/\d+:\d+:\d+/ /g;
    $text =~ s/\d+\/\d+\/\d+/ /g;

    # Collapse runs of '-' or '=' into one space-padded character.
    $text =~ s/-+/ - /g;
    $text =~ s/=+/ = /g;

    for my $pair (@CONTRACTIONS) {
        my ($from, $to) = @{$pair};
        $text =~ s/\Q$from\E/$to/g;
    }

    # Detach any remaining apostrophes from adjacent letters and digits.
    $text =~ s/([a-z0-9])'/$1 '/g;
    $text =~ s/'([a-z0-9])/' $1/g;

    # Collapse runs of spaces and trim leading/trailing whitespace.
    $text =~ s/ +/ /g;
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;

    return $text;
}

# main($path): stream $path (default: $DEFAULT_INPUT) through
# preprocess_line() and print each result. Dies if the file cannot be opened.
sub main {
    my ($path) = @_;
    $path = $DEFAULT_INPUT unless defined $path && length $path;

    open my $in, '<', $path
        or die "Cannot open '$path' for reading: $!";
    while (my $line = <$in>) {
        print preprocess_line($line), "\n";
    }
    close $in
        or die "Cannot close '$path': $!";

    return;
}

# Run only when executed as a script, so the file can also be loaded with
# require/do for its preprocess_line() function.
main(@ARGV) unless caller;

1;