-
Notifications
You must be signed in to change notification settings - Fork 0
/
02 - Preprocessing.cpp
285 lines (251 loc) · 10.2 KB
/
02 - Preprocessing.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
#include<iostream>
#include<cctype> //for isalpha() function
#include<vector> //for vector class
#include<sstream> //for stringstream class
#include<unordered_set> //for unordered_set
#include "C:\Users\Prakhar Jadaun\Desktop\Study\5th sem\Project\Main\Final Analysis\Feature Extraction\OleanderStemmingLibrary-master\include\olestem\stemming\english_stem.h"
//class preprocessing to perform the preprocessing steps
#pragma once
//Text preprocessing steps for a mail given as a (subject, message) pair:
//special-character removal, lowercasing, tokenization, stop-word removal and stemming.
//In every pair taken or returned below, .first belongs to the subject and .second to the message.
class Preprocessing
{
private:
//working copies of the subject and the message text for the member functions;
//nothing in the public interface exposes them
std::string sub;
std::string mes;
public:
//returns true when ch is a digit or a punctuation character that preprocessing strips
bool checkSpecialChar(char ch);
//returns val with every special character (see checkSpecialChar) replaced by a space
std::pair<std::string,std::string> removeSpecialChar(std::pair<std::string,std::string> val);
//returns val with its uppercase letters converted to lowercase
std::pair<std::string,std::string> lowercaseConversion(std::pair<std::string,std::string> val);
//tokenization: splits the subject and the message into tokens;
//returns (subject tokens, message tokens)
std::pair<std::vector<std::string>,std::vector<std::string>> tokenize(std::pair<std::string,std::string> val);
//returns true when s is one of the stop words in the built-in list
bool checkStopWords(std::string s);
//removal of stop words (as decided by checkStopWords) from both token lists
std::pair<std::vector<std::string>,std::vector<std::string>> removalOfStopWords(std::pair<std::vector<std::string>,std::vector<std::string>> tokens);
//removal of hyperlinks: no member function is declared for this step
//stemming: stems the subject and message tokens and returns the distinct stems
//as one vector (they pass through an unordered_set, so the order is unspecified)
std::vector<std::string> stemWords(std::pair<std::vector<std::string>,std::vector<std::string>> val);
//lemmatization: no member function is declared for this step
};
//Reports whether ch has to be stripped from the text: the digits 0-9 and the
//punctuation characters !"#$%&'()*+,-./:;< = >?@[\]^_`{|}~
//Every other character (letters, whitespace, anything else) gives false.
//Digits are included because numbers are removed from the dataset as well.
bool Preprocessing::checkSpecialChar(char ch)
{
    //all characters that count as special, kept in a single lookup string
    static const std::string specialChars = "0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
    return specialChars.find(ch) != std::string::npos;
}
//Replaces every special character (see checkSpecialChar) in the subject and
//in the message with one space, so both strings keep their original length.
//val     : pair of (subject, message)
//returns : the same pair with the special characters blanked out
std::pair<std::string,std::string> Preprocessing::removeSpecialChar(std::pair<std::string,std::string> val)
{
    //val is passed by value, so it is already a private copy: it is edited in
    //place instead of being copied into the sub/mes members and back again.
    //Range-for also avoids comparing a signed int index against size().
    for (char& ch : val.first)
    {
        if (checkSpecialChar(ch))
        {
            ch = ' ';
        }
    }
    for (char& ch : val.second)
    {
        if (checkSpecialChar(ch))
        {
            ch = ' ';
        }
    }
    return val;
}
//Converts every uppercase letter of the subject and of the message to lowercase.
//val     : pair of (subject, message)
//returns : the same pair, lowercased; all other characters are left unchanged
std::pair<std::string,std::string> Preprocessing::lowercaseConversion(std::pair<std::string,std::string> val)
{
    //The <cctype> functions have undefined behaviour for negative arguments
    //(other than EOF), and a byte >= 0x80 is negative where char is signed,
    //so every character goes through unsigned char first.
    //tolower() already returns its argument unchanged when it is not an
    //uppercase letter, so no separate isalpha()/isupper() test is needed.
    const auto toLower = [](char ch) -> char
    {
        return static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
    };
    //val is a by-value copy, so it is edited in place
    for (char& ch : val.first)
    {
        ch = toLower(ch);
    }
    for (char& ch : val.second)
    {
        ch = toLower(ch);
    }
    return val;
}
//Splits the subject and the message of a mail into word tokens.
//val     : pair of (subject, message)
//returns : pair of (subject tokens, message tokens), each in order of appearance
//Tokens are separated by any run of whitespace, so leading or repeated spaces
//(which removeSpecialChar produces in place of punctuation) never yield empty
//tokens, and tabs and newlines separate words as well.
std::pair<std::vector<std::string>,std::vector<std::string>> Preprocessing::tokenize(std::pair<std::string,std::string> val)
{
    //splits one text into its whitespace-separated words
    const auto splitWords = [](const std::string& text) -> std::vector<std::string>
    {
        std::vector<std::string> words;
        std::stringstream stream(text);
        std::string word;
        //operator>> skips leading whitespace and stops at the next whitespace,
        //so it never extracts an empty word
        while (stream >> word)
        {
            words.push_back(word);
        }
        return words;
    };
    return {splitWords(val.first), splitWords(val.second)};
}
/*
i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mustn't, needn, needn't, shan, shan't, shouldn, shouldn't, wasn, wasn't, weren, weren't, won, won't, wouldn, wouldn't
*/
//Reports whether s is one of the English stop words listed above.
//s       : the token to look up; the comparison is exact and case-sensitive,
//          and every entry of the list is lowercase
//returns : true when s is in the list, false otherwise
bool Preprocessing::checkStopWords(std::string s)
{
    //built once, on the first call, instead of on every call: this function is
    //called once per token, and rebuilding the set each time dominated the cost
    //NOTE(review): the entries containing an apostrophe can only match tokens
    //that still contain one, i.e. text that has not been through
    //removeSpecialChar, which blanks the apostrophe
    static const std::unordered_set<std::string> stopwords = {"i","me","my","myself","we","our","ours","ourselves","you","you're","you've","you'll","you'd","your","yours","yourself","yourselves","he","him","his","himself","she","she's","her","hers","herself","it","it's","its","itself","they","them","their","theirs","themselves","what","which","who","whom","this","that","that'll","these","those","am","is","are","was","were","be","been","being","have","has","had","having","do","does","did","doing","a","an","the","and","but","if","or","because","as","until","while","of","at","by","for","with","about","against","between","into","through","during","before","after","above","below","to","from","up","down","in","out","on","off","over","under","again","further","then","once","here","there","when","where","why","how","all","any","both","each","few","more","most","other","some","such","no","nor","not","only","own","same","so","than","too","very","s","t","can","will","just","don","don't","should","should've","now","d","ll","m","o","re","ve","y","ain","aren","aren't","couldn","couldn't","didn","didn't","doesn","doesn't","hadn","hadn't","hasn","hasn't","haven","haven't","isn","isn't","ma","mightn","mightn't","mustn","mustn't","needn","needn't","shan","shan't","shouldn","shouldn't","wasn","wasn't","weren","weren't","won","won't","wouldn","wouldn't"};
    return stopwords.find(s) != stopwords.end();
}
//function to remove stop words from the dataset
std::pair<std::vector<std::string>,std::vector<std::string>> Preprocessing::removalOfStopWords(std::pair<std::vector<std::string>,std::vector<std::string>> tokens)
{
std::vector<std::string> s,m;
s = tokens.first;
m = tokens.second;
for(int i=0;i<s.size();i++)
{
if(checkStopWords(s[i]))
{
s.erase(s.begin()+i);
}
}
for(int i=0;i<m.size();i++)
{
if(checkStopWords(m[i]))
{
m.erase(m.begin()+i);
}
}
tokens.first = s;
tokens.second = m;
return tokens;
}
//function to stem words of the subject and message section
std::vector<std::string> Preprocessing::stemWords(std::pair<std::vector<std::string>,std::vector<std::string>> val)
{
std::vector<std::string> s,m;
s = val.first;
m = val.second;
//definning two unordered_set named set1 and set2 of string type
std::unordered_set<std::string> tempSet;
stemming::english_stem<> StemEnglish;
for(int i=0;i<s.size();i++)
{
//storing the string in a temporary variable
std::string temp = s[i];
//converting string to wstring type
std::wstring word(temp.begin(),temp.end());
StemEnglish(word);
std::string temp2(word.begin(),word.end());
//inserting the stemmed string into the set
tempSet.insert(temp2);
}
for(int i=0;i<m.size();i++)
{
//storing the string in a temporary variable
std::string temp = m[i];
//converting string to wstring type
std::wstring word(temp.begin(),temp.end());
StemEnglish(word);
std::string temp2(word.begin(),word.end());
//inserting the stemmed string into the set
tempSet.insert(temp2);
}
std::vector<std::string> vStem;
std::unordered_set<std::string>::iterator it;
for(it = tempSet.begin();it!=tempSet.end();it++)
{
vStem.push_back(*it);
}
return vStem;
}