/
clean-emails.php
133 lines (103 loc) · 4.1 KB
/
clean-emails.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
<?php
// functions
/**
 * Print a progress/diagnostic message on its own line.
 *
 * The message is echoed to standard output, prefixed with "> " and followed
 * by a single space and a newline.
 *
 * @param mixed $msg Text to display; it is converted to a string for output.
 */
function debug($msg)
{
    echo '> ' . $msg . " \n";
}
// vars
// Path of the file to clean; filled from the first command-line argument.
$file = null;
// filename
if (isset($argv) && is_array($argv) && isset($argv[1]))
{
    $file = $argv[1];
}
else
{
    debug('ERROR: You need put filename as argument. Ex: clean-emails.php filename.txt');
    // Non-zero exit status so shells and calling scripts can detect the failure.
    exit(1);
}
// file exists
// is_file() rather than file_exists(): a directory path "exists" but cannot
// be cleaned, so it is rejected here instead of failing later on.
if (!is_file($file))
{
    debug('ERROR: Your file not exists - ' . $file);
    exit(1);
}
// list of emails to remove directly
// Every line containing an address at one of these domains is dropped.
// NOTE(review): '.mailexpire.com' looks like an attempt to cover sub-domains;
// confirm whether it is still needed next to 'mailexpire.com'.
$patternsToRemove = array(
    'klzlk.com', 'nepwk.com', 'pjjkp.com', 'sharklasers.com', 'mailinator.com',
    'emailtemporario.com.br', 'jnxjn.com', 'mailmetrash.com', 'thankyou2010.com',
    'trash2009.com', 'mt2009.com', 'trashymail.com', 'mytrashmail.com',
    '.mailexpire.com', 'mailexpire.com', 'jetable.org', 'tempemail.net',
    'spamfree24.org', 'spamspot.com', 'tempalias.com', 'mailcatch.com',
    'dsadsa.com', 'trashmail.com', 'africamail.com', 'myself.com',
);
// list of emails to correct
// Each group maps a set of misspelled domains ('wrongs') to the domain they
// should be rewritten to ('correct'). Groups and entries are processed in the
// order listed. Entries are unique: a repeated entry only causes a redundant
// pass and inflates the progress counters.
// NOTE(review): some entries ('hotmail.co', 'yahoo.co', 'yahoo.com.b') are
// prefixes of the correct domain, so they are only safe when the matching
// code treats each entry as a complete domain.
$patternsToCorrect = array();
$patternsToCorrect[] = array('correct' => 'hotmail.com', 'wrongs' => array(
    'hotmai.com', 'homail.com', 'hotmal.com', 'hotimail.com', 'hotmailcom',
    'hotmil.com', 'hotmaill.com', 'htomail.com', 'hotmial.com', 'htmail.com',
    'hormail.com', 'hotmeil.com', 'rotmail.com', 'HORTMAIL.COM', 'hotail.com',
    'otmail.com', 'hotmail.co', 'hotemail.com', 'homtail.com', 'hotmail.om',
    'hotrmail.com', 'hoitmail.com', 'hootmail.com', 'hotmailo.com',
    'hotmail.com.com', 'hotmail.cm', 'hotmail.con', 'hotmsil.com', 'hoymail.com',
    'hotmaio.com', '!hotmail.com', 'hotmail.comn', 'hotmail.br', 'hot.mail.com',
    'hotmaol.com', 'hotmail,',
));
$patternsToCorrect[] = array('correct' => 'gmail.com', 'wrongs' => array(
    'gmai.com', 'gamil.com', 'gmil.com', 'gmal.com', 'gmailcom', 'gmail,',
));
$patternsToCorrect[] = array('correct' => 'yahoo.com.br', 'wrongs' => array(
    'yhaoo.com.br', 'yaho.com.br', 'yaoo.com.br', 'yhoo.com.br', 'yahool.com.br',
    'yahoo.com.b', 'ahoo.com.br', 'yahoocom.br', 'yahoo.co.br', 'yahoo,',
));
$patternsToCorrect[] = array('correct' => 'yahoo.com', 'wrongs' => array(
    'yahool.com', 'yaoo.com', 'yaool.com', 'yaho.com', 'yhoo.com', 'ahoo.com',
    'yahoocom', 'yahoo.co',
));
// Load the file once. Every clean-up step below works on this in-memory copy
// and the result is written back in a single step at the end, so the file is
// never left half-processed. Nothing is handed to a shell, which means file
// names containing spaces or shell metacharacters are safe and no external
// tools (cat, grep, sed, wc) are required.
$lines = file($file); // line endings are kept, so they are written back as found
if ($lines === false)
{
    debug('ERROR: Could not read file - ' . $file);
    exit(1);
}
// get number of lines
$linesBefore = count($lines);
// A listed domain only counts as a match when it is the complete domain, i.e.
// when the next character cannot continue a domain name (or the line ends).
// Without this, the wrong domain 'hotmail.co' also matches the valid
// 'hotmail.com' and rewrites it to 'hotmail.comm'.
$domainEnd = '(?![A-Za-z0-9.-])';
// Set as soon as any step modifies $lines; the file is only rewritten then.
$changed = false;
// process to remove invalid emails
debug('Starting removing invalid emails...');
$totalToRemove = count($patternsToRemove);
$qty = 1;
foreach ($patternsToRemove as $pattern)
{
    debug('Processing: ' . $qty . ' of ' . $totalToRemove . '...');
    // preg_quote() makes the domain literal ('.' is a dot, not "any character").
    // Matching stays case-sensitive.
    $regex = '/@' . preg_quote($pattern, '/') . $domainEnd . '/';
    // Lines that do NOT contain the domain are the ones to keep.
    $kept = preg_grep($regex, $lines, PREG_GREP_INVERT);
    if (is_array($kept) && count($kept) < count($lines))
    {
        debug('Pattern with results: ' . $pattern);
        debug('Removing lines with pattern: ' . $pattern);
        // preg_grep() preserves keys; reindex to keep $lines a plain list.
        $lines = array_values($kept);
        $changed = true;
        debug('Removed!');
    }
    else
    {
        debug('Pattern without results: ' . $pattern);
    }
    $qty++;
}
// process to correct wrong email domains
debug('Starting changing wrong domains...');
$totalToCorrect = count($patternsToCorrect);
$qty = 1;
foreach ($patternsToCorrect as $pattern)
{
    debug('Processing: ' . $qty . ' of ' . $totalToCorrect . '...');
    $correct = $pattern['correct'];
    $wrongs = $pattern['wrongs'];
    if (is_array($wrongs) && count($wrongs) > 0)
    {
        // Escape '\' and '$' so the replacement is always inserted literally.
        $replacement = addcslashes('@' . $correct, '\\$');
        $totalWrongs = count($wrongs);
        $qtyWrong = 1;
        foreach ($wrongs as $wrong)
        {
            debug('Changing the wrong domain: ' . $wrong . ' to ' . $correct . ' - ' . $qtyWrong . ' of ' . $totalWrongs . '...');
            $regex = '/@' . preg_quote($wrong, '/') . $domainEnd . '/';
            $hits = 0;
            // Replaces every occurrence on every line; $hits receives the
            // total number of replacements made.
            $fixed = preg_replace($regex, $replacement, $lines, -1, $hits);
            if (is_array($fixed) && $hits > 0)
            {
                debug('E-mail domain with results: ' . $wrong);
                debug('Changing e-mail domain to: ' . $correct);
                $lines = $fixed;
                $changed = true;
                debug('Changed!');
            }
            else
            {
                debug('E-mail domain without results: ' . $wrong);
            }
            $qtyWrong++;
        }
    }
    else
    {
        debug('Invalid wrong domains for: ' . $correct);
    }
    $qty++;
}
// write the cleaned content back, only when something actually changed
if ($changed && file_put_contents($file, implode('', $lines)) === false)
{
    debug('ERROR: Could not write file - ' . $file);
    exit(1);
}
// get number of lines
$linesAfter = count($lines);
// show summary and result
debug('Number of lines before cleanup: ' . (int)$linesBefore);
debug('Number of lines after cleanup: ' . (int)$linesAfter);
debug('SUCCESS - Your file is clean now!');