-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.py
147 lines (78 loc) · 3.06 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import metrics, ensemble, linear_model, svm
from numpy import log, ones, array, zeros, mean, std, repeat
import numpy as np
import scipy.sparse as sp
import re
import csv
from time import time
DIR_PATH = ""
TRAIN_FILE = DIR_PATH + "train.csv"
BADWORDS_FILE = DIR_PATH + "badwords.txt" # attached with submission
def normalize(f):
f = [x.lower() for x in f]
f = [x.replace("\\n"," ") for x in f]
f = [x.replace("\\t"," ") for x in f]
f = [x.replace("\\xa0"," ") for x in f]
f = [x.replace("\\xc2"," ") for x in f]
#f = [x.replace(","," ").replace("."," ").replace(" ", " ") for x in f]
#f = [x.replace(" "," ") for x in f]
f = [x.replace(" u "," you ") for x in f]
f = [x.replace(" em "," them ") for x in f]
f = [x.replace(" da "," the ") for x in f]
f = [x.replace(" yo "," you ") for x in f]
f = [x.replace(" ur "," you ") for x in f]
#f = [x.replace(" ur "," your ") for x in f]
#f = [x.replace(" ur "," you're ") for x in f]
f = [x.replace("won't", "will not") for x in f]
f = [x.replace("can't", "cannot") for x in f]
f = [x.replace("i'm", "i am") for x in f]
f = [x.replace(" im ", " i am ") for x in f]
f = [x.replace("ain't", "is not") for x in f]
f = [x.replace("'ll", " will") for x in f]
f = [x.replace("'t", " not") for x in f]
f = [x.replace("'ve", " have") for x in f]
f = [x.replace("'s", " is") for x in f]
f = [x.replace("'re", " are") for x in f]
f = [x.replace("'d", " would") for x in f]
#f = [x.replace("outta", "out of") for x in f]
bwMap = loadBW()
for key, value in bwMap.items():
kpad = " " + key + " "
vpad = " " + value + " "
f = [x.replace(kpad, vpad) for x in f]
# stemming
f = [re.subn("ies( |$)", "y ", x)[0].strip() for x in f]
f = [re.subn("s( |$)", " ", x)[0].strip() for x in f]
f = [re.subn("ing( |$)", " ", x)[0].strip() for x in f]
f = [re.subn(" [*$%&#@][*$%&#@]+"," xexp ", x)[0].strip() for x in f]
f = [re.subn(" [0-9]+ "," DD ", x)[0].strip() for x in f]
return f
def loadBW():
f = open(BADWORDS_FILE, "r")
bwMap = dict()
for line in f:
sp = line.strip().lower().split(",")
if len(sp) == 2:
bwMap[sp[0].strip()] = sp[1].strip()
return bwMap
def readCsv(fname, skipFirst=True, delimiter = ","):
reader = csv.reader(open(fname),delimiter=delimiter)
rows = []
count = 1
for row in reader:
if not skipFirst or count > 1:
rows.append(row)
count += 1
return rows
train_data = readCsv(TRAIN_FILE)
train = [x[2] for x in train_data]
f = normalize(train)
print(train[166])
#print(normalize(train[166]))
#print('\n\n\n\n\n\n')
print(f[166])
#vectorizer = CountVectorizer(max_n=mx,min_n=mn,binary=True)
#X_train = vectorizer.fit_transform(f)
#print(X_train)