-
Notifications
You must be signed in to change notification settings - Fork 4
/
preprocessing.py
153 lines (95 loc) · 4.12 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 1 16:28:18 2018
@author: Sahit
"""
import re
import glob
import nltk
from multiprocessing import Process, Queue
# Resolve the speech-corpus directory through NLTK's data-path lookup,
# then collect every file inside it for preprocessing.
folder = nltk.data.find('speeches/manmohansingh/')
paths = glob.glob('speeches/manmohansingh/*')
class preprocessor(object):
    """Pre-process a directory of speech transcript files.

    For each file this extracts sentence/word counts, keeps only the
    English tokens, and parses the place and date of the speech out of
    the file path with regexes.  Per-file work is run in a child
    process and results are returned through a multiprocessing queue.
    """

    def __init__(self, paths, folder):
        # paths:  list of transcript file paths to process
        # folder: corpus root understood by nltk's PlaintextCorpusReader
        self.paths = paths
        self.speeches_data = []   # accumulated per-file result dicts
        self.data_folder = folder
        self.words_a = []         # running list of all English words seen

    def individual_processor(self, path, q):
        """Process one speech file and put a result dict on queue *q*.

        Counts sentences and words via an NLTK corpus reader, extracts
        the English-only text, and parses place/date from the path.
        Intended to run in a worker process, hence the queue.
        """
        corpusReader = nltk.corpus.PlaintextCorpusReader(
            self.data_folder,
            path.split('/')[-1])
        sents = corpusReader.sents()
        number_of_sentences = len(sents)
        number_of_words = sum(len(sentence) for sentence in sents)
        # NOTE(review): the unescaped '.' before 'txt' matches any single
        # character, not just a literal dot — confirm whether r'_\.txt$'
        # was intended before tightening it.
        place = re.search(r'_([a-zA-Z]+)_.txt$', path)
        # This pattern only matches one hard-coded absolute layout; for
        # any other path it returns None, so the fields below must be
        # guarded (the original crashed with AttributeError here).
        date = re.search( r'/Users/Sahit/Documents/GitHub/BBC_WorkSpace/PM_Speech_Analysis/Speeches/Speeches_Modi_Demo/\d+_(\d+)_([a-zA-Z]+)_(\d+)', path)
        filename = path.split('/')[-1]
        data = self.data_extracter(path)
        self.words_a.extend(data.split())
        speech = {
            'data': data,
            'filename': filename,
            'word_count': number_of_words,
            'sentence_count': number_of_sentences,
            # guard against empty files (zero sentences) instead of
            # raising ZeroDivisionError
            'average_sentence_length': (
                number_of_words / number_of_sentences
                if number_of_sentences else 0),
        }
        # Fall back to 'NA' when the filename does not carry the field,
        # mirroring the original's intent for 'city' without a bare except.
        speech['city'] = place.group(1) if place else 'NA'
        if date:
            day, month, year = date.group(1), date.group(2), date.group(3)
            speech['date'] = ' '.join([day, month, year])
        else:
            day = month = year = 'NA'
            speech['date'] = 'NA'
        speech['day'] = day
        speech['month'] = month
        speech['year'] = year
        q.put(speech)

    def data_extracter(self, path):
        """Return the file's English words joined by single spaces.

        The original concatenated words with no separator, which made
        the caller's ``data.split()`` see one giant token; joining with
        spaces preserves word boundaries.
        """
        return ' '.join(
            word for word in self.get_dictionary_word_list(path)
            if self.language_identifier(word))

    def language_identifier(self, line):
        """Return 1 if *line*'s highest code point is an ASCII letter.

        Cheap English filter: any token containing a character above
        'z' (e.g. Devanagari) is rejected.  Purely numeric tokens are
        also rejected, since digits sort below 'A'.
        """
        if not line:
            # max('') raises ValueError; an empty token is not English
            return 0
        maxchar = ord(max(line))
        if 65 <= maxchar <= 90:      # 'A'..'Z'
            return 1
        elif 97 <= maxchar <= 122:   # 'a'..'z'
            return 1
        else:
            return 0

    def get_dictionary_word_list(self, filepath):
        """Return every whitespace-separated token in *filepath*."""
        with open(filepath) as f:
            # split() on the whole contents yields all words in the file
            return f.read().split()

    def Multi_Processor(self, paths):
        """Process each path in a child process; return the result dicts.

        Fixes two crashes in the original: ``q = Queue`` bound the class
        itself (never instantiated), and ``preprocessor()`` was called
        without its two required constructor arguments — this instance
        (``self``) is used as the worker target instead.  Processes are
        still run one at a time, matching the original's sequential
        start/join behavior.
        """
        for path in paths:
            q = Queue()  # was ``q = Queue`` — the class, not an instance
            p = Process(target=self.individual_processor, args=(path, q))
            p.start()
            # Fetch the result BEFORE join(): joining first can deadlock
            # if the child blocks flushing a large item into the pipe.
            result = q.get()
            p.join()
            self.speeches_data.append(result)
        return self.speeches_data