# Cleaning the Dataset

Thinks which should be removed: 

1. Lower-Case everything
2. Abstract (If first Sentence starts with Abstract/Abstarct)
3. Remove Numbers 
4. Remove Whitespaces
4. Remove Abr. 
5. Remove et.al 
6. Remove Sentences with to much noise -> Maybe a function
7. Remove HTTPs: 
8. Remove Empty Doc. 
9. Remove Docs with less then XXX Characters

In [1]:
import re
import os
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize

In [2]:
stopWords = set(stopwords.words('english'))

## RegEx Expressions

In [3]:
re_abstract = re.compile(r"^abstract |Abstract |Summary |summary ")
re_single_chars = re.compile(r'\b\S{1,2}\b') # Use this only if W not in Stop_words
re_numb = re.compile(r"\s\d+\s") # Single standing numbers
re_unkown = re.compile(r"(\s+\S+\d+\S*)|(\s+\S*\d+\S+)") #Chars followed by numb or numb followed by chars
re_formula = re.compile(r"(\w*\W+\d+)") # Formulas without chars
re_whitespace = re.compile(r"/^\s+|\s+$|\s+(?=\s)/g") # Remove Whitespace of everything
re_bracelets = re.compile(r"\([^)]*\)")# Remove Bracelets
re_path = re.compile(r"(\\\\?([^\\/]*[\\/])*)([^\\/]+)$")
re_url = re.compile(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)")

#TODO 
- Multiple following one_letter_words
formula in the end

## Helper Functions

In [4]:
def make_dir(path:str):
    if not os.path.isdir(path):
        os.mkdir(path)

In [5]:
# lower case the whole text
def lower_case(text:str) -> str:
    text_return = ""
    for word in text:
        text_return += word.lower()
    return text_return

In [6]:
def remove_2_grams(text:str) -> str:
    text_return = ""
    for word in word_tokenize(text):
        if word in stopWords:
            if len(word) == 1 and word not in ["a", "i", "o"]:
                text_return += "<var> "
            else:
                text_return += word+" "
        else:
            if len(word) > 2:
                text_return += word+" "
    return text_return

In [7]:
# Dont forget to delete the last element when sentence tokenizer because of SATZZeichen
def calculate_noise(text:str) -> float:
    noisy_characters = re.findall(r"(\b[a-z]+)", text) # All Characters (Only lower cased)
    #print(noisy_characters)
    # find all characters which are not one-letter-word
    one_letter_words = ["i", "a", "o"]
    not_noisy_characters = [x for x in noisy_characters if x in one_letter_words or len(x) > 1]
    #print(len(not_noisy_characters))
    #print(len(word_tokenize(text)))
    return 1 - (len(not_noisy_characters)/len(word_tokenize(text)))

In [8]:
def clean_txt_from_noise(txt:str, noise_threshold = 0.45)->str:
    text = ""
    sentences = sent_tokenize(txt)
    for sentence in sentences: 
        sentence_txt = "".join(sentence)
        #print(sentence_txt)
        #print(calculate_noise(sentence_txt))
        if calculate_noise(sentence_txt) < noise_threshold: 
            text += sentence_txt
    return text

In [9]:
def delete_files_with_unk(txt:str, percentage:float = 0.35)->str:
    text = ""
    sentences = sent_tokenize(txt)
    for sentence in sentences: 
        _len = sentence.count("<unknown>")
        _words = re.findall(r"(\S+)", sentence)
        if _len/len(_words) < percentage:
            text += sentence
    return text

In [10]:
def cleaning_pipe(txt:str)->str:
    text = lower_case(txt)
    # remove paths
    text = re.sub(re_path, "", text)
    text = re.sub(re_url, "", text)
    # Delete all the sentences with to much noise
    text = clean_txt_from_noise(txt)
    # Applying Regex
    text = re.sub(re_bracelets, "", text)
    text = remove_2_grams(text)
    text = re.sub(re_abstract,"",text)
    text = re.sub(re_numb," <unknown> ",text)
    text = re.sub(re_unkown," <unknown> ",text)
    text = re.sub(re_whitespace,"",text)
    text = lower_case(text)
    text = delete_files_with_unk(text)
    return text

In [11]:
def check_len_txt(txt:str, threshold:int = 7) -> bool:
    noisy_characters = re.findall(r"(\b[a-z]+)", txt)
    one_letter_words = ["i", "a", "o"]
    not_noisy_characters = [x for x in noisy_characters if x in one_letter_words or len(x) > 1]
    if len(not_noisy_characters) >= 7:
        return True
    else:
        return False

## Playground

In [12]:
#text = "Summary The paper presents == the results on sequential sowing experiments carried out as a part of the international Theme 1 (Initial experiments) within Photosynthesis programme of IBP/PP. The net assimilation rate (E) and relative growth rate (R) were determined from weekly increments of the total biomass and leaf area in young sunflower plants (variety Pole Star) cultivated during four growing seasons (1967–1970). For fitting the changes in E and R multiple regression was used: E R = E ¯ R ¯ + a 1 T − T ¯ + a 2 S − S ¯ + a 3 F − F ¯ where T means daily air temperature, S – sum of solar radiation (PhAR), F – leaf area ratio and ai – parameters of the regression equation. The significance of single terms in regression equation is tested and their differences in individual years analysed. The more pronounced constancy of prediction is to be found in net assimilation rate than in relative growth rate. The possibilities of generalizing these conclusions are discussed."

In [13]:
text = '''
              serial JL 273226 291210 291723 291743 291782 31 Journal of Verbal Learning and Verbal Behavior JOURNALVERBALLEARNINGVERBALBEHAVIOR 2005-09-14 2005-09-14 2010-04-03T07:14:31 1-s2.0-S0022537166800154 S0022-5371(66)80015-4 S0022537166800154 10.1016/S0022-5371(66)80015-4 S350 S350.1 HEAD-AND-TAIL 1-s2.0-S0022537166X80019 2015-05-14T01:53:18.432515-04:00 0 0 19660401 19660430 1966 2005-09-14T00:00:00Z rawtext articleinfo crossmark dco dateupdated tomb dateloaded datesearch indexeddate issuelist volumelist yearnav articletitlenorm authfirstinitialnorm authfirstsurnamenorm cid cids contenttype copyright dateloadedtxt docsubtype doctype doi eid ewtransactionid hubeid issfirst issn issnnorm itemstage itemtransactionid itemweight openaccess openarchive pg pgfirst pglast pii piinorm pubdateend pubdatestart pubdatetxt pubyr sectiontitle sortorder srctitle srctitlenorm srctype volfirst volissue webpdf webpdfpagecount affil articletitle auth authfirstini authfull authlast footnotes primabst pubtype ref alllist content subj ssids 0022-5371 00225371 5 5 2 2 Volume 5, Issue 2 15 188 192 188 192 196604 April 1966 1966-04-01 1966-04-30 1966 Symposium On Coding And Conceptual Processes In Verbal Learning article fla Copyright © 1966 Published by Elsevier Inc. EVIDENCEFORCODINGPROCESSESDERIVEDCLUSTERINGINFREERECALL COFER C COFER 1965 261 272 C COFER 1965 386 389 C COHEN 1963 368 376 B COHEN 1963 227 234 B DEESE 1959 305 312 J GLANZER 1963 289 299 M GLANZER 1963 301 309 M GONZALEZ 1959 293 320 R MARSHALL 1963 G ORGANIZATIONVERBALMATERIALINFREERECALLEFFECTSPATTERNSASSOCIATIVEOVERLAPCLUSTERING MARSHALL 1963 408 421 G MILLER 1956 81 97 G COFERX1966X188 COFERX1966X188X192 COFERX1966X188XC COFERX1966X188X192XC item S0022-5371(66)80015-4 S0022537166800154 1-s2.0-S0022537166800154 10.1016/S0022-5371(66)80015-4 273226 2010-10-15T10:13:49.671538-04:00 1966-04-01 1966-04-30 1-s2.0-S0022537166800154-main.pdf https://s3.amazonaws.com/prod-ucs-content-store-us-east/content/pii:S0022537166800154/MAIN/application/pdf/940ca9f2178aae6a539669db93b4672e/main.pdf https://s3-eu-west-1.amazonaws.com/prod-ucs-content-store-eu-west/content/pii:S0022537166800154/MAIN/application/pdf/940ca9f2178aae6a539669db93b4672e/main.pdf main.pdf pdf true 407427 MAIN 5 1-s2.0-S0022537166800154-main_1.png https://s3-eu-west-1.amazonaws.com/prod-ucs-content-store-eu-west/content/pii:S0022537166800154/PREVIEW/image/png/858165d5b983c6f94c675c7d48a9c0c6/main_1.png https://s3.amazonaws.com/prod-ucs-content-store-us-east/content/pii:S0022537166800154/PREVIEW/image/png/858165d5b983c6f94c675c7d48a9c0c6/main_1.png main_1.png png 67930 849 656 IMAGE-WEB-PDF 1 JOURNAL OF VERBAL LEARNING AND VERBAL BEHAVIOR 5, 188-192 (1966) Some Evidence for Coding Processes Derived from Clustering in Free Recall* CHARLES N. COFER1 The Pennsylvania State University, University Park, Pennsylvania Clustering in free recall has often been interpreted as arising from use of the category name as a coding response. Data relevant to this interpretation are presented and inter- preted with regard to three free-recall situations in which clustering is observed. It is concluded that category names do not play roles in clustering in the category-clustering situation or in the difference in clustering found for sets of words which comprise all the items of a category as compared with word sets which do not exhaust a category. How- ever, there is evidence that the greater clustering found for categorized pair members than for noncategorized pair members with equal associative overlap may be due to the greater codability of the categorized pairs. Several investigators (Gonzalez and Cofer, 1959; Cohen, 1963a, 1963b; Miller, 1956) have suggested that the reorganization of words in the recall of randomly presented categorized or associated words, which is known as clustering in free recall, and the augmentation of word recall which generally accompanies this reorganization are evidences for coding processes in this situation. Typi- cally, the argument has been that the category names, in the case of word lists made up of items from (usually) four categories, serve as a code. Remembering these category names enables the S readily tO reproduce at recall a large number of the instances; in doing so, he emits related items together, thus displaying clustering. * Symposium on Coding and Conceptual Processes in Verbal Learning, held at the meetings of the American Psychological Association, Los Angeles, Sept., 1964. 1 This is Technical Report No. 4 under Contract Nonr 656(30) between the Office of Naval Research and the Pennsylvania State University. Acknowledg- ment is made to John A. Robinson for the use of his class to obtain the new data reported in the paper and to Elmer Davidson and Darryl Bruce who tabulated the data. In this paper, I shall summarize the results of several experiments and some new data which bear upon this interpretation. In the discussion of these materials, it will be helpful if I first define some terms and sketch more fully some aspects of possible coding opera- tions in the clustering situation. As to terms, the following definitions will be useful. In the first place, category cluster- ing refers to the use of lists composed of items which belong to one or the other of (usually) four mutually exclusive categories, such as animals, occupations, weapons and articles of clothing. High-frequency lists (HF) are lists of this kind which are made up of high-fre- quency associates of these category names and low-frequency lists (LF) are lists composed of low-frequency associates to the category names. The word frequency here refers to frequency of association, not to Thorndike- Lorge frequency, although the two kinds of frequency are often confounded. Random pre- sentation refers to the presentation of a set of items, like those just mentioned, in a se- quence determined from a table of random numbers. Block presentation means that all items of a category are presented one after 188 CODING PROCESSES 189 another, then the items of another category, and so on. Two associative variables may be identified in this situation. One is the associative strength between the category name and the instances; by definition this is high in HF and low in LF lists. The other associative factor is the strength of the associations among the instances themselves. This is prob- ably higher among HF items than among LF items. In considering clustering and word recall, one might argue that coding by the category name should be enhanced by block presenta- tion. When all the items from a category are presented together, the character of the cate- gory and its name should be more obvious than when the items are randomly arranged. This should be true for both HF and LF lists, and, in fact, when asked, Ss are able to identify the category name equally well after block presentation for both I-IF and LF lists (Cofer, Bruce, and Reicher, in press). What do clustering and word recall look like in HF and LF lists after random and block presenta- tion? Evidence for clustering under these conditions was obtained by Cofer et al. (in press, Exp. III). I-IF and LF lists composed of 40 items, belonging to 4 mutually exclusive categories, were presented, one item at a time by means of a memory drum, a single time to Ss, who, after presentation, were asked to recall (by writing) as many items as they could in any order in which the items occurred to them. Three item-presentation intervals (1, 2, or 4sec) were used in different groups. Block presentation augmented clustering of both lists at each interval, but the interaction between block presentation and list frequency was not significant at immediate re- call. However, the tendency to an interaction which was present showed greater augmentation of cluster- ing in the HF than in the LF list, and a possible ceiling effect in the case of the HF list (maximum clustering for the words recalled) probably obscured this interaction if it is present. For word recall, however, the interaction was clear. Here block pre- sentation clearly augments recall for the HF list but has little effect for this measure in the LF list. (These data are shown in Figs. 1 and 2 in the paper by Cofer et al., in press.) Now if the category name is equally avail- able in the HF and LF lists under block pre- sentation, as there is reason to believe that it is, then the differences between the two lists in clustering and word recall would seem to be dependent on other factors, for example, the associative possibilities I have already suggested. It is even possible than S does not use the category name at all. If he recalls a few of the instances he could easily associate others, in the case of the HF list, and perhaps less readily in the case of the LF list. The resulting clustering and word recall could well come out, on this basis, as they do. We have other evidence that without inter-word asso- ciations (Cofer and Bruce, 1965) cate- gorized words do not cluster under random presentation and not much under block presentation. In this experiment, the catego- ries were parts of speech (nouns, adjectives, and verbs), and the items were clear-cut in- stances of each category. College students can classify these items as to part of speech with great accuracy. However, there was little evi- dence that the Ss in the recall experiment actually perceived or used these categories. Let me turn now to some data of Cohen's. Cohen (1963a, 1963b) has compared recall for lists of categorized words which were of two types. In one case, the categories were exhaustive and were composed of either three or four words: blonde, brunette, and redhead form an exhaustive 3-word category, and north, south, east, and west from an exhaus- tive 4-word category. Suitable sets of three words and of four words were developed for the nonexhaustive cases. Cohen found that the exhaustive categories were recalled better and clustered more, on the average, than the nonexhanstive catego- ries. However, he found that the number of categories represented in recall (defined as the occurrence in recall of at least one word from the category) did not differentiate the two kinds of categories. Deese's (1959) measure of inter-item associative strength (I.I.A.S.) 190 COFER clearly showed higher values for the exhaus- tive than for the nonexhaustive sets and cor- related highly with word recall and clustering. It did not correlate with the number of cate- gories recalled, however. Cohen's associative data, which he kindly lent me, were analyzed, in an exploratory way, by means of two measures of association which differ from Deese's measure of inter-item associative strength. Both are described by Marshall and Cofer (1963). One is the Index of Total Association which includes all of the associations which any two words of a set have in common as a proportion of all of the associations the words of the set elicit. The other is the Index of Concept Cohesiveness, which takes account of all those associations elicited in common by all of the words in the set. Neither of these measures was successful in predicting the number of categories recalled by Cohen's Ss. These measures, as well as I.I.A.S., are based on single-word free associations to each item in relative isolation and therefore may not be adequate to Cohen's situation. I have therefore obtained data on some of Cohen's sets in the following way. A test was constructed in which S is asked, for a group of words presented as a group (such as blonde, brunette and redhead), to write a word or phrase which can relate or link the words together. We have tabulated the results from 50 Ss for five of Cohen's 3-word exhaustive, five 3-word nonexhaustive, five 4-word exhaustive and five 4-word nonexhaustive sets, selected more or less at random. The hypothesis was that, in accordance with Glanzer and Clark's (1963, 1964) verbal-loop hypothesis, the exhaustive cate- gory sets should yield more agreement among Ss in the linking words written down, and that perhaps the length of the linking words or phrases would be shorter for them than for the nonexhaustive category sets. This would suggest that the exhaustive sets are more codable than the nonexhaustive sets. For the 3-word sets, the results are the opposite of this hypothesis. For the five exhaustive 3-word sets the mean number of different responses is 21, whereas for the 3-word nonexhaustive sets it is 10.4. The 4-word sets yield results in the other direction and the differences are of about the same size. Thus the 4-word exhaustive sets give a mean of 10.6 different responses, the nonexhaustive sets a mean of 17.8. Mean length of response differs only slightly for exhaustive and nonexhaustive sets, the direction of the differences being again reversed for three- and four-word sets (6.60, Exhaustive, 3-word; 6.04, Non- exhaustive, 3 word; 5.88, Exhaustive, 4-word; 6.16, Nonexhaustive, 4-word). The frequencies of the most common responses, of course, follow the trends in the number of different responses. These data and the analyses by means of the two indices mentioned before do not sug- gest to me a consistent relation between coding responses and the facilitation of recall and clustering in the exhaustive as compared to the nonexhaustive category sets. My guess is that the augmented recall and clustering of the items in the exhaustive sets is entirely due to their high inter-item associative strengths. Recalling one item leads to the other items in the set through the strong associative net- works (and Cohen's demonstration of the "some or none" recall of sets in the present symposium supports this interpretation). Why the number of categories recalled should not differ between exhaustive and non- exhaustive sets is not explicable on the basis of associative and coding factors so far as we have explored them to date. My analyses of category clustering and of recall in the case of exhaustive and nonex- haustive sets of words suggest that coding, in the sense of the category name, is not a pro- cess necessary to postulate to account for the data obtained under the conditions summa- rized. However, there are other data on clus- tering, obtained under conditions in which associative factors are equated, which may be responsive to a coding interpretation. Let me discuss these data before concluding. In an unpublished dissertation, Marshall (1963) studied the clustering of pairs of words and cluster- ing of sets of 4 words. Pairs were divided into those in which both members of a pair fell into a common category (like spider-beetle) and those in which the pair members were not so easily categorized (like mountain-high), referred to as categorized and non- categorized pairs, respectively. The pairs were matched for associative overlap (mutual relatedness or MR index) to determine whether categorization would have an effect on clustering and recall, inde- pendent of association. In the recall experiments, the words were randomized (with some restrictions) and recalled in any order. The MR index is a measure, for a pair of words, which takes account of all the associations elicited in common by the two members of the pair (Marshall and Cofer, 1963). CODING PROCESSES 191 Marshall's experiments involved the presentation of lists of 24 words, one at a time, orally. The Ss wrote their recalls under instructions for free recall. A typical list contained words from 6 categorized and 6 noncategorized pairs, and clustering was scored for the re-pairing during recall of pair members which had been separated at list presentation. Clus- tering was compared for the categorized and non- categorized pair members over a wide range of MR values. The data show that the categorized items cluster significantly more than the noncategorized items, especially at intermediate levels of MR. Word recall is somewhat better, also, for the categorized than for the noncategorized items (see Fig. 10 in Cofer, 1965, for a presentation of these data for the clustering measure). Sheer number of inter-word associations cannot account for the differences in clustering or word recall for categorized and noncategorized items since, at any MR level, the two kinds of pairs have equal associative overlaps. However, I have examined other aspects of old associative data and have obtained new data for the two kinds of pairs in the range of MR values where clustering differences appear (MR levels from zero to 29.9) in an attempt to find differences. The first analysis is based on single-word free- associations to the isolated words. Here is what this analysis shows. For the sets of categorized and non- categorized pairs sampled, there are more different responses in common between two categorized pair members and the category name is often a common associate, though not always. For example, the cate- gory names bird, tool, and water account for 86, 20, and 67%, respectively, of the associative overlap between the pair members in the following pairs: eagle-crow, hammer=chisel, ocean-lake. This is not. so true of other categorized pairs like baby-child, violin-harp, and loot-knee; in the last of these, for example, body accounts for only 2% of the overlap. The noncategorized pairs do not show overlap through an associated category name, as, of course, by definition they cannot. Although this analysis from associations to single words does show differ- ences between categorized and noncategorized word pairs, we collected additional data to secure more information. The second analysis is based on data obtained from Ss who saw each pair as a pair and were asked to write a relating or linking word or phrase. There are no systematic differences in number of different words written, the frequency of the most common response, or length of responses so far as the MR levels are concerned. However, the C and NC dis- tinction is paralleled in these data, as Table 1 shows, TABLE 1 MEAN" NI.~I~BER OF DIFFERENT RESPONSES AND MEAN FREQUENCY OF POPULAR RESPONSES Different Frequency of responses popular responses MR C NC C NC O- 5 18.1 24.5 20.7 10.1 5- 9.9 18.0 26.5 18.5 13.1 10-19.9 15.8 25.5 22.0 13.4 20-29.9 17.8 27.0 19.7 11.8 by differences in the number of different responses and in the frequency of the most popular response. Length of response does not differentiate the C from the NC pairs. As the table indicates, there is a smaller number of different responses and the fre- quency of the popular response is higher for cate- gorized than for noncategorized pairs. The failure of these measures to differentiate among MR levels (and MR is a powerful factor in clustering and word recall) suggests the importance of the associative linkage between the words. How- ever, the results just shown in Table 1 suggest that the categorized items are more codable; i.e., there is more agreement in the words given to link the items, as reflected in the smaller number of words given to the categorized pairs and the higher frequencies of their populars. Coding, then, is perhaps the factor responsible for the augmented clustering and recall shown by the categorized word pairs. I mentioned, also, that Marshall studied 4-word sets. He varied, in such sets, the index of total asso- ciation and the index of concept cohesiveness. For a moderate level of ITA, the index of concept cohe- siveness produces an effect on clustering (it also affects word recall). It is of interest to examine the number of different words and the frequencies of the populars given by Ss asked to provide a linking word for each set of four words for the three levels of ICC at moderate ITA. Again we find a relation- ship. For the two low ICC sets the number of dif- ferent responses is 61, for the two intermediate ICC sets, it is 43, and for the two high ICC sets, it is 15. Clearly, as ICC rises, the number of different link- ing words declines. The frequency of the populars, of course, goes up with ICC: from 16 at low, 39 at intermediate, to 72 at high ICC. Again, length of response is not related to ICC differences. In this paper, I have analyzed three differ- ent free-recall situations in terms of whether a coding response, in the sense of a category name, may be inferred with some legitimacy. My conclusions are that in the four-category 192 COFER list, initially studied by Bousfield, it is prob- able that associations are the dominant factors in clustering and word recall observed; coding by the category name seems fninor, or un- likely, in terms of the analyses made. Simi- larly, in Cohen's work with exhaustive and nonexhaustive categories, associative factors, rather than category names, seem to account for the variance observed. However, in Mar- shall's studies of clustering and recall of cate- gorized and noncategorized pairs, and in the work with sets of four words varying in ICC, the operation of a coding factor (in addition to association) seems highly plausible. If Glanzer and Clark's verbal loop hypothesis requires length of coding response to vary, then the results of these studies offer no sup- port for it. If, on the other hand, number of different responses or frequency of the popu- lar is an appropriate index, then Marshall's results fit the hypothesis. One can probably not generalize too far from these analyses, but I think the evidence is consistent with coding by category name in Marshall's case and is not in Cohen's and the 4-category case. Of course, one could argue that the recall of one (George Miller's sug- gestion) or a few of the items in a category constitutes a code which leads to the genera- tion of other items by association. Thus, the notion of coding could be preserved, but it would not be by means of the category name. We should be clear about what possible processes may be called codes and in what situations the codes apply. Clearly, there must bemultiple ways of coding materials, and the category name is but one of them. REFERENCES COFER, C. N. On some factors in the organizational characteristics of free recall. Amer. Psychologist, 1965, 20, 261-272. COFER, C. N., A~ BRUCE, D. R. Form-class as the basis for clustering in the recall of non-associated words. Y. verb. Learn. verb. Behav., 1965, 4, 386-389. CORER, C. N., BRUCE, D. R., AND REICttER, G. M. Clustering in free recall as a function of certain methodological variables. J. exp. Psychol., in press. COHEN, B. H. An investigation of recoding in free recall. J. exp. Psychol., 1963, 65, 368-376 (a). CO~EN, B. H. Recall of categorized word lists. J. exp. Psychol., 1963, 66, 227-234 (b). DEESE, J. Influence of inter-item associative strength upon immediate free recall. Psychol. Rep., 1959, 5, 305-312. GLANZER, M., AND CLARK, W. H. Accuracy of per- ceptual recall: An analysis of organization. J. verb. Learn. verb. Behav., 1963, 1, 289-299 (a). GLANZER, M., AND CLARK, W. H. The verbal loop hypothesis: Binary numbers. J. verb. Learn. verb. Behav., 1963, 9., 301-309 (b). GONZALEZ, R. C., AND COFER, C. N. Exploratory studies of verbal context by means of clustering in free recall. J. genet. Psychol., 1959, 95, 293- 320. MARSHALL, G. R. The organization oJ verbal material in /ree recall: The effects o/ patterns oJ asso- ciative overlap on clustering. Unpublished Ph.D. dissertation, New York University, 1963. MARSHALL, G. R., AND COFER, C. N. Associative in- dices as measures of word relatedness: A sum- mary and comparison of ten methods. J. verb. Learn. verb. Behav., 1963, 1, 408-421. MILLER, G. A. The magical number seven, plus or minus two: Some limits on our capacity for processing information. Psychol. Rev., 1956, 63, 81-97. (Received October 12, 1964) YVLVB 66800154 S0022-5371(66)80015-4 10.1016/S0022-5371(66)80015-4 * Symposium on Coding and Conceptual Processes in Verbal Learning, held at the meetings of the American Psychological Association, Los Angeles, Sept., 1964. Symposium On Coding And Conceptual Processes In Verbal Learning Some evidence for coding processes derived from clustering in free recall Charles N. Cofer 1 The Pennsylvania State University, University Park, Pennsylvania, USA 1 This is Technical Report No. 4 under Contract Nonr 656(30) between the Office of Naval Research and the Pennsylvania State University. Acknowledgment is made to John A. Robinson for the use of his class to obtain the new data reported in the paper and to Elmer Davidson and Darryl Bruce who tabulated the data. Clustering in free recall has often been interpreted as arising from use of the category name as a coding response. Data relevant to this interpretation are presented and interpreted with regard to three free-recall situations in which clustering is observed. It is concluded that category names do not play roles in clustering in the category-clustering situation or in the difference in clustering found for sets of words which comprise all the items of a category as compared with word sets which do not exhaust a category. However, there is evidence that the greater clustering found for categorized pair members than for noncategorized pair members with equal associative overlap may be due to the greater codability of the categorized pairs. References Cofer, 1965 Cofer C.N. On some factors in the organizational characteristics of free recall Amer. Psychologist 20 1965 261 272 Cofer and Bruce, 1965 Cofer C.N. Bruce D.R. Form-class as the basis for clustering in the recall of non-associated words J. verb. Learn. verb. Behav. 4 1965 386 389 Cofer, Cofer, C. N., Bruce, D. R., and Reicher, G. M. Clustering in free recall as a function of certain methodological variables. J. exp. Psychol., in press. Cohen, 1963 (a) Cohen B.H. An investigation of recoding in free recall J. exp. Psychol. 65 1963 368 376 Cohen, 1963 (b) Cohen B.H. Recall of categorized word lists J. exp. Psychol. 66 1963 227 234 Deese, 1959 Deese J. Influence of inter-item associative strength upon immediate free recall Psychol. Rep. 5 1959 305 312 Glanzer and Clark, 1963 (a) Glanzer M. Clark W.H. Accuracy of perceptual recall: An analysis of organization J. verb. Learn. verb. Behav. 1 1963 289 299 Glanzer and Clark, 1963 (b) Glanzer M. Clark W.H. The verbal loop hypothesis: Binary numbers J. verb. Learn. verb. Behav. 2 1963 301 309 Gonzalez and Cofer, 1959 Gonzalez R.C. Cofer C.N. Exploratory studies of verbal context by means of clustering in free recall J. genet. Psychol. 95 1959 293 320 Marshall, 1963 Unpublished Ph.D. dissertation Marshall G.R. The organization of verbal material in free recall: The effects of patterns of associative overlap on clustering 1963 New York University Marshall and Cofer, 1963 Marshall G.R. Cofer C.N. Associative indices as measures of word relatedness: A summary and comparison of ten methods J. verb. Learn. verb. Behav. 1 1963 408 421 Miller, 1956 Miller G.A. The magical number seven, plus or minus two: Some limits on our capacity for processing information Psychol. Rev. 63 1956 81 97              
            '''

In [14]:
#print(text)

In [15]:
cleaned_text = cleaning_pipe(text)

In [16]:
cleaned_text

"data relevant to this interpretation are presented and inter- preted with regard to three free-recall situations in which clustering is observed.it is concluded that category names do not play roles in clustering in the category-clustering situation or in the difference in clustering found for sets of words which comprise all the items of a category as compared with word sets which do not exhaust a category.how- ever there is evidence that the greater clustering found for categorized pair members than for noncategorized pair members with equal associative overlap may be due to the greater codability of the categorized pairs.several investigators have suggested that the reorganization of words in the recall of randomly presented categorized or associated words which is known as clustering in free recall and the augmentation of word recall which generally accompanies this reorganization are evidences for coding processes in this situation.typi- cally the argument has been that the categ

## Reading the Files

In [17]:
dir_dirty_data = "../datasets/final_datasets/abstract_dataset"
out_dir_clean_data = "../datasets/cleaned_datasets/abstract_dataset"

In [18]:
if not os.path.isdir(out_dir_clean_data):
    os.makedirs(out_dir_clean_data)

In [19]:
%%time
for root, dirs, files in os.walk(dir_dirty_data):
    for _dir in dirs: 
        counter = 0
        for txt_file in [x for x in os.listdir(os.path.join(root, _dir)) if x.endswith((".txt", ".TXT"))]:
    # Class name = dir name
            class_name = _dir
            #Read File
            file_name = os.path.join(root, _dir, txt_file)
            file = open(file_name, "r")
            txt = file.read()
            #File long enough?
            if check_len_txt(txt):
                counter += 1
                text = cleaning_pipe(txt)
                if check_len_txt(text):
                    path = os.path.join(out_dir_clean_data, _dir)
                    file_name = class_name+"_"+str(counter)+".txt"
                    make_dir(path)
                    file_new = open(os.path.join(path, file_name), "w")
                    file_new.write(text)
                    file_new.close()
            file.close()

CPU times: user 4min 47s, sys: 19.5 s, total: 5min 7s
Wall time: 5min 20s


In [60]:
dir_dirty_data = "../datasets/filtered_dataset"
out_dir_clean_data = "../datasets/cleaned_dataset_160119"

In [61]:
if not os.path.isdir(out_dir_clean_data):
    os.makedirs(out_dir_clean_data)

In [62]:
%%time
for root, dirs, files in os.walk(dir_dirty_data):
    for _dir in dirs: 
        counter = 0
        for txt_file in [x for x in os.listdir(os.path.join(root, _dir)) if x.endswith((".txt", ".TXT"))]:
    # Class name = dir name
            class_name = _dir
            #Read File
            file_name = os.path.join(root, _dir, txt_file)
            file = open(file_name, "r")
            txt = file.read()
            #File long enough?
            if check_len_txt(txt):
                counter += 1
                text = cleaning_pipe(txt)
                if check_len_txt(text):
                    path = os.path.join(out_dir_clean_data, _dir)
                    file_name = class_name+"_"+str(counter)+".txt"
                    make_dir(path)
                    file_new = open(os.path.join(path, file_name), "w")
                    file_new.write(text)
                    file_new.close()
            file.close()

CPU times: user 10.3 s, sys: 3.24 s, total: 13.5 s
Wall time: 15.4 s


## Playground

In [98]:
text = '''abstract The magnetic and transport behavior of the Nd0.7Sr0.3MnO3 perovskite, as dependent on sintering time t s,
and sintering temperature T S, has been investigated by X-ray diffraction,
magnetization M(T) and resistivity R(T) measurements.
A strong decrease of the Curie temperature T C with increasing T S has been observed, when the latter is not higher than 1250°C.
The lowest

T C is about 155 K. For T s=1300°C,

the samples show a surprisingly increased T C, of about 231K,
although their XRD patterns are not distinguishable from those of the differently sintered samples.
Peak temperatures T P of the R(T) curves are close to the corresponding T C.
Two different clustering mechanisms are supposed to govern the magnetic and transport behavior, one for T S<1250°C,
the other one for T S>1250°C. 
Evidences for a short-range magnetic ordering established at around 275K are also shown.
'''

In [371]:
text = lower_case(text)

In [372]:
len(word_tokenize(text))

173

In [373]:
calculate_noise("Two different clustering mechanisms are supposed to govern the magnetic and transport behavior, one for T S<1250°C,the other one for T S>1250°C. ")

['different', 'clustering', 'mechanisms', 'are', 'supposed', 'to', 'govern', 'the', 'magnetic', 'and', 'transport', 'behavior', 'one', 'for', 'the', 'other', 'one', 'for']
18
30


0.4

In [374]:
print("Two different clustering mechanisms are supposed to govern the magnetic and transport behavior, one for T S<1250°C,the other one for T S>1250°C. ")

Two different clustering mechanisms are supposed to govern the magnetic and transport behavior, one for T S<1250°C,the other one for T S>1250°C. 


In [210]:
text

'abstract the magnetic and transport behavior of the nd0.7sr0.3mno3 perovskite, as dependent on sintering time t s,\nand sintering temperature t s, has been investigated by x-ray diffraction,\nmagnetization m(t) and resistivity r(t) measurements.\na strong decrease of the curie temperature t c with increasing t s has been observed, when the latter is not higher than 1250°c.\nthe lowest\n\nt c is about 155 k. for t s=1300°c,\n\nthe samples show a surprisingly increased t c, of about 231k,\nalthough their xrd patterns are not distinguishable from those of the differently sintered samples.\npeak temperatures t p of the r(t) curves are close to the corresponding t c.\ntwo different clustering mechanisms are supposed to govern the magnetic and transport behavior, one for t s<1250°c,\nthe other one for t s>1250°c. \nevidences for a short-range magnetic ordering established at around 275k are also shown.\n'

In [38]:
re_abstract = re.compile(r"^abstract |Abstract |Summary |summary ")
re_single_chars = re.compile(r'\b\S{1,2}\b') # Use this only if W not in Stop_words
re_numb = re.compile(r"\s\d+\s")
re_unkown = re.compile(r"(\s+\S+\d+\S*)|(\s+\S*\d+\S+)") #Chars followed by numb or numb followed by chars
re_formula = re.compile(r"(\w*\W+\d+)") # Formulas without chars
re_whitespace = re.compile(r"/^\s+|\s+$|\s+(?=\s)/g")
#re_numb = re.compile(r"(\w+\d+)|(\d+\w+)|(/d)")



In [39]:
text = lower_case(text)
text = remove_2_grams(text)
text = re.sub(re_abstract,"",text)
text = re.sub(re_numb," <numb> ",text)
text = re.sub(re_unkown," <unkw> ",text)
text = re.sub(re_unkown," <formula> ",text)
text = re.sub(re_whitespace," <formula> ",text)

In [40]:
text

'the paper presents the results on sequential sowing experiments carried out as a part of the international theme initial experiments within photosynthesis programme of ibp/pp the net assimilation rate and relative growth rate were determined from weekly increments of the total biomass and leaf area in young sunflower plants variety pole star cultivated during four growing seasons <unkw>  for fitting the changes in and multiple regression was used a <var> <var> a <var> <var> a where <var> means daily air temperature <var> sum of solar radiation phar leaf area ratio and parameters of the regression equation the significance of single terms in regression equation is tested and their differences in individual years analysed the more pronounced constancy of prediction is to be found in net assimilation rate than in relative growth rate the possibilities of generalizing these conclusions are discussed <formula> '

# Notes

In [10]:
print("\tTab")
print(r"\tTab")# raw string

	Tab
\tTab


In [11]:
# Meta Characters = Characters which need to be escaped with \
# . ^$ * + ? { } [] \ \ () 
