-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_data.sh
75 lines (57 loc) · 2.88 KB
/
get_data.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/bin/bash
# Author: J. F. P. (Richard) Scholtens
# This program extracts data from the Google Ngram dataset, version 20120701.
# The data within these datasets is formatted in the following way:
# ngram TAB year TAB match_count TAB volume_count NEWLINE
# For example:
# grey 1988 84121 13926
# An empty IFS turns off word splitting of unquoted expansions, so values that
# contain spaces or newlines stay intact when they are stored in variables.
IFS=''

# Google Ngram 1-gram dataset files, one variable per initial letter.
a='googlebooks-eng-all-1gram-20120701-a'
c='googlebooks-eng-all-1gram-20120701-c'
f='googlebooks-eng-all-1gram-20120701-f'
g='googlebooks-eng-all-1gram-20120701-g'
m='googlebooks-eng-all-1gram-20120701-m'

# Table layout used when printing.
# width   : number of divider characters printed per rule line.
# divider : a run of '=' characters, repeated four times so it is longer than
#           the printed width.
# format  : printf format for one table row (label plus three columns).
# header  : the same row format preceded by a newline escape.
width=100
divider===============================
divider="${divider}${divider}${divider}${divider}"
format='%-25s|%19s|%19s|%19s|\n'
header="\n${format}"
# Print "<early> <late>" on one line: the summed match_count (column 3) of all
# lines in a dataset file that contain a word, for the years 1900-1945 and
# 1946-1999 respectively. A period without matching lines yields 0.
# Arguments: $1 - dataset file, $2 - word (matched whole-word, ignoring case)
_period_counts () {
  # One grep pass per word; awk accumulates both periods in the same pass.
  # "%.0f" prints large sums as plain integers and prints 0 for an empty sum.
  grep -wi -e "$2" -- "$1" \
    | awk '
        $2 >= 1900 && $2 <= 1945 { early += $3 }
        $2 >= 1946 && $2 <= 1999 { late += $3 }
        END { printf "%.0f %.0f\n", early, late }'
}

# Append a 2x2 frequency table (British vs. American spelling, 1900-1945 vs.
# 1946-1999) to frequency_tables.out, followed by the output of chi_square.py
# for the same numbers.
# Globals:   header, format, divider, width (read) - table layout
# Arguments: $1 - dataset file
#            $2 - British spelling
#            $3 - American spelling
# Outputs:   appends to frequency_tables.out in the current directory
get_frequencies () {
  local dataset=$1 british=$2 american=$3
  local out=frequency_tables.out
  local counts
  local british_early british_late american_early american_late
  local total_early total_late total_british total_american total_all

  # Split "<early> <late>" with parameter expansion so the result does not
  # depend on the caller's IFS setting.
  counts=$(_period_counts "$dataset" "$british")
  british_early=${counts%% *}
  british_late=${counts##* }
  counts=$(_period_counts "$dataset" "$american")
  american_early=${counts%% *}
  american_late=${counts##* }

  # Each count falls back to 0 on its own variable if the pipeline produced
  # no output at all.
  british_early=${british_early:-0}
  british_late=${british_late:-0}
  american_early=${american_early:-0}
  american_late=${american_late:-0}

  # Column totals (per period), row totals (per spelling) and grand total.
  total_early=$(( british_early + american_early ))
  total_late=$(( british_late + american_late ))
  total_british=$(( british_early + british_late ))
  total_american=$(( american_early + american_late ))
  total_all=$(( total_british + total_american ))

  # header/format are layout strings defined by this script, so they are used
  # as printf formats on purpose. printf reuses the row format for the second
  # group of four arguments, which prints the American row.
  {
    printf '\n### %s/%s ###\n' "$british" "$american"
    printf "$header" "Frequency words" "Before 1945" "After 1945" "Total"
    printf "%${width}.${width}s\n" "$divider"
    printf "$format" \
      "British $british" "$british_early" "$british_late" "$total_british" \
      "American $american" "$american_early" "$american_late" "$total_american"
    printf "%${width}.${width}s\n" "$divider"
    printf "$format" "Total " "$total_early" "$total_late" "$total_all"
    python3 chi_square.py \
      "$total_early" "$total_late" "$total_british" "$total_american" \
      "$total_all" \
      "$british_early" "$british_late" "$american_early" "$american_late" \
      "$british" "$american"
  } >> "$out"
}
# Build one frequency table per spelling pair. Each group of three entries is
# <dataset file> <British spelling> <American spelling>; the loop consumes the
# groups in order. This reuses (and empties) the script's positional
# parameters.
set -- \
  "$a" "Aluminium" "Aluminum" \
  "$c" "Colour" "Color" \
  "$f" "Flexitime" "Flextime" \
  "$g" "Grey" "Gray" \
  "$m" "Mum" "Mom"
while [ "$#" -ge 3 ]; do
  get_frequencies "$1" "$2" "$3"
  shift 3
done