# Memory and Unicode

In [68]:
import csv
import pandas as pd
from collections import Counter

### Exploring binary numbers

In [2]:
# Let's say b is a binary number

b = "10"
print(int(b, 2))

2


In [3]:
base_10_100 = int("100", 2)
base_10_100

4

### Adding binary numbers

In [4]:
a = 9
a += 1
print(a)

10


In [5]:
a = 19
a += 1
a

20

In [6]:
a = 99
a += 1
a

100

In [7]:
def binary_add(a, b):
    """Add two binary numbers given as strings and return the sum as a string.

    Parameters
    ----------
    a, b : str
        Base-2 digit strings (anything ``int(value, 2)`` accepts, including
        an optional leading sign).

    Returns
    -------
    str
        The sum written in base 2 without the ``0b`` prefix; negative sums
        keep their leading ``-``.

    Raises
    ------
    ValueError
        If either argument is not a valid base-2 literal.
    """
    total = int(a, 2) + int(b, 2)
    # format(..., "b") never emits the "0b" prefix, so unlike bin(total)[2:]
    # it stays correct when the sum is negative ("-0b1"[2:] would be "b1").
    return format(total, "b")

In [8]:
b = "1"
c = binary_add(b, "1")
c

'10'

In [9]:
bin(5)

'0b101'

In [10]:
bin(8)

'0b1000'

### Converting binary values to other bases

In [11]:
a = 0
b = "0"

for i in range(0, 10):
    a += 1
    b = binary_add(b, "1")
    print(int(b, 2) == a)

True
True
True
True
True
True
True
True
True
True


In [12]:
base_10_1001 = int("1001",2)
base_10_1001

9

### Converting characters to binary

In [13]:
ord('a')

97

In [14]:
bin(ord('a'))

'0b1100001'

In [15]:
bin(ord('ÿ'))

'0b11111111'

In [16]:
binary_w = bin(ord('w'))
binary_w

'0b1110111'

In [17]:
binary_bracket = bin(ord('}'))
binary_bracket

'0b1111101'

In [18]:
code_point = "မ"
binary_1019 = bin(ord(code_point))
binary_1019 

'0b1000000011001'

### Exploring Unicode

In [19]:
code_point = "⟶"
code_point

'⟶'

In [20]:
ord(code_point)

10230

In [21]:
bin(ord(code_point))

'0b10011111110110'

### Making Unicode strings

In [22]:
a ="\u1019"
b = "မ"
a == b

True

In [23]:
s3 = "hello မ"
s3

'hello မ'

### The Bytes data type

In [24]:
superman = "Clark Kent␦"
superman

'Clark Kent␦'

In [25]:
superman_bytes = "Clark Kent␦".encode("utf-8")
superman_bytes

b'Clark Kent\xe2\x90\xa6'

In [26]:
batman = "Bruce Wayne␦"
batman_bytes = batman.encode("utf-8")
batman_bytes

b'Bruce Wayne\xe2\x90\xa6'

### Hexadecimal conversions

In [27]:
int("F", 16)

15

In [28]:
int("A", 16)

10

In [29]:
def hexadecimal_add(a, b):
    """Add two hexadecimal numbers given as strings and return the sum as a string.

    Parameters
    ----------
    a, b : str
        Base-16 digit strings (anything ``int(value, 16)`` accepts, including
        an optional leading sign).

    Returns
    -------
    str
        The sum in lowercase hexadecimal without the ``0x`` prefix; negative
        sums keep their leading ``-``.

    Raises
    ------
    ValueError
        If either argument is not a valid base-16 literal.
    """
    total = int(a, 16) + int(b, 16)
    # format(..., "x") never emits the "0x" prefix, so unlike hex(total)[2:]
    # it stays correct when the sum is negative ("-0x1"[2:] would be "x1").
    return format(total, "x")

In [30]:
value = "9"
value = hexadecimal_add(value, "1")
value

'a'

In [31]:
hex_ea = hexadecimal_add("ea","2")
hex_ea

'ec'

In [32]:
hex_ef = hexadecimal_add("f","e")
hex_ef

'1d'

### Hex to binary conversion

In [33]:
hex_byte = "â"
ord(hex_byte)

226

In [34]:
int("e2", 16)

226

In [35]:
bin(ord("â"))

'0b11100010'

In [36]:
binary_aa = bin(ord("\xaa"))
binary_aa

'0b10101010'

In [37]:
binary_ab = bin(ord("\xab"))
binary_ab

'0b10101011'

### String and Byte data types

In [38]:
hulk_bytes = "Bruce Banner␦".encode("utf-8")
hulk_bytes

b'Bruce Banner\xe2\x90\xa6'

In [39]:
# bytes.replace() is called with str arguments here, which raises TypeError.
# Catch only that error so an unrelated failure is not reported as a
# TypeError by the message below.
try:
    hulk_bytes.replace("Banner", "")
except TypeError:
    print("TypeError with replacement")

TypeError with replacement


In [40]:
hulk_bytes = b"Bruce Banner"
hulk_bytes

b'Bruce Banner'

In [41]:
hulk_bytes.replace(b"Banner", b"")

b'Bruce '

In [42]:
thor_bytes = b"Thor"
thor_bytes

b'Thor'

### Decoding Bytes to Strings

In [43]:
aquaman_bytes = b"Who knows?"
aquaman = aquaman_bytes.decode("utf-8")
aquaman

'Who knows?'

In [44]:
type(aquaman)

str

In [45]:
morgan_freeman_bytes = b"Morgan Freeman"
morgan_freeman = morgan_freeman_bytes.decode("utf-8")
morgan_freeman

'Morgan Freeman'

### Reading in file data

In [46]:
# Read every row of legislators.csv into a list of lists.
# The context manager closes the file handle once the rows are materialised;
# newline="" is what the csv module documents for files passed to csv.reader.
with open("legislators.csv", "r", encoding="utf-8", newline="") as f:
    csvreader = csv.reader(f)
    legislators = list(csvreader)
legislators[0:5]

[['last_name', 'first_name', 'birthday', 'gender', 'type', 'state', 'party'],
 ['Bassett', 'Richard', '1745-04-02', 'M', 'sen', 'DE', 'Anti-Administration'],
 ['Bland', 'Theodorick', '1742-03-21', '', 'rep', 'VA', ''],
 ['Burke', 'Aedanus', '1743-06-16', '', 'rep', 'SC', ''],
 ['Carroll', 'Daniel', '1730-07-22', 'M', 'rep', 'MD', '']]

In [47]:
legislators_df = pd.DataFrame(legislators)

In [48]:
legislators_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,last_name,first_name,birthday,gender,type,state,party
1,Bassett,Richard,1745-04-02,M,sen,DE,Anti-Administration
2,Bland,Theodorick,1742-03-21,,rep,VA,
3,Burke,Aedanus,1743-06-16,,rep,SC,
4,Carroll,Daniel,1730-07-22,M,rep,MD,


In [49]:
legislators_df.iloc[0,:]

0     last_name
1    first_name
2      birthday
3        gender
4          type
5         state
6         party
Name: 0, dtype: object

In [50]:
legislators_df = pd.DataFrame(legislators[1:], columns=legislators[0])
legislators_df.head()

Unnamed: 0,last_name,first_name,birthday,gender,type,state,party
0,Bassett,Richard,1745-04-02,M,sen,DE,Anti-Administration
1,Bland,Theodorick,1742-03-21,,rep,VA,
2,Burke,Aedanus,1743-06-16,,rep,SC,
3,Carroll,Daniel,1730-07-22,M,rep,MD,
4,Clymer,George,1739-03-16,M,rep,PA,


### Reading in another data file

The data set contains excerpts from CIA memos that detail covert activities. Each row holds the year the statement was made, followed by an excerpt from the memo. The file, `sentences_cia.csv`, is in CSV format.

In [51]:
# Read every row of sentences_cia.csv into a list of lists.
# The context manager closes the file handle once the rows are materialised;
# newline="" is what the csv module documents for files passed to csv.reader.
with open("sentences_cia.csv", "r", encoding="utf-8", newline="") as f:
    csvreader = csv.reader(f)
    sentences_cia = list(csvreader)

In [52]:
sentences_cia[1][0]

'1997'

In [53]:
sentences_cia[1][1]

'The FBI information included that al-Mairi\'s brother "traveled to Afghanistan in 1997-1998 to train in Bin - Ladencamps."'

In [54]:
sentences_cia[9][1]

'\'^^^ Prior to Abu Zubaydah\'s capture, the CIA considered Hassan Ghul a "First Priority Raid Target," based on reporting that: 97470(281317ZMAR02)("InNovember1998, [Muhammad] Atta, [Ramzi] Binalshibh, and [Said] Bahaji moved into the 54 Marienstrasse apartment in Hamburg that became the hub of the Hamburg cell.").'

In [55]:
# Converting sentences_cia to a dataframe: the first parsed row supplies the
# column names and the remaining rows become the data.
# NOTE: this rebinds sentences_cia from a list of rows to a DataFrame, so the
# cell is not idempotent -- re-run the CSV-loading cell before re-running it.

sentences_cia = pd.DataFrame(sentences_cia[1:], columns=sentences_cia[0])

In [56]:
sentences_cia.head()

Unnamed: 0,year,statement,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,1997,The FBI information included that al-Mairi's b...,,,
1,1997,The FBI information included that al-Mairi's b...,,,
2,1997,"For example, on October 12, 2004, another CIA ...",,,
3,1997,"On October 16, 2001, an email from a CTC offic...",,,
4,1997,"For example, on October 12, 2004, another CIA ...",,,


#### Clean up sentences

In [57]:
# The integer codes for all the characters we want to keep:
# the digits 0-9, uppercase A-Z, lowercase a-z, and finally the space.

good_characters = [
    *range(ord("0"), ord("9") + 1),
    *range(ord("A"), ord("Z") + 1),
    *range(ord("a"), ord("z") + 1),
    ord(" "),
]

In [58]:
sentence_15 = sentences_cia["statement"][14]
sentence_15

'"^^\'\'^ There was also CIA reporting in 1998 that KSM was "very close" to On June 12, 2001, it was reported that "Khaled" was actively recruiting people to travel outside Afghanistan, including to the United States where colleagues were reportedly already in the country to meet them, to carry out terrorist-related activities for UBL.'

In [60]:
cleaned_sentence_15_list = [s for s in sentence_15 if ord(s) in good_characters]
cleaned_sentence_15_list[:10]

[' ', 'T', 'h', 'e', 'r', 'e', ' ', 'w', 'a', 's']

In [61]:
cleaned_sentence_15 = "".join(cleaned_sentence_15_list)
cleaned_sentence_15

' There was also CIA reporting in 1998 that KSM was very close to On June 12 2001 it was reported that Khaled was actively recruiting people to travel outside Afghanistan including to the United States where colleagues were reportedly already in the country to meet them to carry out terroristrelated activities for UBL'

In [62]:
# Code points kept by clean_statement: ASCII digits, ASCII letters and space.
# Built once as a frozenset so membership checks are O(1) and the collection
# is not re-created for every row.
_KEPT_CODE_POINTS = frozenset(
    map(ord, "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz ")
)


def clean_statement(row):
    """Return ``row["statement"]`` with every unwanted character removed.

    Only ASCII digits, ASCII letters and the space character are kept;
    punctuation, quotes and any other characters are dropped, and nothing is
    substituted in their place.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"statement"`` that yields a string, such as
        the row passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str
        The filtered statement.
    """
    statement = row["statement"]
    return "".join(ch for ch in statement if ord(ch) in _KEPT_CODE_POINTS)

In [63]:
sentences_cia["cleaned_statement"] = sentences_cia.apply(clean_statement, axis=1)
sentences_cia["cleaned_statement"].head()

0    The FBI information included that alMairis bro...
1    The FBI information included that alMairis bro...
2    For example on October 12 2004 another CIA det...
3    On October 16 2001 an email from a CTC officer...
4    For example on October 12 2004 another CIA det...
Name: cleaned_statement, dtype: object

In [64]:
sentences_cia.head()

Unnamed: 0,year,statement,Unnamed: 3,Unnamed: 4,Unnamed: 5,cleaned_statement
0,1997,The FBI information included that al-Mairi's b...,,,,The FBI information included that alMairis bro...
1,1997,The FBI information included that al-Mairi's b...,,,,The FBI information included that alMairis bro...
2,1997,"For example, on October 12, 2004, another CIA ...",,,,For example on October 12 2004 another CIA det...
3,1997,"On October 16, 2001, an email from a CTC offic...",,,,On October 16 2001 an email from a CTC officer...
4,1997,"For example, on October 12, 2004, another CIA ...",,,,For example on October 12 2004 another CIA det...


#### Converting statements to tokens

In [66]:
combined_statements = " ".join(sentences_cia["cleaned_statement"])

statement_tokens = combined_statements.split(" ")
statement_tokens[:10]

['The',
 'FBI',
 'information',
 'included',
 'that',
 'alMairis',
 'brother',
 'traveled',
 'to',
 'Afghanistan']

#### Filtering the tokens

In [67]:
filtered_tokens = [s for s in statement_tokens if len(s)>=5]
filtered_tokens[:10]

['information',
 'included',
 'alMairis',
 'brother',
 'traveled',
 'Afghanistan',
 '19971998',
 'train',
 'Ladencamps',
 'information']

#### Counting the tokens

In [70]:
filtered_token_counts = Counter(filtered_tokens)
filtered_token_counts

Counter({'information': 375,
         'included': 49,
         'alMairis': 4,
         'brother': 9,
         'traveled': 25,
         'Afghanistan': 39,
         '19971998': 4,
         'train': 6,
         'Ladencamps': 4,
         'example': 41,
         'October': 138,
         'another': 22,
         'detainee': 128,
         'explained': 6,
         'alKuwaiti': 177,
         'guesthouse': 2,
         'operated': 4,
         'Shaykh': 27,
         'alLibi': 6,
         'Zubaydah': 328,
         'email': 104,
         'officer': 39,
         'tracking': 5,
         'since': 26,
         'stated': 96,
         'although': 13,
         'proof': 4,
         'needed': 9,
         'believe': 24,
         'mastermind': 14,
         'behind': 7,
         'attacks': 66,
         'Interrogators': 13,
         'Disagree': 3,
         'Headquarters': 72,
         'About': 13,
         'AlNashiris': 3,
         'Level': 4,
         'Cooperation': 3,
         'Oppose': 3,
         'Continued':

#### Finding the most common tokens

In [73]:
common_tokens = filtered_token_counts.most_common(3)
common_tokens

[('interrogation', 391), ('information', 375), ('REDACTED', 375)]

#### Finding the most common tokens by year

In [74]:
def find_most_common_by_year(year, sentences_cia, top_n=2, min_length=5):
    """Return the most frequent long tokens among one year's cleaned statements.

    Parameters
    ----------
    year
        Value compared with ``==`` against the ``"year"`` column. In this
        notebook the column holds strings, so pass e.g. ``"2002"``.
    sentences_cia : pandas.DataFrame
        Frame with ``"year"`` and ``"cleaned_statement"`` columns.
    top_n : int, optional
        Number of (token, count) pairs to return. Defaults to 2.
    min_length : int, optional
        Minimum token length to be counted. Defaults to 5.

    Returns
    -------
    list of (str, int)
        Up to ``top_n`` pairs ordered from most to least common; an empty
        list when no row matches ``year``.
    """
    year_statements = sentences_cia.loc[
        sentences_cia["year"] == year, "cleaned_statement"
    ]
    # Splitting on a single space yields empty strings wherever removed
    # punctuation left consecutive spaces; skip those explicitly so they are
    # never counted, whatever min_length is.
    tokens = " ".join(year_statements).split(" ")
    counts = Counter(
        token for token in tokens if token and len(token) >= min_length
    )
    return counts.most_common(top_n)

In [75]:
common_2000 = find_most_common_by_year("2000", sentences_cia)
common_2000

[('terrorist', 9), ('Ahmad', 9)]

In [76]:
common_2002 = find_most_common_by_year("2002", sentences_cia)
common_2002

[('interrogation', 275), ('Zubaydah', 252)]

In [77]:
common_2013 = find_most_common_by_year("2013", sentences_cia)
common_2013

[('Response', 196), ('states', 111)]