# Extracting Features Manually

## Import Libraries

In [1]:
import pandas as pd

## Import Data File

In [2]:
with open('One.txt') as file:
    text = file.read()

In [3]:
text

'This is a story about dogs\nour canine pets\nDogs are furry animals\n'

In [4]:
print(text)

This is a story about dogs
our canine pets
Dogs are furry animals



In [5]:
with open('One.txt') as file:
    text_readlines = file.readlines()

In [6]:
text_readlines

['This is a story about dogs\n',
 'our canine pets\n',
 'Dogs are furry animals\n']

## Different Functions / Methods

In [7]:
text.lower()

'this is a story about dogs\nour canine pets\ndogs are furry animals\n'

In [8]:
text.lower().split()

['this',
 'is',
 'a',
 'story',
 'about',
 'dogs',
 'our',
 'canine',
 'pets',
 'dogs',
 'are',
 'furry',
 'animals']

## Building Vocabulary

In [9]:
with open('One.txt') as file:
    first_file_words = file.read().lower().split()
    unique_first_file_words = set(first_file_words) 

In [10]:
unique_first_file_words

{'a',
 'about',
 'animals',
 'are',
 'canine',
 'dogs',
 'furry',
 'is',
 'our',
 'pets',
 'story',
 'this'}

In [11]:
with open('Two.txt') as file:
    second_file_words = file.read().lower().split()
    unique_second_file_words = set(second_file_words) 

In [12]:
unique_second_file_words

{'a',
 'about',
 'catching',
 'fun',
 'is',
 'popular',
 'sport',
 'story',
 'surfing',
 'this',
 'water',
 'waves'}

In [13]:
all_unique_words = set()
all_unique_words.update(unique_first_file_words)
all_unique_words.update(unique_second_file_words)

In [14]:
all_unique_words

{'a',
 'about',
 'animals',
 'are',
 'canine',
 'catching',
 'dogs',
 'fun',
 'furry',
 'is',
 'our',
 'pets',
 'popular',
 'sport',
 'story',
 'surfing',
 'this',
 'water',
 'waves'}

### Assigning Numbers to Each Words

In [15]:
vocabulary = dict()

count = 0

for word in all_unique_words:
    vocabulary[word] = count
    count = count + 1

In [16]:
vocabulary

{'waves': 0,
 'sport': 1,
 'dogs': 2,
 'our': 3,
 'a': 4,
 'animals': 5,
 'canine': 6,
 'are': 7,
 'is': 8,
 'this': 9,
 'about': 10,
 'furry': 11,
 'water': 12,
 'catching': 13,
 'fun': 14,
 'surfing': 15,
 'popular': 16,
 'story': 17,
 'pets': 18}

### Creating Empty List for Vocabularies

In [17]:
first_frequency = [0] * len(vocabulary)
second_frequency = [0] * len(vocabulary)
all_words = [''] * len(vocabulary)

In [18]:
first_frequency

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [19]:
for word in first_file_words:
    word_index = vocabulary[word]
    first_frequency[word_index] += 1

In [20]:
first_frequency

[0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1]

In [21]:
for word in second_file_words:
    word_index = vocabulary[word]
    second_frequency[word_index] += 1

In [22]:
second_frequency

[1, 1, 0, 0, 1, 0, 0, 0, 3, 1, 1, 0, 1, 1, 1, 2, 1, 1, 0]

In [23]:
for word in vocabulary:
    word_index = vocabulary[word]
    all_words[word_index] = word

In [24]:
all_words

['waves',
 'sport',
 'dogs',
 'our',
 'a',
 'animals',
 'canine',
 'are',
 'is',
 'this',
 'about',
 'furry',
 'water',
 'catching',
 'fun',
 'surfing',
 'popular',
 'story',
 'pets']

## Creating Final Dataframe

In [25]:
bag_of_words = pd.DataFrame(data=[first_frequency, second_frequency], columns=all_words)
bag_of_words

Unnamed: 0,waves,sport,dogs,our,a,animals,canine,are,is,this,about,furry,water,catching,fun,surfing,popular,story,pets
0,0,0,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1,1
1,1,1,0,0,1,0,0,0,3,1,1,0,1,1,1,2,1,1,0
