# Notebook 2 - Features Extraction
 Extract various features from the given data and save them to a file.

### Importing all the libraries needed for this notebook

In [1]:
# Import pandas and numpy data structures to handle the data.
import pandas as pd
import numpy as np

# Importing nltk for word tokenization, sentence tokenization, 
# removing stopwords, pos taging, vocabualry richness calcuation.
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import brown
from nltk.tag import pos_tag

from collections import Counter

### Loading the data from the given data set, encoding in latin-1

In [2]:
df = pd.read_csv('data.csv', encoding='latin-1')
df = df.drop('Unnamed: 0',axis=1)
df = df.drop('essay_set',1)
essay_df = df['essay']

In [3]:
df.head()

Unnamed: 0,essay,domain1_score
0,"Dear local newspaper, I think effects computer...",6.0
1,"Dear @CAPS1 @CAPS2, I believe that using compu...",7.0
2,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",5.0
3,"Dear Local Newspaper, @CAPS1 I have found that...",8.0
4,"Dear @LOCATION1, I know having computers has a...",6.0


### 1. Tokenizing essays into sentences 
### 2. Calculating the sentence count and number of tagged words per essay

In [4]:
# Importing the clean_essay function from the functions.py file 
# to clean the data, remove punctuations and handle the words staring with @.
from functions import clean_essay

essay_sentences_list = []
tokenized_sentences = []
tagged_words_count_list = []

for essay in essay_df:
    temp_list = []
    essay_sentences_list.append(sent_tokenize(essay))
    essay, tagged_words_count = clean_essay(essay)
    tagged_words_count_list.append(tagged_words_count)
    
    for token in essay.split():
        temp_list.append(token)
        
    tokenized_sentences.append(temp_list)

In [5]:
# Count the number of sentences in each essay.
sentences_count = []

for sentences in essay_sentences_list:
    sentences_count.append(len(sentences))

### Genaerating the features:

#### The following features will be generated  : -

All the counts are per essay - 

- Word count
- Sentence count
- Long words count( word length > 6)
- Average word length 
- Spelling error count 
- Words to sentences ratio
- Vocabulary richness count
- Noun Count
- Verb Count
- Adverb Count
- Adjective Count


In [6]:
# Importing the yule function from function.py file to calculate vocabulary richness per essay.
from functions import yule

# Storing the set of stopwords
sw = set(stopwords.words('english'))

clean_tokenized_sentences = tokenized_sentences

# Store words from brown corpus to check for spelling errors.
word_list = brown.words()
word_set = set(word_list)

word_count = []
word_length = []
vocab_count = []
long_word_count = []
average_word_length = []
spelling_error_count = []
word_to_sentence_ratio = []

noun_count = []
verb_count = []
adjective_count = []
adverb_count = []

for i in range(essay_df.shape[0]):
    word_length = []
    long_word_counter, spelling_error_counter, domain_words_counter, wrong_word_counter  = 0, 0, 0, 0
    
    # Removing all the stopwords from essays.
    clean_tokenized_sentences[i] = [word for word in clean_tokenized_sentences[i] if word not in sw]
    
    word_count.append(len(clean_tokenized_sentences[i]))
    vocab_count.append(yule(clean_tokenized_sentences[i]))
    
    # Pos tagging all the words in the essay. 
    count = Counter([y for x,y in pos_tag(clean_tokenized_sentences[i])])
    
    # Saving count of various pos tags in lists
    noun_count.append(count['NN'] + count['NNS'] + count['NNPS'] + count['NNP'])
    verb_count.append(count['VB'] + count['VBG'] + count['VBP'] + count['VBN'] + count['VBZ'])
    adjective_count.append(count['JJ'] + count['JJR'])
    adverb_count.append(count['RB'] + count['RBR'] + count['RBS'])
    
    for word in clean_tokenized_sentences[i]:    
        word_length.append(len(word))
            
        # Checking for spelling errors.
        if word not in word_set:      
            spelling_error_counter += 1
        elif len(word) > 6:
            long_word_counter += 1
            
    long_word_count.append(long_word_counter) 
    spelling_error_count.append(spelling_error_counter)    
    
    # Calculating average word length per essay.
    average_word_length.append(round(np.sum(word_length) / float(len(word_length)), 2))
    
    # Calculating number of words per essay to number of sentences per essay ratio.
    word_to_sentence_ratio.append(round(word_count[i] / float(sentences_count[i]), 2))
    

In [7]:
# Updating word count by adding the tagged words which had been removed earlier.
word_count = [sum(x) for x in zip(word_count, tagged_words_count_list)]

### Storing all the features in a data frame:

In [8]:
features = pd.DataFrame(
    {
     'word_count': word_count,
     'sentences_count': sentences_count,
     'average_word_length': average_word_length,
     'long_word_count': long_word_count,
     'spelling_error_count': spelling_error_count,
     'word_to_sentence_ratio': word_to_sentence_ratio,
     'vocab_count': vocab_count,
     'noun_count': noun_count,
     'verb_count': verb_count,
     'adjective_count': adjective_count,
     'adverb_count': adverb_count
    })

In [9]:
features.head()

Unnamed: 0,adjective_count,adverb_count,average_word_length,long_word_count,noun_count,sentences_count,spelling_error_count,verb_count,vocab_count,word_count,word_to_sentence_ratio
0,28,11,5.58,45,71,16,16,38,29.482659,168,10.19
1,23,10,5.67,62,96,20,23,54,20.220089,226,10.8
2,17,2,6.05,51,72,14,4,36,15.286432,139,9.43
3,41,13,6.23,79,123,27,41,56,30.737705,301,9.63
4,25,13,5.94,69,113,30,20,46,20.258941,226,7.4


### Saving the features

Features saved in this file can directly be used for data analysis and model generation.

In [10]:
features.to_csv('features.csv')