# Installing Libraries

Language Check Library

In [None]:
# Java 8 JDK (presumably required by the language-check package installed below).
# `-y` pre-answers apt's confirmation prompt, which a notebook cell cannot answer interactively.
! sudo apt-get install -y openjdk-8-jdk

In [None]:
# Select the Java 8 JRE binary as the system's default `java` alternative.
! sudo update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java

In [None]:
# %pip installs into the environment of the running kernel (a `!pip` shell call may hit another one).
%pip install language-check

Textstat Library

In [None]:
# Explicit magic: a bare `pip install ...` only works while IPython automagic is enabled.
%pip install textstat

VaderSentiment Library

In [None]:
# Explicit magic: a bare `pip install ...` only works while IPython automagic is enabled.
%pip install vaderSentiment

Downloading PunktSentenceTokenizer

In [None]:
import nltk  # needed here: this cell runs before the notebook's main import cell

# Fetch the Punkt sentence-tokenizer models used by the sentence-splitting helpers.
nltk.download('punkt')

Downloading Stopwords module

In [None]:
import nltk  # needed here: this cell runs before the notebook's main import cell

# Fetch the stop-word lists read through `nltk.corpus.stopwords`.
nltk.download('stopwords')

# Importing Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import nltk
import string
import re, collections
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
import language_check
import textstat
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Importing and Visualising the Data

Importing Data

In [None]:
# Load the training set from the Excel file in the working directory.
dataset = pd.read_excel('training_set_rel3.xls')
# Every later cell refers to the frame as `data`; bind that name too so the
# notebook runs top to bottom (`dataset` is kept for anything that still uses it).
data = dataset
data.head()

Checking the dimensions of the data, i.e. the number of rows and columns

In [None]:
# (number of rows, number of columns) of the frame
data.shape

Data Description

In [None]:
# Summary statistics as produced by pandas' default `describe()`
data.describe()

Checking for empty fields

In [None]:
# Per-column count of missing values (`isna` is the same method as `isnull`).
data.isna().sum()

Finding the number of records for each column for each of the eight essay sets

In [None]:
# Non-null record count of every column, broken down by essay set.
data.groupby('essay_set').count()

Copying the columns we need (`essay_set`, `essay`, `domain1_score`) into a new variable so that the original data is not modified

In [None]:
# Independent copy of the three columns used below, so edits never touch `data`.
temp_data = data.loc[:, ['essay_set', 'essay', 'domain1_score']].copy()
temp_data.head()

# Preprocessing : Feature Extraction

Cleaning the text using regular expressions

In [None]:
def process_text(essay):
    """Normalise a raw essay into lowercase text without URLs, digits, @-tags or punctuation.

    Steps, in order:
      1. coerce the input to ``str`` (so non-string cells do not raise),
      2. drop URLs (``http`` followed by any run of non-whitespace),
      3. drop digit runs, then lowercase,
      4. drop ``@``-prefixed tags such as ``@caps1``,
      5. drop every character listed in ``string.punctuation``.

    Parameters
    ----------
    essay : Any
        Raw essay; converted with ``str()``.

    Returns
    -------
    str
        The cleaned text. Whitespace is left exactly as it was.
    """
    text = str(essay)
    text = re.sub(r'http[^\s]*', '', text)        # remove URLs
    text = re.sub(r'[0-9]+', '', text).lower()    # remove numbers, then lowercase
    text = re.sub(r'@[a-z0-9]+', '', text)        # e.g. "@caps1" (digits are already gone here)

    # re.escape is required: left unescaped, the "\]" inside string.punctuation is
    # parsed as an escaped "]" and the backslash itself would never be stripped.
    return re.sub('[%s]+' % re.escape(string.punctuation), '', text)

# Create the `clean_essay` column from the raw `essay` text via process_text.
temp_data['clean_essay'] = temp_data['essay'].apply(process_text)

Here we encode the string as ASCII, dropping any characters that cannot be converted, and then decode it back into a string.

In [None]:
def decode_essay(essay):
    """Return ``essay`` with every non-ASCII character removed.

    The string is encoded to ASCII with the ``'ignore'`` error handler, so
    characters that cannot be encoded are dropped, and the resulting bytes are
    decoded back into a ``str``.
    """
    ascii_bytes = essay.encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')

Splitting the essay into sentences using PunktSentenceTokenizer


In [None]:
def tokenize_essay(essay):
    """Split an essay into sentences and return one word list per sentence.

    The essay is stripped of surrounding whitespace, cut into sentences with the
    English Punkt tokenizer, and each non-empty sentence is turned into a list of
    words by ``convert_essay_to_wordlist``.

    Returns
    -------
    list
        One entry per non-empty sentence, each the value returned by
        ``convert_essay_to_wordlist`` for that sentence.
    """
    trimmed = essay.strip()
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')

    # Empty sentence strings are skipped; everything else is word-tokenized.
    return [
        convert_essay_to_wordlist(sentence)
        for sentence in punkt.tokenize(trimmed)
        if len(sentence) > 0
    ]

Tokenizing the sentences to words

In [None]:
def convert_essay_to_wordlist(sentence):
    """Return the word tokens of ``sentence``.

    Every character other than an ASCII letter or digit is first replaced by a
    space; the result is then split with ``nltk.word_tokenize``.
    """
    alphanumeric_only = re.sub("[^a-zA-Z0-9]", " ", sentence)
    return nltk.word_tokenize(alphanumeric_only)

# Replace each cleaned essay string with its list of word tokens.
# Not idempotent: convert_essay_to_wordlist passes its input to re.sub, so re-running
# this cell once the column already holds lists will fail.
temp_data['clean_essay'] = temp_data['clean_essay'].apply(convert_essay_to_wordlist)

Removing stopwords

In [None]:
def remove_stopwords(text):
    """Return the tokens of ``text`` that are not English stop words.

    Parameters
    ----------
    text : iterable of str
        Word tokens, compared to the stop-word list exactly as given
        (no case folding is applied here).

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Build the lookup once per call instead of calling stopwords.words('english')
    # for every token; a set also makes each membership test O(1).
    english_stopwords = set(stopwords.words('english'))

    return [word for word in text if word not in english_stopwords]

# Drop English stop words from every token list.
temp_data['clean_essay'] = temp_data['clean_essay'].apply(remove_stopwords)

In [None]:
def clean_length(token):
    """Return the items of ``token`` whose length is greater than two, in order."""
    kept = []
    for item in token:
        if len(item) > 2:
            kept.append(item)
    return kept

# Keep only tokens longer than two characters in every token list.
temp_data['clean_essay'] = temp_data['clean_essay'].apply(clean_length)

Calculating Number of sentences

In [None]:
def sent_count(essay):
    """Return the number of sentences ``nltk.sent_tokenize`` finds in ``essay``."""
    return len(nltk.sent_tokenize(essay))

Calculating Number of Words

In [None]:
def word_count(essay):
    """Return the number of word tokens in ``essay``.

    Non-word characters are replaced by spaces before the text is split with
    ``nltk.word_tokenize``.
    """
    # \W matches any character that is not a word character (letter, digit or underscore).
    tokens = nltk.word_tokenize(re.sub(r'\W', ' ', essay))
    return len(tokens)

Sentiment Analysis using VADER

In [None]:
# Shared VADER analyzer, created on first use so it is built once per kernel
# session instead of once per essay.
_SENTIMENT_ANALYZER = None


def sentiment(text):
    """Return the VADER polarity scores of ``text``.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    tuple
        ``(pos, neg, neu, compound)`` taken from the dictionary returned by
        ``SentimentIntensityAnalyzer.polarity_scores``.
    """
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        # Constructing the analyzer is the costly part (per the vaderSentiment
        # docs it loads its lexicon), so the instance is reused across calls.
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()

    scores = _SENTIMENT_ANALYZER.polarity_scores(text)

    return scores['pos'], scores['neg'], scores['neu'], scores['compound']


Flesch Kincaid Grade Level

In [None]:
def flesch_kincaid_grade(text):
    """Return the Flesch-Kincaid grade that ``textstat`` computes for ``text``."""
    return textstat.flesch_kincaid_grade(text)

Language Check

In [None]:
def language_check(text):
    """Return the number of matches that ``tool.check`` reports for ``text``.

    NOTE(review): ``tool`` is a global that is not defined in the cells shown
    here; presumably a LanguageTool instance. Confirm it is created before this
    function is called.
    NOTE(review): this function has the same name as the imported
    ``language_check`` module, so once this cell has run the module can no longer
    be reached under that name.
    """
    matches = tool.check(text)
    return len(matches)