## L2 Language Learner Classification 

Building a classifier to distinguish English text written by Lang-8 users whose native language (L1) is another European language (French and Spanish) from those written by L1 speakers of East Asian languages (Japanese, Korean, and Mandarin Chinese).

### Importing Libraries

In [1]:
from zipfile import ZipFile
from bs4 import BeautifulSoup

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk

nltk.download('words')
nltk.download("cmudict")

from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.corpus import cmudict

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

import string

[nltk_data] Downloading package words to
[nltk_data]     /Users/snehajhaveri/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/snehajhaveri/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


### Text Extraction

In [2]:
def read_file_from_zip(path="data/lang-8.zip"):
    """
    A generator function which reads the html documents stored
    in a zip file as raw bytes.

    Parameters
    ----------
    path : string
        path to the zip file

    Yields
    ------
    A dictionary with the keys "filename" (the member name inside
    the archive) and "data" (the undecoded bytes of that member)
    """
    # The context manager guarantees that the archive handle is released
    # once the generator is exhausted or closed.
    with ZipFile(path, "r") as archive:
        # The first archive entry is skipped; presumably it is the
        # top-level directory entry -- verify against the archive layout.
        for file in archive.namelist()[1:]:
            yield {
                "filename": file,
                "data": archive.read(file),
            }

In [3]:
def extract_data_from_file(path="data/lang-8.zip"):
    """
    A generator function which parses every html document of the
    zip file and pulls out the entry text and the native language
    of its author.

    Parameters
    ----------
    path : string
        path to the zip file

    Yields
    ------
    A dictionary with the keys "text", "native_lang" and "filename"
    """
    for entry in read_file_from_zip(path):
        # No parser is named here, so BeautifulSoup chooses one itself.
        document = BeautifulSoup(entry["data"])

        # The native language sits in the <li> element whose data-title
        # attribute is "Native language".
        lang_node = document.find("li", attrs={"data-title": "Native language"})
        native_lang = lang_node.get_text().strip()

        # The entry text is taken from the <div> with id "body_show_ori".
        body_node = document.find("div", attrs={"id": "body_show_ori"})
        text = body_node.get_text().strip()

        yield {
            "text": text,
            "native_lang": native_lang,
            "filename": entry["filename"],
        }

### Feature Extraction

#### Text Length

In [5]:
def get_text_length(text):
    """
    Counts the words of a text, leaving punctuation tokens out.
    Clitics are counted as separate words.

    Parameters
    ----------
    text : str
        A text from which we find the number of words

    Returns
    -------
    An int which represents the number of words in the text
    """
    # A token is treated as punctuation when it occurs inside the
    # string.punctuation character sequence (substring membership).
    return sum(
        1 for token in word_tokenize(text) if token not in string.punctuation
    )

#### Lexical Density

In [6]:
def get_lexical_density(text):
    """
    Returns the lexical density of a text, i.e. the share of
    non-punctuation tokens whose POS tag starts with N, V, J or R
    (treated here as the open word classes).

    Parameters
    ----------
    text : str
        A text from which we find the lexical density

    Returns
    -------
    A float which represents the lexical density. 0.0 is returned
    when the text contains no word tokens at all (empty text or
    text made only of whitespace/punctuation).
    """
    if not text:
        return 0.0

    open_class_prefix = {"N", "V", "J", "R"}
    open_class_total = 0
    word_count = 0
    for word, pos in pos_tag(word_tokenize(text)):
        if word in string.punctuation:
            continue
        word_count += 1
        if pos[0] in open_class_prefix:
            open_class_total += 1

    # A non-empty text may still contain no words (e.g. " " or "!"),
    # so guard the division instead of raising ZeroDivisionError.
    if word_count == 0:
        return 0.0
    return open_class_total / word_count

#### Average Sentence Length

In [8]:
def get_average_sentence_length(text):
    """
    Returns the average sentence length of a text, measured in words.
    Does not count punctuations and counts clitics as separate words.

    Parameters
    ----------
    text : str
        A text from which we find the average sentence length

    Returns
    -------
    A float which represents the average sentence length. 0.0 is
    returned when no sentence can be found in the text.
    """
    if not text:
        return 0.0

    # Split into sentences once and reuse the result for the denominator.
    sentences = sent_tokenize(text)
    if not sentences:
        # Guards the division for texts in which no sentence is found.
        return 0.0

    total_words = sum(
        1
        for sentence in sentences
        for word in word_tokenize(sentence)
        if word not in string.punctuation
    )
    return total_words / len(sentences)

#### Average Word Length

In [9]:
def get_average_word_length(text):
    """
    Returns the average word length of a text, measured in characters.
    Does not count punctuations and counts clitics as separate words.

    Parameters
    ----------
    text : str
        A text from which we find the average word length

    Returns
    -------
    A float which represents the average word length. 0.0 is returned
    when the text contains no word tokens at all.
    """
    if not text:
        return 0.0

    word_lengths = [
        len(word)
        for word in word_tokenize(text)
        if word not in string.punctuation
    ]
    # A non-empty text may still contain no words (e.g. " " or "!"),
    # so guard the division instead of raising ZeroDivisionError.
    if not word_lengths:
        return 0.0
    return sum(word_lengths) / len(word_lengths)

In [10]:
# Fixtures shared by the sanity checks below.
s0 = ""
s1 = """I went to the park today. 
I love going there because I always have so much fun. 
I invited some friends but they didn't come. 
That's fine because I met a new person there. 
He had a dog.
"""  # 40 words
s2 = "I have so much work to do today. I am stressed"  # 11 words

# get_text_length
assert isinstance(get_text_length(s0), int), "Must be an interger"
assert get_text_length(s0) == 0, "Empty string must return 0"
assert get_text_length(s1) == 40, "s1 has 40 words"
assert get_text_length(s2) == 11, "s2 has 11 words"
print("get_text_length tests pass")

# get_lexical_density
assert isinstance(get_lexical_density(s0), float), "Must be a float"
assert get_lexical_density(s0) == 0, "Empty string must return 0"
assert get_lexical_density(s1) == 24/40, "24 open class words out of 40"
# s2 has 11 words, so the message refers to 11 rather than 40.
assert get_lexical_density(s2) == 8/11, "8 open class words out of 11"
print("get_lexical_density tests pass")

# get_average_sentence_length
assert isinstance(get_average_sentence_length(s0), float), "Must be a float"
assert get_average_sentence_length(s0) == 0, "Empty string must return 0"
assert get_average_sentence_length(s1) == 40/5, "40 words over the span of 5 sentences"
assert get_average_sentence_length(s2) == 11/2, "11 words over the span 2 sentences"
print("get_average_sentence_length tests pass")

# get_average_word_length
assert isinstance(get_average_word_length(s0), float), "Must be a float"
assert get_average_word_length(s0) == 0, "Empty string must return 0"
assert get_average_word_length(s1) == 142/40, "142 total characters spread across 40 words"
assert get_average_word_length(s2) == 35/11, "35 character spread across 11 words"
print("get_average_word_length tests pass")

get_text_length tests pass
get_lexical_density tests pass
get_average_sentence_length tests pass
get_average_word_length tests pass


#### Part of Speech (POS) Count

In [11]:
def get_pos_count(text):
    """
    Tallies the nouns, verbs and adjectives found in a text.

    Parameters
    ----------
    text : str
        A text for which we find the number of nouns, verbs
        and adjectives

    Returns
    -------
    A tuple of (noun_count: int, verb_count: int, adj_count: int)
    which represents the number of nouns, verbs and adjectives in
    the text respectively
    """
    if not len(text):
        return 0, 0, 0

    nouns = verbs = adjectives = 0
    for _, tag in pos_tag(word_tokenize(text)):
        if tag[0] == 'N':
            # Any tag starting with N is counted as a noun.
            nouns += 1
        elif tag[0] == 'V':
            # Any tag starting with V is counted as a verb.
            verbs += 1
        elif tag == 'JJ':
            # Only the exact tag JJ is counted; other J-prefixed tags
            # are not included.
            adjectives += 1
    return nouns, verbs, adjectives

In [12]:
# Short sample made of five simple sentences.
s1 = """I went to the park today. 
I love going there because I always have so much fun. 
I invited some friends but they didn't come. 
That's fine because I met a new person there. 
He had a dog."""

# Longer, noun-heavy sample.
s2 = """Chelsea English School is offering a Summer School Program in Iwaki, Fukushima, a holiday learning experience combining enjoyment of the area's natural beauty and practical lifestyle immersion in the agricultural traditions of this part of Japan.
We will be hosted by "Namakiba" farm, an agricultural concern run by an Iwaki City cooperative, and activities include handson experience of organic farming,barbecues, local nature sightseeing including swimming in the river and the sea,the local fish market, guesthouses with onsens (hot spas) . The program promises new and fresh experiences in both nature and culture, and time will also be made available for gift shopping. Non-Japanese speakers are also warmly invited, as simultaneous translation into English will be available throughout the e trip."""

# Expected tuples are (nouns, verbs, adjectives).
assert (6, 10, 3) == get_pos_count(s1)
assert (47, 17, 16) == get_pos_count(s2)

print("get_pos_count tests pass")

get_pos_count tests pass


#### Out of Vocabulary Words

In [13]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_vocab():
    """
    Returns the lower-cased entries of the NLTK `words` corpus as a
    frozenset. The set is built on the first call only and reused
    afterwards, since building it for every text is expensive.
    """
    return frozenset(w.lower() for w in nltk.corpus.words.words())


def get_num_ovv_words(text):
    """
    Gets the number of out-of-vocabulary words in a text.

    Only whitespace-separated tokens that are purely alphabetic are
    considered (so a token with attached punctuation is ignored), and
    each distinct lower-cased word is counted once.

    Parameters
    ----------
    text : str
        A text for which the number of out-of-vocabulary
        words is to be found

    Returns
    -------
    The number of distinct oov words in the text
    """
    text_vocab = {w.lower() for w in text.split() if w.isalpha()}
    return len(text_vocab - _english_vocab())

In [14]:
# Fixtures: an empty text, a text with three misspelt words and a
# text with a single misspelt word.
s0 = ""
s1 = """ I haddd to leaasve earliae since yesterday was so tired.
And then I met you.
"""
s2 = "I have so much work to do today. I am stressseed"

assert type(get_num_ovv_words(s0)) is int, "Must be an interger"
assert 0 == get_num_ovv_words(s0), "Empty string must return 0"
assert 3 == get_num_ovv_words(s1), "s1 has 3 words out of vaocab"
assert 1 == get_num_ovv_words(s2), "s2 has 1 words out of vocab"
print("get_num_ovv_words tests pass")

get_num_ovv_words tests pass


#### Reading Ease

In [15]:

# Code adapted from lab

# Letters counted as vowels by the fallback syllable estimate.
vowels = {"a","e","i","o","u","y"}
# CMU pronouncing dictionary: word -> list of pronunciations.
p_dict = cmudict.dict()

def get_reading_ease(text):
    """Returns the reading ease for a text.

    The score is computed as
    206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words).
    Syllables are counted from the first CMUdict pronunciation of a
    word (phonemes ending in a stress digit); words missing from the
    dictionary fall back to counting their vowel letters.

    Parameters
    ----------
    text : str
       A text for which we find the reading ease.

    Returns
    -------
    reading_ease : float
        The reading ease for the text. 0.0 is returned when the text
        contains no words or no sentences.
    """
    syllable_count = 0
    word_count = 0

    for word in word_tokenize(text):
        if word in string.punctuation:
            continue
        word_count += 1
        # CMUdict entries are lower-case, so look words up case-insensitively;
        # otherwise capitalised words would always take the fallback branch.
        lowered = word.lower()
        if lowered in p_dict:
            # Vowel phonemes carry a trailing stress marker (0, 1 or 2).
            syllable_count += sum(
                1 for phoneme in p_dict[lowered][0]
                if phoneme[-1] in ("0", "1", "2")
            )
        else:
            syllable_count += sum(1 for char in lowered if char in vowels)

    sentence_count = len(sent_tokenize(text))
    # Guard both divisions: empty or punctuation-only texts have no words.
    if word_count == 0 or sentence_count == 0:
        return 0.0

    reading_ease = (206.835 - (1.015*(word_count/sentence_count)) - (84.6*(syllable_count/word_count)))
    return reading_ease

In [16]:
# A short sentence of one-syllable words must land in the high band.
assert 140 > get_reading_ease("I am done, man") > 100
# A single long sentence of polysyllabic words must land in the low band.
assert -20 > get_reading_ease("Felicitations for achieving a thoroughly excellent resolution to an altogether indombidable conundrum of humongous proportions.") > -60
print("get_reading_ease tests pass")

get_reading_ease tests pass


#### Punctuation Counts

In [17]:
def get_punctuations_count(text):
    """
    Counts the punctuation tokens of a text.

    Parameters
    ----------
    text : str
        A text for which we find the number of punctuations present

    Returns
    -------
    punct_count: int
                 An integer which represents the number of punctuations in the text
    """
    if not len(text):
        return 0

    # A token counts as punctuation when it occurs inside the
    # string.punctuation character sequence (substring membership).
    return sum(
        1 for token in word_tokenize(text) if token in string.punctuation
    )

In [18]:
# Short sample: five sentences, each closed by a full stop.
s1 = """I went to the park today. 
I love going there because I always have so much fun. 
I invited some friends but they didn't come. 
That's fine because I met a new person there. 
He had a dog."""

# Longer sample with commas, brackets and full stops.
s2 = """Chelsea English School is offering a Summer School Program in Iwaki, Fukushima, a holiday learning experience combining enjoyment of the area's natural beauty and practical lifestyle immersion in the agricultural traditions of this part of Japan.
We will be hosted by "Namakiba" farm, an agricultural concern run by an Iwaki City cooperative, and activities include handson experience of organic farming,barbecues, local nature sightseeing including swimming in the river and the sea,the local fish market, guesthouses with onsens (hot spas) . The program promises new and fresh experiences in both nature and culture, and time will also be made available for gift shopping. Non-Japanese speakers are also warmly invited, as simultaneous translation into English will be available throughout the e trip."""


assert 5 == get_punctuations_count(s1)
assert 16 == get_punctuations_count(s2)

print("get_punctuations_count tests pass")

get_punctuations_count tests pass


#### Type Token Ratio

In [19]:
def get_type_token_ratio(text, num_words=100):
    """
    Calculate the type-token ratio of a text using at most the first
    num_words whitespace-separated tokens. Types are compared
    case-insensitively.

    Parameters
    ----------
    text : str
        A text for which we find the type-token ratio
    num_words : int, optional
        Maximum number of leading tokens taken into account
        (default 100)

    Returns
    -------
    type_token_ratio: float
                    The number of distinct tokens divided by the number of
                    tokens considered; 0.0 when there are no tokens
    """
    tokens = text.split()[:num_words]
    if not tokens:
        return 0.0

    type_set = {token.lower() for token in tokens}
    # Divide by the number of tokens actually looked at, so that texts
    # shorter than num_words are not given an artificially low ratio.
    return len(type_set) / len(tokens)