In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
import tensorflow_hub as hub
import warnings
import re
import csv
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

## Preprocess

In [5]:
# Function to convert  
def listToString(s): 
    
    # initialize an empty string
    str1 = "" 
    
    # traverse in the string  
    for ele in s: 
        str1 += ele
        str1 += " "
    
    # return string  
    return str1

In [6]:
def preprocess_text(sen):
    
    # lemmatization function
    lemma = WordNetLemmatizer()

    # collecting stopwords
    stop_words = set(stopwords.words('english'))
    
    # Removing html tags
    sentence = remove_tags(sen)
    
    # Removing brackets
    sentence = re.sub('\[[^]]*]', '', sentence)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    
    # Removing punctuations

    tokenizer = nltk.RegexpTokenizer(r"\w+")
    sentence = tokenizer.tokenize(sentence)

    # converting to lower case

    sentence = [word.lower() for word in sentence]

    # .........removing stop words and perform lemmatization.............
    tkn = []
    for w in sentence:
        if not w in stop_words:
            x = lemma.lemmatize(w)
            tkn.append(w)
        else:
            pass

    return listToString(tkn)

In [7]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

# Single Word

In [8]:
df_train1 = pd.read_csv("Dataset/Sub-task 1/lcp_single_train.tsv",
                              delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
df_train2 = pd.read_csv("Dataset/Sub-task 1/lcp_single_trial.tsv",
                              delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
df_train2 = df_train2.rename(columns = {"subcorpus":"corpus"})
df_train = df_train2.append(df_train1, ignore_index=True)
label = df_train.columns[len(df_train.columns)-1]

In [9]:
X_sen_train = []
sentences = list(df_train['sentence'])
for sen in sentences:
    X_sen_train.append(preprocess_text(sen))
X_tok_train = []
y_sen_train = []

df_train['token'] = df_train['token'].astype(str)
sentences = list(df_train['token'])
for sen in sentences:
    X_tok_train.append(preprocess_text(sen))

In [10]:
y_sen_train = df_train['complexity'].tolist()
df_test = pd.read_csv("Dataset/Sub-task 1/lcp_single_test_labels.tsv",
                              delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')

In [11]:
X_sen_test = []
sentences = list(df_test['sentence'])
for sen in sentences:
    X_sen_test.append(preprocess_text(sen))
X_tok_test = []
y_sen_test = []

df_test['token'] = df_test['token'].astype(str)
sentences = list(df_test['token'])

for sen in sentences:
    X_tok_test.append(preprocess_text(sen))

In [12]:
y_sen_test = df_test['complexity'].tolist()

## Universal Sentence Encoder

In [13]:
!pip3 install --upgrade tensorflow-gpu
# Install TF-Hub.
!pip3 install tensorflow-hub

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [15]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" 
model = hub.load(module_url)
print ("module %s loaded" % module_url)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


## Extracting Features for training data

In [16]:
x_sen_train = []
for line in X_sen_train:
  x = model([line])[0]
  x.numpy()
  x_sen_train.append(x)

x_tok_train = []
for line in X_tok_train:
  x = model([line])[0]
  x.numpy()
  x_tok_train.append(x)

In [17]:
x_sen_train = np.array(x_sen_train)
x_tok_train = np.array(x_tok_train)
x_sen_train.shape

(8083, 512)

## Extracting Features for testing data

In [18]:
x_sen_test = []
for line in X_sen_test:
  x = model([line])[0]
  x.numpy()
  x_sen_test.append(x)

x_tok_test = []
for line in X_tok_test:
  x = model([line])[0]
  x.numpy()
  x_tok_test.append(x)
'''
x_cor_test = []
for line in X_cor_test:
  x = model([line])[0]
  x.numpy()
  x_cor_test.append(x)
'''

x_sen_test = np.array(x_sen_test)
x_tok_test = np.array(x_tok_test)

In [19]:
x_train = np.concatenate((x_sen_train, x_tok_train), axis=1)
x_test = np.concatenate((x_sen_test, x_tok_test), axis=1)

In [20]:
from sklearn.metrics import mean_absolute_error
from scipy.stats import pearsonr
from sklearn import svm
reg_model = svm.SVR()
reg_model.fit(x_train, y_sen_train)
y_pred = reg_model.predict(x_test)
print('\n***********************************************************************************\n')
print('Pearson corelation and p-value for Single word task:', pearsonr(y_sen_test, y_pred))
print('\n***********************************************************************************\n')


***********************************************************************************

Pearson corelation and p-value for Single word task: (0.6690901121350543, 4.433567314874437e-120)

***********************************************************************************



In [21]:
# saving predictions
warnings.filterwarnings('ignore')
df_test['Predicted Complexity'] = 0.00000
for i in range(len(df_test)):
    df_test['Predicted Complexity'][i] = y_pred[i]

df_final = df_test.drop(columns=['complexity','corpus','sentence','token'])
df_final.to_csv('single_test_predictions.csv',index=False,header=None)

# Multi-Word

In [22]:
df_train1 = pd.read_csv("Dataset/Sub-task 2/lcp_multi_train.tsv",
                              delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
df_train2 = pd.read_csv("Dataset/Sub-task 2/lcp_multi_trial.tsv",
                              delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')
df_train2 = df_train2.rename(columns = {"subcorpus":"corpus"})
df_train_multi = df_train2.append(df_train1, ignore_index=True)
label = df_train_multi.columns[len(df_train_multi.columns)-1]

In [23]:
X_sen_train_multi = []
sentences = list(df_train_multi['sentence'])
for sen in sentences:
    X_sen_train_multi.append(preprocess_text(sen))
X_tok_train_multi = []
y_sen_train_multi = []

df_train_multi['token'] = df_train_multi['token'].astype(str)
sentences = list(df_train_multi['token'])
for sen in sentences:
    X_tok_train_multi.append(preprocess_text(sen))

In [24]:
y_sen_train_multi = df_train_multi['complexity'].tolist()
df_test_multi = pd.read_csv("Dataset/Sub-task 2/lcp_multi_test_labels.tsv",
                              delimiter="\t", quoting=csv.QUOTE_NONE, encoding='utf-8')

In [25]:
X_sen_test_multi = []
sentences = list(df_test_multi['sentence'])
for sen in sentences:
    X_sen_test_multi.append(preprocess_text(sen))
X_tok_test_multi = []
y_sen_test_multi = []

df_test_multi['token'] = df_test_multi['token'].astype(str)
sentences = list(df_test_multi['token'])

for sen in sentences:
    X_tok_test_multi.append(preprocess_text(sen))

In [26]:
y_sen_test_multi = df_test_multi['complexity'].tolist()

## Extracting Features for training data

In [27]:
x_sen_train_multi = []
for line in X_sen_train_multi:
  x = model([line])[0]
  x.numpy()
  x_sen_train_multi.append(x)

x_tok_train_multi = []
for line in X_tok_train_multi:
  x = model([line])[0]
  x.numpy()
  x_tok_train_multi.append(x)

In [28]:
x_sen_train_multi = np.array(x_sen_train_multi)
x_tok_train_multi = np.array(x_tok_train_multi)
x_sen_train_multi.shape

(1616, 512)

## Extracting Features for testing data

In [29]:
x_sen_test_multi = []
for line in X_sen_test_multi:
  x = model([line])[0]
  x.numpy()
  x_sen_test_multi.append(x)

x_tok_test_multi = []
for line in X_tok_test_multi:
  x = model([line])[0]
  x.numpy()
  x_tok_test_multi.append(x)

x_sen_test_multi = np.array(x_sen_test_multi)
x_tok_test_multi = np.array(x_tok_test_multi)

In [30]:
x_train_multi = np.concatenate((x_sen_train_multi, x_tok_train_multi), axis=1)
x_test_multi = np.concatenate((x_sen_test_multi, x_tok_test_multi), axis=1)

In [31]:
from sklearn.metrics import mean_absolute_error
from scipy.stats import pearsonr
from sklearn import svm
reg_model = svm.SVR()
reg_model.fit(x_train_multi, y_sen_train_multi)
y_pred_multi = reg_model.predict(x_test_multi)
pearsonr(y_sen_test_multi, y_pred_multi)
print('\n***********************************************************************************\n')
print('Pearson corelation and p-value for Multi word task:', pearsonr(y_sen_test_multi, y_pred_multi))
print('\n***********************************************************************************\n')


***********************************************************************************

Pearson corelation and p-value for Multi word task: (0.7781217089789983, 1.3233615138572167e-38)

***********************************************************************************



In [32]:
# saving predictions
warnings.filterwarnings('ignore')
df_test_multi['Predicted Complexity'] = 0.00000
for i in range(len(df_test_multi)):
    df_test_multi['Predicted Complexity'][i] = y_pred_multi[i]

df_final_multi = df_test_multi.drop(columns=['complexity','corpus','sentence','token'])
df_final_multi.to_csv('multi_test_predictions.csv',index=False,header=None)