In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.model_selection import train_test_split
import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import word_tokenize   # module for tokenizing strings
from tqdm import tqdm
import plotly.express as px
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [3]:
DATA_DIR = "../input/feedback-prize-english-language-learning/"
os.listdir(DATA_DIR)

['sample_submission.csv', 'train.csv', 'test.csv']

In [4]:
train_data = pd.read_csv(DATA_DIR+'train.csv')
train_data.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [5]:
test_data = pd.read_csv(DATA_DIR+'test.csv')
test_data.head()

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


# Split into train and validation sets
- To validate our models and hypothesis before we can test

In [6]:

train_df, test_df = train_test_split(
    train_data, train_size=0.8, random_state=42)

# EDA 
We want to understand the scores of each metric in the evaluation criteria.

## Visualize vocab score for the essays

In [7]:
# vocab_subset = train_df[['full_text', 'vocabulary']]
# vocab_subset.shape

In [8]:
def remove_punctuation(text):
    return "".join(["" if ch in string.punctuation else ch.lower() for ch in text])
stopwords_english = set(stopwords.words('english'))
def clean_words(headline):
    return [
    word for word in headline
    if word not in stopwords_english
]  
stemmer = PorterStemmer()
def words_stems(headline):
    return [
    stemmer.stem(word) for word in headline
]
def tokenize_text(text):
    return word_tokenize(text)
def remove_numbers(text):
    return re.sub("[^a-zA-Z]", " ", text)

In [9]:
train_df['full_text'] = train_df['full_text'].apply(remove_punctuation).apply(remove_numbers).apply(tokenize_text).apply(clean_words)
train_df.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
247,1247CB034EF7,"[wouldnt, want, time, homework, school, wouldn...",4.0,3.5,3.0,3.5,4.0,3.5
1360,68685615FE0C,"[debate, opportunity, offered, schools, studen...",3.5,4.0,3.5,3.5,2.5,3.0
3318,E597A35FA323,"[negative, take, information, books, play, gam...",2.0,2.5,2.5,2.5,2.5,2.5
2337,AD9CEE5A6FFF,"[think, better, talk, people, one, person, bec...",3.0,3.0,3.0,3.0,3.0,2.5
3631,F4C52358CE03,"[reasons, churchills, statement, agree, many, ...",2.5,2.5,2.5,2.0,2.5,2.5


In [17]:
corpus = []

In [18]:
for index, row in tqdm(train_df.iterrows()):
    vocab = " ".join([token for token in row['full_text']])
    corpus.append(vocab)

3128it [00:00, 13295.24it/s]


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score

In [22]:
vectorizer = TfidfVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 

In [23]:
X = vectorizer.fit_transform(corpus)

In [24]:
encoded_features = X.toarray()

In [26]:
encoded_features.shape

(3128, 5000)

In [27]:
vectorizer.get_feature_names_out()

array(['abandoned', 'abilities', 'ability', ..., 'zero', 'zone', 'zoo'],
      dtype=object)

In [30]:
y_train = train_df[['cohesion', 'syntax','vocabulary','phraseology','grammar','conventions']]
model = LinearRegression()
model.fit(encoded_features, y_train)

LinearRegression()

In [32]:
test_df['full_text'] = test_df['full_text'].apply(remove_punctuation).apply(remove_numbers).apply(tokenize_text).apply(clean_words)
test_df.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
1552,772D27D400BB,"[god, possitive, attitude, dont, possitive, ac...",3.0,2.5,2.5,2.0,2.0,2.0
2114,9E8F3C6405CA,"[people, ask, one, person, advice, think, peop...",3.0,2.0,3.0,3.5,3.0,3.0
1965,948771F795EB,"[accomplish, active, always, something, want, ...",4.0,4.0,3.0,4.0,4.0,4.0
3856,FE14D7378CFB,"[agree, disagree, imagination, important, know...",3.0,3.0,3.5,3.0,3.5,3.5
1610,7AAE019F70D6,"[disagree, principal, saying, kids, least, joi...",3.5,3.5,3.5,3.5,3.0,3.5


In [33]:
corpus_valid = []

In [34]:
for index, row in tqdm(test_df.iterrows()):
    vocab = " ".join([token for token in row['full_text']])
    corpus_valid.append(vocab)

783it [00:00, 12643.76it/s]


In [35]:
valid_encoded = vectorizer.transform(corpus_valid)

In [36]:
y_test = test_df[['cohesion', 'syntax','vocabulary','phraseology','grammar','conventions']]

In [37]:
model.score(valid_encoded, y_test)

-1.2867133493181113

build a 3 corpus of score greater than 3.5 and less and score greater than 4. Visualize the vocab and try to see if there are some differences

In [10]:
# corpus_cutoff = set()
# corpus_perfect = set()
# corpus_low = set()
# for i, row in tqdm(vocab_subset.iterrows()):
#     if row['vocabulary'] >= 3.5:
#         for word in row['full_text']:
#             corpus_cutoff.add(word)

In [11]:
# for i, row in tqdm(vocab_subset.iterrows()):
#     if row['vocabulary'] < 3.5:
#         for word in row['full_text']:
#             corpus_low.add(word)

In [12]:
# for i, row in tqdm(vocab_subset.iterrows()):
#     if row['vocabulary'] > 4.0:
#         for word in row['full_text']:
#             corpus_perfect.add(word)

In [13]:
# df_words = pd.DataFrame(zip(list(corpus_low), list(corpus_cutoff), list(corpus_perfect)), columns=['low', 'cutoff', 'high'])
# df_words.head()

In [14]:
#df_words.to_csv('vocab.csv')

In [15]:
# plt.subplots(figsize = (8,8))

# wordcloud = WordCloud (
#                     background_color = 'white',
#                     width = 512,
#                     height = 384
#                         ).generate(' '.join(df_words['low']))
# plt.imshow(wordcloud) # image show
# plt.axis('off') # to off the axis of x and y
# plt.show()

In [16]:
# plt.subplots(figsize = (8,8))

# wordcloud = WordCloud (
#                     background_color = 'white',
#                     width = 512,
#                     height = 384
#                         ).generate(' '.join(df_words['high']))
# plt.imshow(wordcloud) # image show
# plt.axis('off') # to off the axis of x and y
# plt.show()