In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.model_selection import train_test_split
import re                                  # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import word_tokenize   # module for tokenizing strings
from tqdm import tqdm
import plotly.express as px
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [2]:
DATA_DIR = "../input/feedback-prize-english-language-learning/"
os.listdir(DATA_DIR)

['sample_submission.csv', 'train.csv', 'test.csv']

In [3]:
train_data = pd.read_csv(DATA_DIR+'train.csv')
train_data.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [4]:
test_data = pd.read_csv(DATA_DIR+'test.csv')
test_data.head()

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


# Split into train and validation sets
- To validate our models and hypothesis before we can test

In [5]:

# train_df, test_df = train_test_split(
#     train_data, train_size=0.8, random_state=42)

# EDA 
We want to understand the scores of each metric in the evaluation criteria.

## Visualize vocab score for the essays

In [6]:
# vocab_subset = train_df[['full_text', 'vocabulary']]
# vocab_subset.shape

In [7]:
def remove_punctuation(text):
    return "".join(["" if ch in string.punctuation else ch.lower() for ch in text])
stopwords_english = set(stopwords.words('english'))
def clean_words(headline):
    return [
    word for word in headline
    if word not in stopwords_english
]  
stemmer = PorterStemmer()
def words_stems(headline):
    return [
    stemmer.stem(word) for word in headline
]
def tokenize_text(text):
    return word_tokenize(text)
def remove_numbers(text):
    return re.sub("[^a-zA-Z]", " ", text)

In [8]:
train_data['full_text'] = train_data['full_text'].apply(remove_punctuation).apply(remove_numbers).apply(tokenize_text).apply(clean_words)
train_data.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,"[think, students, would, benefit, learning, ho...",3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,"[problem, change, let, best, matter, happening...",2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"[dear, principal, u, change, school, policy, g...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,"[best, time, life, become, agree, greatest, ac...",4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,"[small, act, kindness, impact, people, change,...",2.5,3.0,3.0,3.0,2.5,2.5


In [9]:
corpus = []

In [10]:
for index, row in tqdm(train_data.iterrows()):
    vocab = " ".join([token for token in row['full_text']])
    corpus.append(vocab)

3911it [00:00, 13918.71it/s]


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB

In [13]:
vectorizer = TfidfVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             ngram_range = (1,2),
                             max_features = 5000) 

In [14]:
X = vectorizer.fit_transform(corpus)

In [15]:
encoded_features = X.toarray()

In [16]:
encoded_features.shape

(3911, 5000)

In [17]:
vectorizer.get_feature_names_out()

array(['abilities', 'ability', 'able', ..., 'youve', 'zone', 'zoo'],
      dtype=object)

In [18]:
y_train = train_data[['cohesion', 'syntax','vocabulary','phraseology','grammar','conventions']]
model = MultiOutputRegressor(Ridge(random_state=123))
model.fit(encoded_features, y_train)

MultiOutputRegressor(estimator=Ridge(random_state=123))

In [19]:
test_data['full_text'] = test_data['full_text'].apply(remove_punctuation).apply(remove_numbers).apply(tokenize_text).apply(clean_words)
test_data.head()

Unnamed: 0,text_id,full_text
0,0000C359D63E,"[person, experience, job, always, going, good,..."
1,000BAD50D026,"[think, students, would, benefit, able, attend..."
2,00367BB2546B,"[thomas, jefferson, states, wonderful, much, g..."


In [20]:
#corpus_valid = []
corpus_test = []

In [21]:
# for index, row in tqdm(test_df.iterrows()):
#     vocab = " ".join([token for token in row['full_text']])
#     corpus_valid.append(vocab)

In [22]:
for index, row in tqdm(test_data.iterrows()):
    vocab = " ".join([token for token in row['full_text']])
    corpus_test.append(vocab)

3it [00:00, 1392.84it/s]


In [23]:
#valid_encoded = vectorizer.transform(corpus_valid)
test_features = vectorizer.transform(corpus_test)

In [24]:
#y_test = test_df[['cohesion', 'syntax','vocabulary','phraseology','grammar','conventions']]

In [25]:
#model.score(valid_encoded, y_test)
preds = model.predict(test_features)

In [26]:
preds = np.round((preds*2)) / 2

In [27]:
preds

array([[3. , 3. , 3.5, 3. , 2.5, 3. ],
       [2.5, 2.5, 2.5, 2.5, 2.5, 3. ],
       [3.5, 3.5, 3.5, 3.5, 3.5, 3. ]])

In [28]:
sub_data = pd.read_csv(DATA_DIR+'sample_submission.csv')
sub_data.head()

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.0,3.0,3.0,3.0,3.0,3.0
1,000BAD50D026,3.0,3.0,3.0,3.0,3.0,3.0
2,00367BB2546B,3.0,3.0,3.0,3.0,3.0,3.0


In [29]:
sub_data[['cohesion', 'syntax','vocabulary','phraseology','grammar','conventions']] = preds

In [30]:
sub_data.to_csv('submission.csv',  header=True, index=False)

build a 3 corpus of score greater than 3.5 and less and score greater than 4. Visualize the vocab and try to see if there are some differences

In [31]:
# corpus_cutoff = set()
# corpus_perfect = set()
# corpus_low = set()
# for i, row in tqdm(vocab_subset.iterrows()):
#     if row['vocabulary'] >= 3.5:
#         for word in row['full_text']:
#             corpus_cutoff.add(word)

In [32]:
# for i, row in tqdm(vocab_subset.iterrows()):
#     if row['vocabulary'] < 3.5:
#         for word in row['full_text']:
#             corpus_low.add(word)

In [33]:
# for i, row in tqdm(vocab_subset.iterrows()):
#     if row['vocabulary'] > 4.0:
#         for word in row['full_text']:
#             corpus_perfect.add(word)

In [34]:
# df_words = pd.DataFrame(zip(list(corpus_low), list(corpus_cutoff), list(corpus_perfect)), columns=['low', 'cutoff', 'high'])
# df_words.head()

In [35]:
#df_words.to_csv('vocab.csv')

In [36]:
# plt.subplots(figsize = (8,8))

# wordcloud = WordCloud (
#                     background_color = 'white',
#                     width = 512,
#                     height = 384
#                         ).generate(' '.join(df_words['low']))
# plt.imshow(wordcloud) # image show
# plt.axis('off') # to off the axis of x and y
# plt.show()

In [37]:
# plt.subplots(figsize = (8,8))

# wordcloud = WordCloud (
#                     background_color = 'white',
#                     width = 512,
#                     height = 384
#                         ).generate(' '.join(df_words['high']))
# plt.imshow(wordcloud) # image show
# plt.axis('off') # to off the axis of x and y
# plt.show()