In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv
/kaggle/input/pyphen/pyphen-0.15.0-py3-none-any.whl
/kaggle/input/preprocessed-data/preprocessed_train.csv
/kaggle/input/preprocessed-data/preprocessed_valid.csv
/kaggle/input/textstat/textstat-0.7.3-py3-none-any.whl
/kaggle/input/xgb-model/xgb_aes_model.pkl
/kaggle/input/xgb-model/vectorizer.pk
/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl


In [2]:
!pip install "/kaggle/input/pyphen/pyphen-0.15.0-py3-none-any.whl"
!pip install "/kaggle/input/textstat/textstat-0.7.3-py3-none-any.whl"
!pip install "/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl"

Processing /kaggle/input/pyphen/pyphen-0.15.0-py3-none-any.whl
Installing collected packages: pyphen
Successfully installed pyphen-0.15.0
Processing /kaggle/input/textstat/textstat-0.7.3-py3-none-any.whl
Installing collected packages: textstat
Successfully installed textstat-0.7.3
Processing /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2


In [3]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr

In [4]:
BASE_PATH = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2'

In [5]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from spellchecker import SpellChecker
from textstat import textstat
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from lightgbm import log_evaluation, early_stopping
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import cohen_kappa_score, accuracy_score

import pickle # To save models

# Ensure you have the required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')




[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>
[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


False

# 1. Feature Engineering

In [6]:
# FEATURE ENGINEERING

def feature_pipeline():
    df = pd.read_csv(f'{BASE_PATH}/train.csv')
    
    train_df, valid_df = train_test_split(df, test_size=0.2, stratify=df["score"])

    print("---------CALCULATING ESSAY LENGTHS---------")
    train_df = calculate_essay_lengths(train_df, "full_text")
    valid_df = calculate_essay_lengths(valid_df, "full_text")
    print("---------FINISHED CALCULATING ESSAY LENGTHS---------\n")

    print("---------ANALYZING SENTIMENTS---------")
    train_df = analyze_sentiment(train_df, "full_text")
    valid_df = analyze_sentiment(valid_df, "full_text")
    print("---------FINISHED ANALYZING SENTIMENTS---------\n")

    print("---------ANALYZING READABILITY---------")
    readability_train_df = analyze_readability(train_df, "full_text")
    readability_valid_df = analyze_readability(valid_df, "full_text")
    train_df["fkg_score"] = readability_train_df["Flesch-Kincaid"]
    train_df["gf_score"] = readability_train_df["Gunning Fog"]
    valid_df["fkg_score"] = readability_valid_df["Flesch-Kincaid"]
    valid_df["gf_score"] = readability_valid_df["Gunning Fog"]
    print("---------FINISHED ANALYZING READABILITY---------\n")

    print("---------ANALYZING LEXICAL DIVERSITY AND SPELLING MISTAKES---------")
    train_df = lexical_diversity_and_mistakes(train_df, "full_text")
    valid_df = lexical_diversity_and_mistakes(valid_df, "full_text")
    print("---------FINISHED ANALYZING LEXICAL DIVERSITY AND SPELLING MISTAKES---------\n")

    print("---------GETTING DIFFICULT WORD COUNT---------")
    train_df = get_difficult_word_count(train_df, "full_text")
    valid_df = get_difficult_word_count(valid_df, "full_text")
    print("---------FINISHED GETTING DIFFICULT WORD COUNT---------\n")

    print("---------STARTING PREPROCESSING---------")
    train_df["full_text"] = train_df["full_text"].apply(preprocess_text)
    valid_df["full_text"] = valid_df["full_text"].apply(preprocess_text)
    print("---------FINISHED PREPROCESSING---------\n")

    print("---------STARTING VECTORIZATION OF TEXTS USING TFIDF---------")
    train_df, valid_df, vectorizer = add_tfidf_features(train_df, valid_df, "full_text")
    print("DONE\n")

    return train_df, valid_df, vectorizer


   
                                                 
# Helper Functions

def calculate_essay_lengths(df, text_column):
    df['char_essay_length'] = df[text_column].apply(len)
    df['words_essay_length'] = df[text_column].apply(lambda x: len(x.split()))
    df['sentence_essay_length'] = df[text_column].apply(lambda x: len(x.split('.')))
    return df

def analyze_sentiment(df, text_column):
    # Initialize the sentiment analyzer
    sid = SentimentIntensityAnalyzer()
    
    # Analyze the sentiment of each essay
    sentiment_scores = df[text_column].apply(lambda x: sid.polarity_scores(x)['compound'])
    
    # Add sentiment scores to the dataframe
    df['sentiment_score'] = sentiment_scores
    
    # Overall sentiment analysis
    positive_count = sum(sentiment_scores > 0)
    negative_count = sum(sentiment_scores < 0)
    neutral_count = len(sentiment_scores) - positive_count - negative_count
    
    # Data for visualization
    categories = ['Positive', 'Negative', 'Neutral']
    counts = [positive_count, negative_count, neutral_count]
    
    
    return df



def analyze_readability(df, text_column):
    # Analyzing readability scores for each essay
    readability_scores = []
    for essay in df[text_column]:
        # Compute Flesch-Kincaid Grade Level
        fkg_score = textstat.flesch_kincaid_grade(essay)
        
        # Compute Gunning Fog Index
        gunning_fog_score = textstat.gunning_fog(essay)
        
        # Add scores to list
        readability_scores.append({'Flesch-Kincaid': fkg_score, 'Gunning Fog': gunning_fog_score})
    
    # Creating a dataframe to store the scores
    readability_df = pd.DataFrame(readability_scores)
    
    return readability_df

# Inspo from https://www.kaggle.com/code/kuangank/ase-fighting
def lexical_diversity_and_mistakes(df, text_column):
    spell_checker = SpellChecker()
    
    lexical_diversities = []
    spelling_mistake_counts = []
    spelling_mistake_ratios = []
    
    for text in df[text_column]:
        tokens = word_tokenize(text)
        unique_tokens = set(tokens)
        
        # Calculate lexical diversity
        if len(tokens) == 0:
            lexical_diversity = 0
        else:
            lexical_diversity = len(unique_tokens) / len(tokens)
        
        # Calculate spelling mistakes
        spelling_mistake_count = len(spell_checker.unknown(token for token in tokens if token.isalpha()))
        
        # Calculate spelling mistake ratio
        if len(tokens) == 0:
            spelling_mistake_ratio = 0
        else:
            spelling_mistake_ratio = spelling_mistake_count / len(tokens)
        
        lexical_diversities.append(lexical_diversity)
        spelling_mistake_counts.append(spelling_mistake_count)
        spelling_mistake_ratios.append(spelling_mistake_ratio)
    
    df['lexical_diversity'] = lexical_diversities
    df['spelling_mistake_count'] = spelling_mistake_counts
    df['spelling_mistake_ratio'] = spelling_mistake_ratios
    
    return df

def get_difficult_word_count(df, text_column):
    difficult_word_counts = []
    difficult_word_ratios = []
    
    for text in df[text_column]:
        tokens = word_tokenize(text)
        
        # Calculate difficult words
        difficult_word_count = textstat.difficult_words(text)
        
        # Calculate difficult word ratio
        if len(tokens) == 0:
            difficult_word_ratio = 0
        else:
            difficult_word_ratio = difficult_word_count / len(tokens)
        
        difficult_word_counts.append(difficult_word_count)
        difficult_word_ratios.append(difficult_word_ratio)
    
    df['difficult_words'] = difficult_word_counts
    df['difficult_word_ratio'] = difficult_word_ratios
    
    return df

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Inspo from: https://www.kaggle.com/code/lebinhthanh/baseline-tfidf-lgbm

def add_tfidf_features(train_df, valid_df, text_column):
    vectorizer = TfidfVectorizer(
        tokenizer=word_tokenize,
        token_pattern=None,
        strip_accents='unicode',
        analyzer='word',
        ngram_range=(1, 3),
        min_df=0.05,
        max_df=0.95,
        sublinear_tf=True,
    )
    
    train_tfidf = vectorizer.fit_transform(train_df[text_column])
    valid_tfidf = vectorizer.transform(valid_df[text_column])
    
    train_dense = train_tfidf.toarray()
    valid_dense = valid_tfidf.toarray()
    
    train_tfidf_df = pd.DataFrame(train_dense, columns=[f'tfid_{i}' for i in range(train_dense.shape[1])])
    valid_tfidf_df = pd.DataFrame(valid_dense, columns=[f'tfid_{i}' for i in range(valid_dense.shape[1])])
    
    train_tfidf_df['essay_id'] = train_df['essay_id'].values
    valid_tfidf_df['essay_id'] = valid_df['essay_id'].values
    
    train_df = train_df.merge(train_tfidf_df, on='essay_id', how='left')
    valid_df = valid_df.merge(valid_tfidf_df, on='essay_id', how='left')
    
    return train_df, valid_df, vectorizer
    

In [7]:
# train_feats, valid_feats, vectorizer = feature_pipeline()

In [8]:
# Save preprocessed/engineered df


# preprocessed_train_df = pre_train.copy()
# preprocessed_valid_df = pre_valid.copy()

# Save
# preprocessed_train_df.to_csv('preprocessed_train.csv',index=False)
# preprocessed_valid_df.to_csv('preprocessed_valid.csv',index=False)



# 2. XGBOOST

In [9]:
# train_feats =  pd.read_csv(f'/kaggle/input/preprocessed-data/preprocessed_train.csv')
# valid_feats = pd.read_csv(f'/kaggle/input/preprocessed-data/preprocessed_valid.csv')

In [10]:
'''train_feats = train_feats.drop(["essay_id", "full_text"], axis = 1)
feature_names = train_feats.columns

valid_feats = valid_feats.drop(["essay_id", "full_text"], axis = 1)
v_feature_names = valid_feats.columns

train_feats["score"] = train_feats["score"] - 1
valid_feats["score"] = valid_feats["score"] - 1'''

'train_feats = train_feats.drop(["essay_id", "full_text"], axis = 1)\nfeature_names = train_feats.columns\n\nvalid_feats = valid_feats.drop(["essay_id", "full_text"], axis = 1)\nv_feature_names = valid_feats.columns\n\ntrain_feats["score"] = train_feats["score"] - 1\nvalid_feats["score"] = valid_feats["score"] - 1'

In [11]:
# Inspiration From: https://www.kaggle.com/code/mobenmo/get-started-xgboost-eda-0-55-score

def Cmatrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings
    """
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(rater_a + rater_b)
    if max_rating is None:
        max_rating = max(rater_a + rater_b)
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0 for i in range(num_ratings)]
                for j in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Returns the counts of each type of rating that a rater made
    """
    if min_rating is None:
        min_rating = min(ratings)
    if max_rating is None:
        max_rating = max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for x in range(num_ratings)]
    for r in ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings


def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa
    axquadratic_weighted_kappa calculates the quadratic weighted kappa
    value, which is a measure of inter-rater agreement between two raters
    that provide discrete numeric ratings.  Potential values range from -1
    (representing complete disagreement) to 1 (representing complete
    agreement).  A kappa value of 0 is expected if all agreement is due to
    chance.
    quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
    each correspond to a list of integer ratings.  These lists must have the
    same length.
    The ratings should be integers, and it is assumed that they contain
    the complete range of possible ratings.
    quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating
    is the minimum possible rating, and max_rating is the maximum possible
    rating
    """
    rater_a = y
    rater_b = y_pred
    min_rating=None
    max_rating=None
    rater_a = np.array(rater_a, dtype=int)
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a) == len(rater_b))
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    conf_mat = Cmatrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += d * conf_mat[i][j] / num_scored_items
            denominator += d * expected_count / num_scored_items

    return 'kappa',(1.0 - numerator / denominator)


In [12]:
'''

# defining an xgboost regressor
# regularisation parameters taken from: https://www.kaggle.com/code/lebinhthanh/baseline-tfidf-lgbm/notebook#Features-engineering
xg_reg = xgb.XGBClassifier(
            objective='multi:softmax',
            num_class = 6,
            tree_method="hist",
            n_estimators=2000,
            learning_rate=0.0075,
            reg_lambda = 0.1,
            reg_alpha = 0.8,
            max_leaves = 17,
            subsample=0.50,
            colsample_bytree=0.50,
            max_bin=4096,
            n_jobs=2,
            feval=quadratic_weighted_kappa,
            #eval_metric = quadratic_weighted_kappa,
            #eval_metric='auc',
            early_stopping_rounds=70,
        )
feature_names = [col for col in train_feats.columns if col not in ['essay_id', 'full_text', 'score']]
xg_reg.fit(train_feats[feature_names], train_feats["score"], 
           eval_set=[(train_feats[feature_names], train_feats["score"]), (valid_feats[feature_names], valid_feats["score"])], 
           verbose=200)
           
'''

'\n\n# defining an xgboost regressor\n# regularisation parameters taken from: https://www.kaggle.com/code/lebinhthanh/baseline-tfidf-lgbm/notebook#Features-engineering\nxg_reg = xgb.XGBClassifier(\n            objective=\'multi:softmax\',\n            num_class = 6,\n            tree_method="hist",\n            n_estimators=2000,\n            learning_rate=0.0075,\n            reg_lambda = 0.1,\n            reg_alpha = 0.8,\n            max_leaves = 17,\n            subsample=0.50,\n            colsample_bytree=0.50,\n            max_bin=4096,\n            n_jobs=2,\n            feval=quadratic_weighted_kappa,\n            #eval_metric = quadratic_weighted_kappa,\n            #eval_metric=\'auc\',\n            early_stopping_rounds=70,\n        )\nfeature_names = [col for col in train_feats.columns if col not in [\'essay_id\', \'full_text\', \'score\']]\nxg_reg.fit(train_feats[feature_names], train_feats["score"], \n           eval_set=[(train_feats[feature_names], train_feats["score"]

In [13]:
# Save the vectorizer and model
#with open("vectorizer.pkl", "wb") as f:
#    pickle.dump(vectorizer, f)

#with open(f"xgb_aes_model.pkl", "wb") as f:
#    pickle.dump(xg_reg, f)

In [14]:
#train_feats.head()

Submission

In [15]:
def xgb_testing_pipeline():
    # Load the test data
    test_df = pd.read_csv(f"{BASE_PATH}/test.csv")

    print("---------CALCULATING ESSAY LENGTHS---------")
    test_df = calculate_essay_lengths(test_df, "full_text")
    print("---------FINISHED CALCULATING ESSAY LENGTHS---------\n")

    print("---------ANALYZING SENTIMENTS---------")
    test_df = analyze_sentiment(test_df, "full_text")
    print("---------FINISHED ANALYZING SENTIMENTS---------\n")

    print("---------ANALYZING READABILITY---------")
    readability_test_df = analyze_readability(test_df, "full_text")
    test_df["fkg_score"] = readability_test_df["Flesch-Kincaid"]
    test_df["gf_score"] = readability_test_df["Gunning Fog"]
    print("---------FINISHED ANALYZING READABILITY---------\n")

    print("---------ANALYZING LEXICAL DIVERSITY AND SPELLING MISTAKES---------")
    test_df = lexical_diversity_and_mistakes(test_df, "full_text")
    print("---------FINISHED ANALYZING LEXICAL DIVERSITY AND SPELLING MISTAKES---------\n")

    print("---------GETTING DIFFICULT WORD COUNT---------")
    test_df = get_difficult_word_count(test_df, "full_text")
    print("---------FINISHED GETTING DIFFICULT WORD COUNT---------\n")

    print("---------STARTING PREPROCESSING---------")
    test_df["full_text"] = test_df["full_text"].apply(preprocess_text)
    print("---------FINISHED PREPROCESSING---------\n")

    print("---------STARTING VECTORIZATION OF TEXTS USING TFIDF---------")
    test_df = add_tfidf_features_to_test(test_df, "full_text")
    print("DONE\n")

    # Load the trained model
    with open(f"/kaggle/input/xgb-model/xgb_aes_model.pkl", "rb") as f:
        xg_reg = pickle.load(f)

    # Generating predictions
    feature_names = [col for col in test_df.columns if col not in ['essay_id', 'full_text']]  # Exclude non-feature columns
    predictions = xg_reg.predict(test_df[feature_names])

    return predictions

def add_tfidf_features_to_test(test_df, text_column):
    # Load the fitted TF-IDF vectorizer from your training phase
    with open(f"/kaggle/input/xgb-model/vectorizer.pk", "rb") as f:
        tfidf_vectorizer = pickle.load(f)

    # Transform the test text data using the loaded vectorizer
    test_tfidf = tfidf_vectorizer.transform(test_df[text_column])
    
    # Convert to dense matrix
    test_dense = test_tfidf.toarray()
    
    # Convert to DataFrame
    test_tfidf_df = pd.DataFrame(test_dense, columns=[f'tfid_{i}' for i in range(test_dense.shape[1])])
    
    # Add essay IDs for merging
    test_tfidf_df['essay_id'] = test_df['essay_id'].values
    
    # Merge with original data
    test_df = test_df.merge(test_tfidf_df, on='essay_id', how='left')
    
    return test_df

# Run the testing pipeline
sub_df = pd.read_csv(f"{BASE_PATH}/test.csv")[["essay_id"]].copy()
sub_df["score"] = xgb_testing_pipeline()

# Save the submission DataFrame to a CSV file
sub_df.to_csv('submission.csv', index=False)

# Display the first 2 rows of the submission DataFrame
print(sub_df.head())
    

---------CALCULATING ESSAY LENGTHS---------
---------FINISHED CALCULATING ESSAY LENGTHS---------

---------ANALYZING SENTIMENTS---------
---------FINISHED ANALYZING SENTIMENTS---------

---------ANALYZING READABILITY---------
---------FINISHED ANALYZING READABILITY---------

---------ANALYZING LEXICAL DIVERSITY AND SPELLING MISTAKES---------
---------FINISHED ANALYZING LEXICAL DIVERSITY AND SPELLING MISTAKES---------

---------GETTING DIFFICULT WORD COUNT---------
---------FINISHED GETTING DIFFICULT WORD COUNT---------

---------STARTING PREPROCESSING---------
---------FINISHED PREPROCESSING---------

---------STARTING VECTORIZATION OF TEXTS USING TFIDF---------
DONE

  essay_id  score
0  000d118      2
1  000fe60      2
2  001ab80      3
