In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv
/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv
/kaggle/input/libraries/textstat-0.7.3-py3-none-any.whl
/kaggle/input/libraries/pyspellchecker-0.8.1-py3-none-any.whl
/kaggle/input/libraries/pyphen-0.15.0-py3-none-any.whl


In [2]:
!pip install "/kaggle/input/libraries/pyphen-0.15.0-py3-none-any.whl"
!pip install "/kaggle/input/libraries/pyspellchecker-0.8.1-py3-none-any.whl"
!pip install "/kaggle/input/libraries/textstat-0.7.3-py3-none-any.whl"

Processing /kaggle/input/libraries/pyphen-0.15.0-py3-none-any.whl
Installing collected packages: pyphen
Successfully installed pyphen-0.15.0
Processing /kaggle/input/libraries/pyspellchecker-0.8.1-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1
Processing /kaggle/input/libraries/textstat-0.7.3-py3-none-any.whl
Installing collected packages: textstat
Successfully installed textstat-0.7.3


In [3]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr

In [4]:
BASE_PATH = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2'

In [5]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.model_selection import train_test_split
from spellchecker import SpellChecker
from textstat import textstat
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import KFold,StratifiedKFold

import pickle # To save models



# 1. Feature Engineering

In [6]:
# FEATURE ENGINEERING

def feature_pipeline():
    df = pd.read_csv(f'{BASE_PATH}/train.csv')
    test_df = pd.read_csv(f'{BASE_PATH}/test.csv')
    
    train_df = df

    print("---------CALCULATING ESSAY LENGTHS---------")
    train_df = calculate_essay_lengths(train_df, "full_text")
    test_df = calculate_essay_lengths(test_df, "full_text")
    print("---------FINISHED CALCULATING ESSAY LENGTHS---------\n")

    print("---------ANALYZING SENTIMENTS---------")
    train_df = analyze_sentiment(train_df, "full_text")
    test_df = analyze_sentiment(test_df, "full_text")
    print("---------FINISHED ANALYZING SENTIMENTS---------\n")

    print("---------ANALYZING READABILITY---------")
    readability_train_df = analyze_readability(train_df, "full_text")
    train_df["fkg_score"] = readability_train_df["Flesch-Kincaid"]
    train_df["gf_score"] = readability_train_df["Gunning Fog"]
    readability_test_df = analyze_readability(test_df, "full_text")
    test_df["fkg_score"] = readability_test_df["Flesch-Kincaid"]
    test_df["gf_score"] = readability_test_df["Gunning Fog"]
    print("---------FINISHED ANALYZING READABILITY---------\n")

    print("---------ANALYZING LEXICAL DIVERSITY AND SPELLING MISTAKES---------")
    train_df = lexical_diversity_and_mistakes(train_df, "full_text")
    test_df = lexical_diversity_and_mistakes(test_df, "full_text")
    print("---------FINISHED ANALYZING LEXICAL DIVERSITY AND SPELLING MISTAKES---------\n")

    print("---------GETTING DIFFICULT WORD COUNT---------")
    train_df = get_difficult_word_count(train_df, "full_text")
    test_df = get_difficult_word_count(test_df, "full_text")
    print("---------FINISHED GETTING DIFFICULT WORD COUNT---------\n")

    return train_df, test_df

                                                 
# Helper Functions

def calculate_essay_lengths(df, text_column):
    df['char_essay_length'] = df[text_column].apply(len)
    df['words_essay_length'] = df[text_column].apply(lambda x: len(x.split()))
    df['sentence_essay_length'] = df[text_column].apply(lambda x: len(x.split('.')))
    return df

def analyze_sentiment(df, text_column):
    # Initialize the sentiment analyzer
    sid = SentimentIntensityAnalyzer()
    
    # Analyze the sentiment of each essay
    sentiment_scores = df[text_column].apply(lambda x: sid.polarity_scores(x)['compound'])
    
    # Add sentiment scores to the dataframe
    df['sentiment_score'] = sentiment_scores
    
    # Overall sentiment analysis
    positive_count = sum(sentiment_scores > 0)
    negative_count = sum(sentiment_scores < 0)
    neutral_count = len(sentiment_scores) - positive_count - negative_count
    
    # Data for visualization
    categories = ['Positive', 'Negative', 'Neutral']
    counts = [positive_count, negative_count, neutral_count]
    
    
    return df



def analyze_readability(df, text_column):
    # Analyzing readability scores for each essay
    readability_scores = []
    for essay in df[text_column]:
        # Compute Flesch-Kincaid Grade Level
        fkg_score = textstat.flesch_kincaid_grade(essay)
        
        # Compute Gunning Fog Index
        gunning_fog_score = textstat.gunning_fog(essay)
        
        # Add scores to list
        readability_scores.append({'Flesch-Kincaid': fkg_score, 'Gunning Fog': gunning_fog_score})
    
    # Creating a dataframe to store the scores
    readability_df = pd.DataFrame(readability_scores)
    
    return readability_df

# Inspo from https://www.kaggle.com/code/kuangank/ase-fighting
def lexical_diversity_and_mistakes(df, text_column):
    spell_checker = SpellChecker()
    
    lexical_diversities = []
    spelling_mistake_counts = []
    spelling_mistake_ratios = []
    
    for text in df[text_column]:
        tokens = word_tokenize(text)
        unique_tokens = set(tokens)
        
        # Calculate lexical diversity
        if len(tokens) == 0:
            lexical_diversity = 0
        else:
            lexical_diversity = len(unique_tokens) / len(tokens)
        
        # Calculate spelling mistakes
        spelling_mistake_count = len(spell_checker.unknown(token for token in tokens if token.isalpha()))
        
        # Calculate spelling mistake ratio
        if len(tokens) == 0:
            spelling_mistake_ratio = 0
        else:
            spelling_mistake_ratio = spelling_mistake_count / len(tokens)
        
        lexical_diversities.append(lexical_diversity)
        spelling_mistake_counts.append(spelling_mistake_count)
        spelling_mistake_ratios.append(spelling_mistake_ratio)
    
    df['lexical_diversity'] = lexical_diversities
    df['spelling_mistake_count'] = spelling_mistake_counts
    df['spelling_mistake_ratio'] = spelling_mistake_ratios
    
    return df

def get_difficult_word_count(df, text_column):
    difficult_word_counts = []
    difficult_word_ratios = []
    
    for text in df[text_column]:
        tokens = word_tokenize(text)
        
        # Calculate difficult words
        difficult_word_count = textstat.difficult_words(text)
        
        # Calculate difficult word ratio
        if len(tokens) == 0:
            difficult_word_ratio = 0
        else:
            difficult_word_ratio = difficult_word_count / len(tokens)
        
        difficult_word_counts.append(difficult_word_count)
        difficult_word_ratios.append(difficult_word_ratio)
    
    df['difficult_words'] = difficult_word_counts
    df['difficult_word_ratio'] = difficult_word_ratios
    
    return df

    
    train_dense = train_tfidf.toarray()
    
    train_tfidf_df = pd.DataFrame(train_dense, columns=[f'tfid_{i}' for i in range(train_dense.shape[1])])
    
    train_tfidf_df['essay_id'] = train_df['essay_id'].values
    
    train_df = train_df.merge(train_tfidf_df, on='essay_id', how='left')
    
    return train_df, test_df
    

In [7]:
train_feats, test_feats = feature_pipeline()

---------CALCULATING ESSAY LENGTHS---------
---------FINISHED CALCULATING ESSAY LENGTHS---------

---------ANALYZING SENTIMENTS---------
---------FINISHED ANALYZING SENTIMENTS---------

---------ANALYZING READABILITY---------
---------FINISHED ANALYZING READABILITY---------

---------ANALYZING LEXICAL DIVERSITY AND SPELLING MISTAKES---------
---------FINISHED ANALYZING LEXICAL DIVERSITY AND SPELLING MISTAKES---------

---------GETTING DIFFICULT WORD COUNT---------
---------FINISHED GETTING DIFFICULT WORD COUNT---------



In [8]:
train_df = train_feats.copy()
test_df = test_feats.copy()

# Save
train_df.to_csv('train_df.csv', index=False)
test_df.to_csv('test_df.csv', index = False)


In [9]:
train_df.head

<bound method NDFrame.head of       essay_id                                          full_text  score  \
0      000d118  Many people have car where they live. The thin...      3   
1      000fe60  I am a scientist at NASA that is discussing th...      3   
2      001ab80  People always wish they had the same technolog...      4   
3      001bdc0  We all heard about Venus, the planet without a...      4   
4      002ba53  Dear, State Senator\n\nThis is a letter to arg...      3   
...        ...                                                ...    ...   
17302  ffd378d  the story " The Challenge of Exploing Venus " ...      2   
17303  ffddf1f  Technology has changed a lot of ways that we l...      4   
17304  fff016d  If you don't like sitting around all day than ...      2   
17305  fffb49b  In "The Challenge of Exporing Venus," the auth...      1   
17306  fffed3e  Venus is worthy place to study but dangerous. ...      2   

       char_essay_length  words_essay_length  sentence_es

In [10]:
train_df.shape

(17307, 14)

In [11]:
train_df = train_df.dropna()

# Verify that there are no more NaN values
print(train_df.isnull().sum().sum())


0


In [12]:
train_df.shape

(17307, 14)

In [13]:
# Separate features and target variable from training and validation data
X_train = train_df.drop(columns=['essay_id', 'full_text','score'])  # Drop target column
y_train = train_df['score']  # Target column

test = test_df.drop(columns=['essay_id', 'full_text'])

## trained model

In [14]:
import pickle
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)

# Train the model
rf_model.fit(X_train, y_train)

# Save the trained model to a file using pickle
with open("random_forest_model.pkl", "wb") as f:
    pickle.dump(rf_model, f)

In [15]:
with open("random_forest_model.pkl", "rb") as f:
    loaded_rf_model = pickle.load(f)

# Make predictions on the validation set using the loaded model
y_pred = loaded_rf_model.predict(test)

# Assuming `test` DataFrame has an 'id' column and your predictions are stored in `y_pred`
submission_df = pd.DataFrame({
    'id': test_df['essay_id'],  # Make sure 'id' matches the name of your ID column
    'prediction': y_pred
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission.csv', index=False)


In [16]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import cross_val_score
# import numpy as np
# import matplotlib.pyplot as plt

# # Define a range of values for n_estimators
# n_estimators_range = [10, 50, 100, 200, 500, 1000]

# # Evaluate the model using cross-validation with parallel processing
# scores = []
# for n_estimators in n_estimators_range:
#     rf_model = RandomForestClassifier(n_estimators=n_estimators, random_state=42, n_jobs=-1)  # n_jobs=-1 uses all available cores
#     score = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1).mean()  # n_jobs=-1 uses all available cores
#     scores.append(score)
#     print(f'n_estimators: {n_estimators}, Cross-Validation Accuracy: {score:.4f}')

# # Plot the results
# plt.plot(n_estimators_range, scores)
# plt.xlabel('Number of Trees')
# plt.ylabel('Cross-Validation Accuracy')
# plt.title('Effect of Number of Trees on Model Performance')
# plt.show()
