In [17]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import papermill as pm

In [29]:
# set n-gram N:
N = 3 # default value, will get overwritten by papermill
DATA_PATH = "data/submission_metadata.csv"

In [30]:
def generate_ngram_features(data, n):
    # Initialize CountVectorizer with desired n-gram range
    vectorizer = CountVectorizer(ngram_range=(n, n), binary=True)
    # Fit and transform the data
    X = vectorizer.fit_transform(data)
    # Get feature names
    feature_names = vectorizer.get_feature_names_out()
    return X, feature_names

In [31]:
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Name,Student ID,Email,Status,Submission ID,Total Score,Max Points,Question 1 Score,Question 1 Weight,Question 1 Graded?,Question 1 Response,Question 1 Submitted At
0,Andrew Pan,A10001,andrep24@uw.edu,Ungraded,231798102,0.0,1.0,,1.0,False,Binary search trees represent a sophisticated ...,2024-02-17 14:04:35 -0800


In [32]:
text_data = df['Question 1 Response'].tolist()
text_data

['Binary search trees represent a sophisticated data structure pivotal for managing and manipulating large datasets with optimal efficiency. Their function transcends simple storage; they offer a systematic arrangement of elements, allowing logarithmic time complexity for search operations through efficient partitioning of the data space. Furthermore, binary search trees facilitate dynamic operations such as insertion and deletion while maintaining their balanced structure, making them indispensable in domains where performance and scalability are paramount, such as database management systems and network routing algorithms.']

In [33]:
# Generate n-gram features
X, feature_names = generate_ngram_features(text_data, N)

# Convert the result into a DataFrame for better visualization
ngram_df = pd.DataFrame(X.toarray(), columns=feature_names)

# Output the resulting DataFrame
ngram_df

Unnamed: 0,allowing logarithmic time,and deletion while,and manipulating large,and network routing,and scalability are,are paramount such,arrangement of elements,as database management,as insertion and,balanced structure making,...,them indispensable in,they offer systematic,through efficient partitioning,time complexity for,transcends simple storage,trees facilitate dynamic,trees represent sophisticated,where performance and,while maintaining their,with optimal efficiency
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [23]:
print("test")

test


In [15]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus  import stopwords
import re

In [21]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

def generate_ngram_tfidf_pipeline(data, labels, n):
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(ngram_range=(n, n), binary=True)),
        ('tfidf', TfidfTransformer()),
        ('classifier', RandomForestClassifier())
    ])
    
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)
    
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    # print(f'Classification Score for n={n} ->{classification_report(y_test, y_pred)}')
    print(f'Accuracy Score for n={n} -> {accuracy_score(y_test, y_pred)}')

    
    return pipeline

def main():

    def cleaning(df):
        lowered=df.lower() # lowering the sentences 
        removed = re.sub(r'[^a-z]',' ',lowered)  # replacing the non alphabets with space 
        splitted=removed.split(' ')   # splitting the sentences by spaces to lemmatize
        df = [WordNetLemmatizer().lemmatize(word) for word in splitted 
            if word not in stopwords.words('english')]  # lemmatizing and removing stopwords
        df =' '.join(df) # joining back the words of list
        return(removed)

    csv_file = 'data/mohler_dataset_edited.csv'
    data = pd.read_csv(csv_file)
    data['desired_answer'] = data['desired_answer'].apply(cleaning)
    data['student_answer'] = data['student_answer'].apply(cleaning)

    
    text_data = data['student_answer'].tolist()
    labels = data['desired_answer'].tolist() 
    
    # n = 3
    num_n = 10
    for n in range(num_n):
        generate_ngram_tfidf_pipeline(text_data, labels, n)

    
    # ngram_tfidf_pipeline = generate_ngram_tfidf_pipeline(text_data, labels, n)

if __name__ == "__main__":
    main()

Accuracy Score for n=0 -> 0.024175824175824177
Accuracy Score for n=1 -> 0.6879120879120879
Accuracy Score for n=2 -> 0.589010989010989
Accuracy Score for n=3 -> 0.46813186813186813
Accuracy Score for n=4 -> 0.33186813186813185
Accuracy Score for n=5 -> 0.23516483516483516
Accuracy Score for n=6 -> 0.16923076923076924
Accuracy Score for n=7 -> 0.12307692307692308
Accuracy Score for n=8 -> 0.1054945054945055
Accuracy Score for n=9 -> 0.0945054945054945
