<a href="https://colab.research.google.com/github/poornaditya1/Essgraster/blob/master/Essgraster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset: ASAP Automated Essay Scoring (Kaggle), https://www.kaggle.com/c/asap-aes/data


In [None]:
# Mount Google Drive at /content/drive; the dataset is read from, and the
# trained models are saved to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model, model_from_config
import keras.backend as K
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import cohen_kappa_score
import math
from gensim.test.utils import datapath

In [None]:
# Load the training set: a tab-separated file decoded as ISO-8859-1.
# (The commented-out line is the same read from an alternative Drive folder.)
#X = pd.read_csv('/content/drive/MyDrive/contents/training_set_rel3.tsv', sep='\t', encoding='ISO-8859-1')
X = pd.read_csv('/content/drive/MyDrive/Cicada3301/training_set_rel3.tsv', sep='\t', encoding='ISO-8859-1')
# Target: the `domain1_score` column, captured before any columns are dropped.
y = X['domain1_score']
# Drop every column that contains at least one missing value.
X = X.dropna(axis=1)
# Drop the two individual rater columns; `domain1_score` itself stays in X.
X = X.drop(columns=['rater1_domain1', 'rater2_domain1'])

In [None]:
X.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8


In [None]:
# Lower / upper score bounds, nine entries each (indices 0-8).
# NOTE(review): presumably indexed by essay_set (1-8) with index 0 as an unused
# -1 placeholder — confirm. Neither list is read by the code visible here.
minimum_scores = [-1, 2, 1, 0, 0, 0, 0, 0, 0]
maximum_scores = [-1, 12, 6, 3, 3, 4, 4, 30, 60]

In [None]:
def get_model():
    """Build, compile and return the stacked-LSTM score regressor.

    Layers, in order: a 300-unit LSTM over inputs of shape (1, 300) that
    returns sequences, a 64-unit LSTM, a 0.5 dropout and a single ReLU output
    unit. The model is compiled with mean-squared-error loss, the RMSprop
    optimizer and MAE as a reported metric; its summary is printed before it
    is returned.
    """
    layer_stack = (
        LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True),
        LSTM(64, recurrent_dropout=0.4),
        Dropout(0.5),
        Dense(1, activation='relu'),
    )

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])
    model.summary()

    return model

All helper function definitions go in this section.


In [None]:
# Fetch the NLTK resources used below: 'punkt' for the sentence tokenizer
# loaded in essay_to_sentences(), 'stopwords' for the English stop-word list
# used in essay_to_wordlist().
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
#I have added a text to indicate where this block has to be placed
def essay_to_wordlist(essay_v, remove_stopwords):
    """Turn raw essay text into a list of lowercase, letters-only tokens.

    Every character outside a-z / A-Z is replaced by a space (so digits,
    punctuation and the '@' of tags such as '@CAPS1' disappear), the text is
    lower-cased and split on whitespace. When ``remove_stopwords`` is truthy,
    NLTK's English stop words are filtered out of the result.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", essay_v)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens

    english_stops = set(stopwords.words("english"))
    kept = []
    for token in tokens:
        if token not in english_stops:
            kept.append(token)
    return kept

def essay_to_sentences(essay_v, remove_stopwords):
    """Split an essay into sentences and word-tokenize each of them.

    The stripped essay is split with NLTK's English punkt tokenizer; every
    non-empty sentence is passed through essay_to_wordlist(). Returns a list
    with one word list per sentence.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    return [
        essay_to_wordlist(sentence, remove_stopwords)
        for sentence in punkt.tokenize(essay_v.strip())
        if len(sentence) > 0
    ]

def makeFeatureVec(words, model, num_features):
    """Average the Word2Vec vectors of an essay's in-vocabulary words.

    Args:
        words: iterable of tokens belonging to a single essay.
        model: trained Word2Vec model; the vocabulary is read from
            ``model.wv.index2word`` and vectors from ``model.wv[word]``.
        num_features: dimensionality of the word vectors.

    Returns:
        A float32 array of shape ``(num_features,)`` holding the mean vector
        of the known words, or all zeros when none of ``words`` is in the
        vocabulary.
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    num_words = 0
    word_vectors = model.wv
    # A set gives O(1) membership tests against the vocabulary.
    index2word_set = set(word_vectors.index2word)
    for word in words:
        if word in index2word_set:
            num_words += 1
            # Look vectors up through `.wv`; indexing the model object itself
            # is the deprecated spelling of the same lookup.
            featureVec += word_vectors[word]
    if num_words == 0:
        # No known word: return the zero vector rather than dividing 0 by 0,
        # which would yield an all-NaN feature row (and NaN predictions/loss).
        return featureVec
    return np.divide(featureVec, num_words)

def getAvgFeatureVecs(essays, model, num_features):
    """Build the essay feature matrix, one averaged word vector per row.

    ``essays`` is a sequence of word lists; row ``i`` of the returned float32
    array of shape ``(len(essays), num_features)`` is makeFeatureVec() applied
    to ``essays[i]``.
    """
    essayFeatureVecs = np.zeros((len(essays), num_features), dtype="float32")
    for row, essay_words in enumerate(essays):
        essayFeatureVecs[row] = makeFeatureVec(essay_words, model, num_features)
    return essayFeatureVecs

In [None]:
# 5-fold cross-validation: for each fold, train a Word2Vec model on the training
# essays, turn every essay into one averaged word vector, fit a fresh LSTM and
# score it with quadratic-weighted kappa.
# NOTE(review): shuffle=True without random_state, so the folds (and the scores)
# differ from run to run.
cv = KFold(n_splits=5, shuffle=True)
results = []
# NOTE(review): y_pred_list is never filled anywhere in this cell.
y_pred_list = []

count = 1
for traincv, testcv in cv.split(X):
    print("\n--------Fold {}--------\n".format(count))
    X_test, X_train, y_test, y_train = X.iloc[testcv], X.iloc[traincv], y.iloc[testcv], y.iloc[traincv]

    train_essays = X_train['essay']
    test_essays = X_test['essay']

    sentences = []

    for essay in train_essays:
            # Obtaining all sentences from the training essays.
            sentences += essay_to_sentences(essay, remove_stopwords = True)

    # Initializing variables for word2vec model.
    num_features = 300
    min_word_count = 40
    num_workers = 4
    context = 10
    downsampling = 1e-3

    print("Training Word2Vec Model...")
    model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling)

    # Both files below are written on every fold with the same path, so after
    # the loop they hold the Word2Vec model of the last fold.
    model.init_sims(replace=True)
    model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)
    model.save("/content/drive/MyDrive/Cicada3301/model_weights/word2vec.model")

    clean_train_essays = []

    # Generate training and testing data word vectors.
    for essay_v in train_essays:
        clean_train_essays.append(essay_to_wordlist(essay_v, remove_stopwords=True))
    trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)

    clean_test_essays = []
    for essay_v in test_essays:
        clean_test_essays.append(essay_to_wordlist( essay_v, remove_stopwords=True ))
    testDataVecs = getAvgFeatureVecs( clean_test_essays, model, num_features )

    trainDataVecs = np.array(trainDataVecs)
    testDataVecs = np.array(testDataVecs)
    # Reshape train and test vectors to 3 dimensions: (samples, 1, features),
    # where the middle 1 represents a single timestep.
    trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))
    testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

    # A new, untrained LSTM is built for every fold.
    lstm_model = get_model()
    lstm_model.fit(trainDataVecs, y_train, batch_size=64, epochs=50)
    #lstm_model.load_weights('./model_weights/final_lstm.h5')
    y_pred = lstm_model.predict(testDataVecs)

    # Persist only the LSTM of the fifth (last) fold; it pairs with the
    # Word2Vec model left on disk by that same fold.
    if count == 5:
         lstm_model.save('/content/drive/MyDrive/Cicada3301/model_weights/final_lstm.h5')

    # Round y_pred to the nearest integer.
    y_pred = np.around(y_pred)

    # Evaluate the fold with quadratic-weighted Cohen's kappa.
    result = cohen_kappa_score(y_test.values,y_pred,weights='quadratic')
    print("Kappa Score: {}".format(result))
    results.append(result)

    count += 1


--------Fold 1--------

Training Word2Vec Model...




Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 1, 300)            721200    
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 814,705
Trainable params: 814,705
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch

In [None]:
# Mean of the per-fold kappa scores collected in `results`, rounded to 4 decimals.
print("Average Kappa score after a 5-fold cross validation: ",np.around(np.array(results).mean(),decimals=4))

Average Kappa score after a 5-fold cross validation:  0.959


In [None]:
contentBad = """
        In “Let there be dark,” Paul Bogard talks about the importance of darkness.

Darkness is essential to humans. Bogard states, “Our bodies need darkness to produce the hormone melatonin, which keeps certain cancers from developing, and our bodies need darkness for sleep, sleep. Sleep disorders have been linked to diabetes, obesity, cardiovascular disease and depression and recent research suggests are main cause of “short sleep” is “long light.” Whether we work at night or simply take our tablets, notebooks and smartphones to bed, there isn’t a place for this much artificial light in our lives.” (Bogard 2). Here, Bogard talks about the importance of darkness to humans. Humans need darkness to sleep in order to be healthy.

Animals also need darkness. Bogard states, “The rest of the world depends on darkness as well, including nocturnal and crepuscular species of birds, insects, mammals, fish and reptiles. Some examples are well known—the 400 species of birds that migrate at night in North America, the sea turtles that come ashore to lay their eggs—and some are not, such as the bats that save American farmers billions in pest control and the moths that pollinate 80% of the world’s flora. Ecological light pollution is like the bulldozer of the night, wrecking habitat and disrupting ecosystems several billion years in the making. Simply put, without darkness, Earth’s ecology would collapse...” (Bogard 2). Here Bogard explains that animals, too, need darkness to survive.
    """
contentGood = """
        In response to our world’s growing reliance on artificial light, writer Paul Bogard argues that natural darkness should be preserved in his article “Let There be dark”. He effectively builds his argument by using a personal anecdote, allusions to art and history, and rhetorical questions.

Bogard starts his article off by recounting a personal story – a summer spent on a Minnesota lake where there was “woods so dark that [his] hands disappeared before [his] eyes.” In telling this brief anecdote, Bogard challenges the audience to remember a time where they could fully amass themselves in natural darkness void of artificial light. By drawing in his readers with a personal encounter about night darkness, the author means to establish the potential for beauty, glamour, and awe-inspiring mystery that genuine darkness can possess. He builds his argument for the preservation of natural darkness by reminiscing for his readers a first-hand encounter that proves the “irreplaceable value of darkness.” This anecdote provides a baseline of sorts for readers to find credence with the author’s claims.

Bogard’s argument is also furthered by his use of allusion to art – Van Gogh’s “Starry Night” – and modern history – Paris’ reputation as “The City of Light”. By first referencing “Starry Night”, a painting generally considered to be undoubtedly beautiful, Bogard establishes that the natural magnificence of stars in a dark sky is definite. A world absent of excess artificial light could potentially hold the key to a grand, glorious night sky like Van Gogh’s according to the writer. This urges the readers to weigh the disadvantages of our world consumed by unnatural, vapid lighting. Furthermore, Bogard’s alludes to Paris as “the famed ‘city of light’”. He then goes on to state how Paris has taken steps to exercise more sustainable lighting practices. By doing this, Bogard creates a dichotomy between Paris’ traditionally alluded-to name and the reality of what Paris is becoming – no longer “the city of light”, but moreso “the city of light…before 2 AM”. This furthers his line of argumentation because it shows how steps can be and are being taken to preserve natural darkness. It shows that even a city that is literally famous for being constantly lit can practically address light pollution in a manner that preserves the beauty of both the city itself and the universe as a whole.

Finally, Bogard makes subtle yet efficient use of rhetorical questioning to persuade his audience that natural darkness preservation is essential. He asks the readers to consider “what the vision of the night sky might inspire in each of us, in our children or grandchildren?” in a way that brutally plays to each of our emotions. By asking this question, Bogard draws out heartfelt ponderance from his readers about the affecting power of an untainted night sky. This rhetorical question tugs at the readers’ heartstrings; while the reader may have seen an unobscured night skyline before, the possibility that their child or grandchild will never get the chance sways them to see as Bogard sees. This strategy is definitively an appeal to pathos, forcing the audience to directly face an emotionally-charged inquiry that will surely spur some kind of response. By doing this, Bogard develops his argument, adding gutthral power to the idea that the issue of maintaining natural darkness is relevant and multifaceted.

Writing as a reaction to his disappointment that artificial light has largely permeated the prescence of natural darkness, Paul Bogard argues that we must preserve true, unaffected darkness. He builds this claim by making use of a personal anecdote, allusions, and rhetorical questioning.
    """

In [None]:
 content = contentBad
    
if len(content) > 20:
  num_features = 300
  clean_test_essays = []
  clean_test_essays.append(essay_to_wordlist( content, remove_stopwords=True ))
  testDataVecs = getAvgFeatureVecs( clean_test_essays, model, num_features )
  #Error in the above line

  #Just upar wale block me error hai probably, rectify kar dena please

  
  testDataVecs = np.array(testDataVecs)
  testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

  preds = lstm_model.predict(testDataVecs)
  print(preds)

  if math.isnan(preds):
    preds = 0
  else:
    preds = np.round(preds)

  if preds < 0:
    preds = 0
  #else:
  #  preds = 0
        
print(int(preds))

[[5.391217]]
5




In [None]:
from tensorflow import keras

# Grade the stronger sample essay, this time with the LSTM reloaded from the
# file saved during training.
content = contentGood

# Default grade so the final print is well-defined even when the essay is too
# short to be scored.
preds = 0

if len(content) > 20:
  num_features = 300
  clean_test_essays = [essay_to_wordlist(content, remove_stopwords=True)]
  # `model` must still be the Word2Vec model here.
  testDataVecs = getAvgFeatureVecs(clean_test_essays, model, num_features)

  # Shape (1, 1, num_features): one essay, one timestep.
  testDataVecs = np.array(testDataVecs)
  testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

  # Load the saved LSTM under its own name: rebinding `model` to it would
  # replace the Word2Vec model and break any re-run of the featurisation above.
  saved_lstm = keras.models.load_model('/content/drive/MyDrive/Cicada3301/model_weights/final_lstm.h5')
  raw_preds = saved_lstm.predict(testDataVecs)

  print(raw_preds)

  # predict() returns a (1, 1) array; take the scalar out explicitly instead
  # of relying on implicit array-to-scalar conversion in isnan()/int().
  score = float(raw_preds[0][0])
  if math.isnan(score):
    preds = 0
  else:
    # Round to the nearest integer grade and never report a negative one.
    preds = max(0, int(np.round(score)))

print(int(preds))



[[4.386077]]
4


In [None]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip

--2021-05-29 04:03:47--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 54.144.219.72, 35.170.116.11, 34.205.198.58, ...
Connecting to bin.equinox.io (bin.equinox.io)|54.144.219.72|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13832437 (13M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip’


2021-05-29 04:03:47 (82.9 MB/s) - ‘ngrok-stable-linux-amd64.zip’ saved [13832437/13832437]



In [None]:
!unzip ngrok-stable-linux-amd64.zip

Archive:  ngrok-stable-linux-amd64.zip
  inflating: ngrok                   


In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import re
from nltk.corpus import stopwords
from tensorflow import keras
import math
from gensim.test.utils import datapath

from gensim.models import Word2Vec
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model, model_from_config

# Defaults: `preds` stays 0 unless an essay is scored further down; `n` is
# reassigned from the user's question-number input later in this script.
preds = 0
n = 0

# Lower / upper score bounds, nine entries each (indices 0-8).
# NOTE(review): presumably indexed by essay set (1-8) with index 0 as an unused
# -1 placeholder — confirm. Neither list is read by the code visible here.
minimum_scores = [-1, 2, 1, 0, 0, 0, 0, 0, 0]
maximum_scores = [-1, 12, 6, 3, 3, 4, 4, 30, 60]

def essay_to_wordlist(essay_v, remove_stopwords):
    """Turn raw essay text into a list of lowercase, letters-only tokens.

    Every character outside a-z / A-Z is replaced by a space (so digits,
    punctuation and the '@' of tags such as '@CAPS1' disappear), the text is
    lower-cased and split on whitespace. When ``remove_stopwords`` is truthy,
    NLTK's English stop words are filtered out of the result.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", essay_v)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens

    english_stops = set(stopwords.words("english"))
    kept = []
    for token in tokens:
        if token not in english_stops:
            kept.append(token)
    return kept

def essay_to_sentences(essay_v, remove_stopwords):
    """Split an essay into sentences and word-tokenize each of them.

    The stripped essay is split with NLTK's English punkt tokenizer; every
    non-empty sentence is passed through essay_to_wordlist(). Returns a list
    with one word list per sentence.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    return [
        essay_to_wordlist(sentence, remove_stopwords)
        for sentence in punkt.tokenize(essay_v.strip())
        if len(sentence) > 0
    ]

def makeFeatureVec(words, model, num_features):
    """Average the Word2Vec vectors of an essay's in-vocabulary words.

    Args:
        words: iterable of tokens belonging to a single essay.
        model: trained Word2Vec model; the vocabulary is read from
            ``model.wv.index2word`` and vectors from ``model.wv[word]``.
        num_features: dimensionality of the word vectors.

    Returns:
        A float32 array of shape ``(num_features,)`` holding the mean vector
        of the known words, or all zeros when none of ``words`` is in the
        vocabulary.
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    num_words = 0
    word_vectors = model.wv
    # A set gives O(1) membership tests against the vocabulary.
    index2word_set = set(word_vectors.index2word)
    for word in words:
        if word in index2word_set:
            num_words += 1
            # Look vectors up through `.wv`; indexing the model object itself
            # is the deprecated spelling of the same lookup.
            featureVec += word_vectors[word]
    if num_words == 0:
        # No known word: return the zero vector rather than dividing 0 by 0,
        # which would yield an all-NaN feature row and a NaN prediction.
        return featureVec
    return np.divide(featureVec, num_words)

def getAvgFeatureVecs(essays, model, num_features):
    """Build the essay feature matrix, one averaged word vector per row.

    ``essays`` is a sequence of word lists; row ``i`` of the returned float32
    array of shape ``(len(essays), num_features)`` is makeFeatureVec() applied
    to ``essays[i]``.
    """
    essayFeatureVecs = np.zeros((len(essays), num_features), dtype="float32")
    for row, essay_words in enumerate(essays):
        essayFeatureVecs[row] = makeFeatureVec(essay_words, model, num_features)
    return essayFeatureVecs

def u_in():
  """Render the two Streamlit text inputs and return what the user typed.

  Returns a ``(text, num)`` tuple: the essay text first and the question
  number second, i.e. the reverse of the order in which the widgets appear.
  """
  question_number = st.text_input("Enter question number: ")
  essay_text = st.text_input("Enter your essay to be graded: ")
  return essay_text, question_number

def header(url):
  """Render ``url`` (the text to display) as a centred 50px HTML paragraph."""
  html = f'<p style="font-size:50px;border-radius:5%;text-align:center;">{url}</p>'
  st.markdown(html, unsafe_allow_html=True)
#  st.markdown("__Auto Essay Grader__")    
def header1(url):
  """Render ``url`` (the text to display) as a centred 26px HTML paragraph."""
  html = f'<p style="font-size:26px;border-radius:2%;text-align:center;">{url}</p>'
  st.markdown(html, unsafe_allow_html=True)
# Static page content: the two banner lines, then the eight essay prompts the
# user can choose from.
header("Auto Essay Grader")
#st.subheader("Questions:")
header1("All the Best!!!!!!")
st.title("Questions:")
st.write("1. Zoos have been amusing and educating humans about animals for centuries. Although, containment of animals in zoos is an increasingly controversial topic. Some argue that confining animals to their cages in zoos are both necessary and healthy, while others refute this. Both sides of this debate will be analysed in this essay before a reasoned conclusion is drawn. Drop your view on this.")
st.write("2. If you had the opportunity to bring any person — past or present, fictional or nonfictional — to a place that is special to you (your hometown or country, a favourite location, etc.), who would you bring and why? Tell us what you would share with that person. ")
st.write("3. The world is becoming more and more globalized which can be considered a very positive thing on the overall but it is important to note that this globalization brings with itself the probability of more and more disagreements and difference in perspectives. Since the invention of nuclear weapons, we have had a long period of GLOBAL peace and stability. Are nuclear weapons global peacemakers or killing devices? ")
st.write("4. After 23 films in the franchise, Marvel has earned the right to take a huge risk for. What they gave the audience was a profound and impactful case study in the grieving process. If you are an avenger fan and would like to fill some of your thoughts into this context then take a chance to fill below. ")
st.write("5. Pretend you woke up one day and there is no Covid. People are moving around freely without mask. You see them going on trips. You plan for trips. Explain your thoughts on this imagination of every citizen of what the world would be like. Use your imagination!")
st.write("6. Since our nation's founding, the government -- colonial, federal, and state has punished a varying percentage of arbitrarily-selected murders with the ultimate sanction: death. Sentencing remained to be like tit-for-tat. Explain you views on if the death penalty is effective or not?")
st.write("7. Imagine that your teacher wants to teach a new subject for the next few weeks. Your teacher will take suggestions, and then let the students vote on the new subject. What subject you as a student choose? Write an essay to support your choice and to persuade the other students to vote for your choice.")
st.write("8. The place of black hole in space is such that the gravity pulls a body in such a case that even light cannot pull out with a great amount of matter packed in a small area. You being a space travel enthusiast, write an essay of you travelling to the black hole.")

# u_in() returns (essay text, question number), both as typed by the user.
df,n = u_in()
content = df

# Word2Vec model and LSTM saved by the training notebook.
model = Word2Vec.load("/content/drive/MyDrive/Cicada3301/model_weights/word2vec.model")

lstm_model = keras.models.load_model('/content/drive/MyDrive/Cicada3301/model_weights/final_lstm.h5')

# Only essays longer than 20 characters are scored; otherwise `preds` keeps
# the default assigned near the top of the script.
if len(content) > 20:
  num_features = 300
  clean_test_essays = [essay_to_wordlist(content, remove_stopwords=True)]
  testDataVecs = getAvgFeatureVecs(clean_test_essays, model, num_features)

  # Shape (1, 1, num_features): one essay, one timestep.
  testDataVecs = np.array(testDataVecs)
  testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))

  # predict() returns a (1, 1) array; take the scalar out explicitly instead
  # of relying on implicit array-to-scalar conversion in isnan()/int().
  score = float(lstm_model.predict(testDataVecs)[0][0])

  if math.isnan(score):
    preds = 0
  else:
    # Round to the nearest integer grade and never report a negative one.
    preds = max(0, int(np.round(score)))

# `n` is the string typed into the question-number box, so an identity test
# against the int 0 (`n is not 0`) is always true; show the result only once a
# question number has actually been entered.
if n:
  st.write("Question no. : " + n)
  st.write("Final grade is " + str(int(preds)))


Overwriting app.py


In [None]:
# Start ngrok in the background, tunnelling HTTP traffic to local port 8501.
# NOTE(review): 8501 is presumably the port the Streamlit app listens on — confirm.
get_ipython().system_raw('./ngrok http 8501 &')


In [None]:
!curl -s http://localhost:4040/api/tunnels | python3 -c \
    'import sys, json; print("Execute the next cell and the go to the following URL: " +json.load(sys.stdin)["tunnels"][0]["public_url"])'

Execute the next cell and the go to the following URL: https://b965d6142bdb.ngrok.io


In [None]:
!streamlit run app.py

[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.2:8501[0m
[34m  External URL: [0m[1mhttp://35.221.61.145:8501[0m
[0m
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2021-05-29 05:08:18.355750: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-05-29 05:08:19.487 'pattern' package not found; tag filters are not available for English
2021-05-29 05:08:19.490 adding document #0 to Dictionary(0 unique tokens: [])
2021-05-29 05:08:19.490 built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
2021-05-29 05:08:19.940 loading Word2Vec object from /content/drive/MyDrive/Cicada3301/mo

In [None]:
!pip install streamlit

Collecting streamlit
[?25l  Downloading https://files.pythonhosted.org/packages/04/11/57097e14f72a2d1b2a1bbe86c2a8bc375661bfd5c30b5e8cee7c2fad9a44/streamlit-0.82.0-py2.py3-none-any.whl (8.2MB)
[K     |████████████████████████████████| 8.2MB 15.3MB/s 
Collecting pydeck>=0.1.dev5
[?25l  Downloading https://files.pythonhosted.org/packages/d6/bc/f0e44828e4290367c869591d50d3671a4d0ee94926da6cb734b7b200308c/pydeck-0.6.2-py2.py3-none-any.whl (4.2MB)
[K     |████████████████████████████████| 4.2MB 50.2MB/s 
[?25hCollecting blinker
[?25l  Downloading https://files.pythonhosted.org/packages/1b/51/e2a9f3b757eb802f61dc1f2b09c8c99f6eb01cf06416c0671253536517b6/blinker-1.4.tar.gz (111kB)
[K     |████████████████████████████████| 112kB 55.7MB/s 
Collecting watchdog; platform_system != "Darwin"
[?25l  Downloading https://files.pythonhosted.org/packages/f2/5b/36b3b11e557830de6fc1dc06e9aa3ee274119b8cea9cc98175dbbf72cf87/watchdog-2.1.2-py3-none-manylinux2014_x86_64.whl (74kB)
[K     |████████████