In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, fbeta_score, recall_score

%matplotlib inline

### The following code block creates a dataset of students who received grades for all 3 questions in the dataset

In [2]:
df = pd.read_excel('litmus_results_trial1.xlsx')
q1_score = df[pd.notnull(df.question_id_11459_score)]
q1q2 = q1_score[pd.notnull(q1_score.question_id_11460_score)]
q1q2q3 = q1q2[pd.notnull(q1q2.question_id_11461_score)]

In [3]:
"Remove any null values"
q1_answers = q1q2q3.question_id_11459_answer
q1_answers = q1_answers.apply(lambda x: x if pd.notnull(x) else "")

q2_answers = q1q2q3.question_id_11460_answer
q2_answers = q2_answers.apply(lambda x: x if pd.notnull(x) else "")

q3_answers = q1q2q3.question_id_11461_answer
q3_answers = q3_answers.apply(lambda x: x if pd.notnull(x) else "")

In [4]:
"Turn the student answers into a list"
q1_answers = q1_answers.tolist()
q2_answers = q2_answers.tolist()
q3_answers = q3_answers.tolist()

In [5]:
"Grab the solutions from the rubric for each question"
solution_q1 = 'arent you a little too old to be playing make believe'
solution_q2 = 'Monica smiles'
solution_q3 = 'Monica’s response changed because of what happened when she' \
+ 'went blueberry picking with her mother. She started having fun when she was' \
+ 'singing and dancing under the branches. So she understood why her mom sang' \
+ 'and danced for fun at the end of the story. Also, Monica got stung by a bee,' \
+ 'and her mom took care of her by hugging her and giving her first aid cream.' \
+ 'So at the end of the story, she wasn’t annoyed at her mom anymore.'

### The following section's goal is to determine how well a model can predict whether a student received a perfect score or not. This is because the rubric provides the most clear answers for a perfect score

In [6]:
"Append each solution to its corresponding list"
q1_answers.append(solution_q1)
q2_answers.append(solution_q2)
q3_answers.append(solution_q3)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Remove punctuation and make all sentences lowercase

In [8]:
lc_q1_answers = []
lc_q2_answers = []
lc_q3_answers = []
for i in range(len(q1_answers)):
    lc_q1_answers.append(q1_answers[i].lower())
    lc_q2_answers.append(q2_answers[i].lower())
    lc_q3_answers.append(q3_answers[i].lower())
    
np_q1_answers = []
np_q2_answers = []
np_q3_answers = []
import string

for i in lc_q1_answers:
    np_q1_answers.append(i.translate(str.maketrans('', '', string.punctuation)))
for i in lc_q2_answers:
    np_q2_answers.append(i.translate(str.maketrans('', '', string.punctuation)))
for i in lc_q3_answers:
    np_q3_answers.append(i.translate(str.maketrans('', '', string.punctuation)))

#### Implement a cosine similarity score to use in the model

In [9]:
vect = TfidfVectorizer()

In [10]:
tfidf1 = vect.fit_transform(np_q1_answers)
tfidf2 = vect.fit_transform(np_q2_answers)
tfidf3 = vect.fit_transform(np_q3_answers)

#### The last row in the matrix tfidf matrix which is '243' represents how similar each student's answer was to the rubric solution. We will append these results to the dataset to create a variable we can use for the model

In [11]:
sim_score_q1 = list((tfidf1 * tfidf1.T).A[243])
sim_score_q2 = list((tfidf2 * tfidf2.T).A[243])
sim_score_q3 = list((tfidf3 * tfidf3.T).A[243])

In [12]:
q1q2q3["id_11459_sim_score"] = sim_score_q1[:-1]
q1q2q3["id_11460_sim_score"] = sim_score_q2[:-1]
q1q2q3["id_11461_sim_score"] = sim_score_q3[:-1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
q1q2q3['q1_hundred'] = q1q2q3.question_id_11459_score.apply(lambda x: 1 if x >= 1.0 else 0)
q1q2q3['q2_hundred'] = q1q2q3.question_id_11460_score.apply(lambda x: 1 if x >= 1.0 else 0)
q1q2q3['q3_hundred'] = q1q2q3.question_id_11461_score.apply(lambda x: 1 if x >= 1.0 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
X = q1q2q3[["id_11459_sim_score"]]
y = q1q2q3[["q1_hundred"]]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

#### I will implement a support vector machine model. The reason is because they are well known for being able to determine boundaries in high dimensional space

In [16]:
model = SVC()
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [17]:
predictions_test = model.predict(X_test)

In [18]:
accuracy_score(y_test, predictions_test)

0.79452054794520544

In [19]:
X = q1q2q3[["id_11460_sim_score"]]
y = q1q2q3[["q2_hundred"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

model = SVC()
model.fit(X_train, y_train)

predictions_test = model.predict(X_test)
accuracy_score(y_test, predictions_test)

  y = column_or_1d(y, warn=True)


0.69863013698630139

In [20]:
X = q1q2q3[["id_11461_sim_score"]]
y = q1q2q3[["q3_hundred"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)


model = SVC()
model.fit(X_train, y_train)

predictions_test = model.predict(X_test)
accuracy_score(y_test, predictions_test)

  y = column_or_1d(y, warn=True)


0.50684931506849318

### The model does very well in predicting whether a student received a perfect score or not for question 11459 but performs poorly for 11460 and 11461. This is most likely due to how short the solution provided for question 11460 is. Also question 11461 has a very long response as a solution and the rubric states that the correct answer may vary.

### Next I will determine how well a model can predict any score on question 11461.

q2 is correct when a student answers 'monica smiles'. also make past tene = present tense

In [21]:
q1_75_answer = "We’re WHAT? What am I? Slave labor?"

q1_50_answer = "she says she thinks her Mom is being childish"

solution_q1 = 'arent you a little too old to be playing make believe'

In [22]:
q1_answers = q1q2q3.question_id_11459_answer
q1_answers = q1_answers.apply(lambda x: x if pd.notnull(x) else "")
q1_answers = q1_answers.tolist()

In [23]:
q1_answers.append(q1_75_answer)

In [24]:
lc_q1_answers = []
np_q1_answers = []
for i in range(len(q1_answers)):
    lc_q1_answers.append(q1_answers[i].lower())
    np_q1_answers.append(lc_q1_answers[i].translate(str.maketrans('', '', string.punctuation)))

In [25]:
tfidf1 = vect.fit_transform(np_q1_answers)
sim_score_q1 = list((tfidf1 * tfidf1.T).A[243])
q1q2q3["id_11459_sim_score_75"] = sim_score_q1[:-1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [48]:
q1_answers = q1q2q3.question_id_11459_answer
q1_answers = q1_answers.apply(lambda x: x if pd.notnull(x) else "")
q1_answers = q1_answers.tolist()

q1_answers.append(q1_50_answer)

lc_q1_answers = []
np_q1_answers = []
for i in range(len(q1_answers)):
    lc_q1_answers.append(q1_answers[i].lower())
    np_q1_answers.append(lc_q1_answers[i].translate(str.maketrans('-', ' ', '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~')))
    
tfidf1 = vect.fit_transform(np_q1_answers)
sim_score_q1 = list((tfidf1 * tfidf1.T).A[243])
q1q2q3["id_11459_sim_score_50"] = sim_score_q1[:-1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


#### This function is needed convert the scores into a categorical variable that we can predict

In [27]:
def convert(x):
    if x == 1:
        return 3
    if x == .75:
        return 2
    if x == .5:
        return 1
    else:
        return 0

In [57]:
q1q2q3['q1_scores'] = q1q2q3.question_id_11459_score.apply(convert)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [58]:
X = q1q2q3[["id_11459_sim_score", "id_11459_sim_score_75", "id_11459_sim_score_50"]]
y = q1q2q3[["q1_scores"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

In [59]:
model = SVC()
model.fit(X_train, y_train)

predictions_test = model.predict(X_test)
accuracy_score(y_test, predictions_test)

  y = column_or_1d(y, warn=True)


0.83561643835616439

### For question id 11459, the SVC model has a 80% chance of determining the correct answer of a student receiving a grade of 1, .75, .5, or 0. This is without fixing mispelled words or slight changes in the word used such as smiling vs smiles.

### The next step I will implement is correcting spelling errors and seeing if this improves the model.

In [31]:
import autocorrect

In [32]:
autocorrect.spell("thot")

'that'

#### Unfortantely autocorrect has trouble correct the spelling of many words. For example, it does not correct belive to believe which is a common mistake in the student answers. Since the only other packages I saw 'Jamspell' and 'pyenchant' require a unix like distribution I will attempt to build my own

In [49]:
np_q1_answers

['monicas mom askes her if she like movies and monica said yes monica liked the actors and dressing up and pretending to be someone else and she also liked rock',
 'she ask aren’t you a little old to be playing make believe',
 'she said she was holdiding a mop',
 'well monica says that shes is too old for makebelieve but her mom says that your never too old for make believe ',
 'why couldnt they just go to the store and buy some blueberries',
 'she saidhey mom you like movies right',
 'that she is to old for it',
 'when monica sees her mom singing and dancing to a west side story song she said arent you a little to old to be playing make believe',
 'monica said arent you a little old for make believe',
 'who was the one danceing',
 'monica says arent you a little to old to be playing make believe',
 'your to old to be purtending mam',
 'she asks if her moms not a little too old to make believe',
 'arent you to old to play pretend ',
 'you are to old for make bleave',
 '¨ aren´t you a l

In [36]:
from nltk.corpus.reader import wordnet as wn

In [54]:
wn.Synset?

In [38]:
"Peter Norvig's spelling corrector"
import re
from collections import Counter

def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('big.txt').read()))

def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

In [50]:
correction('belive')

'believe'

In [45]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [55]:
from pattern.en import spelling

ModuleNotFoundError: No module named 'pattern'

In [69]:
q1_sentences = []
for sentence in np_q1_answers:
    word_list = sentence.split()
    sent = ''
    for word in word_list:
        if word in WORDS.keys():
            sent += word + ' '
        else:
            sent += correction(word) + ' '
    sent = sent.strip()
    q1_sentences.append(sent)
            
            

In [71]:
np_q1_answers

['monicas mom askes her if she like movies and monica said yes monica liked the actors and dressing up and pretending to be someone else and she also liked rock',
 'she ask aren’t you a little old to be playing make believe',
 'she said she was holdiding a mop',
 'well monica says that shes is too old for makebelieve but her mom says that your never too old for make believe ',
 'why couldnt they just go to the store and buy some blueberries',
 'she saidhey mom you like movies right',
 'that she is to old for it',
 'when monica sees her mom singing and dancing to a west side story song she said arent you a little to old to be playing make believe',
 'monica said arent you a little old for make believe',
 'who was the one danceing',
 'monica says arent you a little to old to be playing make believe',
 'your to old to be purtending mam',
 'she asks if her moms not a little too old to make believe',
 'arent you to old to play pretend ',
 'you are to old for make bleave',
 '¨ aren´t you a l

In [70]:
q1_sentences

['monica mon asked her if she like moves and monica said yes monica liked the actors and dressing up and pretending to be someone else and she also liked rock',
 'she ask agent you a little old to be playing make believe',
 'she said she was holding a mop',
 'well monica says that she is too old for makebelieve but her mon says that your never too old for make believe',
 'why couldn they just go to the store and buy some blueberries',
 'she sidney mon you like moves right',
 'that she is to old for it',
 'when monica sees her mon singing and dancing to a west side story song she said agent you a little to old to be playing make believe',
 'monica said agent you a little old for make believe',
 'who was the one dancing',
 'monica says agent you a little to old to be playing make believe',
 'your to old to be pretending may',
 'she asks if her mobs not a little too old to make believe',
 'agent you to old to play pretend',
 'you are to old for make leave',
 'a agent you a little to old t

In [73]:
'belive' in WORDS.keys()

False

In [74]:
correction('belive')

'believe'