## Importing the packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from copy import deepcopy
import re

In [2]:
import warnings
warnings.simplefilter('ignore')

## Loading the dataset 

In [3]:
essays=pd.read_excel('training_set_rel3.xlsx',usecols=['essay_set','essay','rater1_domain1','rater2_domain1','domain1_score'])

we take only the relevant columns

In [4]:
essays.head()

Unnamed: 0,essay_set,essay,rater1_domain1,rater2_domain1,domain1_score
0,1,"Dear local newspaper, I think effects computer...",4.0,4.0,8.0
1,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5.0,4.0,9.0
2,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.0,3.0,7.0
3,1,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,5.0,10.0
4,1,"Dear @LOCATION1, I know having computers has a...",4.0,4.0,8.0


In [5]:
essays.shape

(12978, 5)

In [6]:
essays['essay'][0]

"Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble! Thing about! Dont you think so? How would you feel if your teenager is always on the phone with friends! Do you ever time to chat with your friends or buisness partner about things. Well now - there's a new way to chat the computer, theirs plenty of sites on the internet to do so: @ORGANIZATION1, @ORGANIZATION2, @CAPS1, facebook, myspace ect. Just think now while your setting up meeting with your boss on the computer, your teenager is having fun on the phone not rushing to get off cause you want to use it. How did you learn about other countrys/states outside of yours? Well I have by computer/internet, it's a new way to learn about what going on in our time! You might think your child spends a lot of time on the computer, but ask them so question about the econom

In [7]:
essays.isnull().sum()

essay_set         0
essay             0
rater1_domain1    1
rater2_domain1    1
domain1_score     1
dtype: int64

In [8]:
## worst case if we consider all three nulls to be present in diff rows, still the null values comprise of very little fraction

print((3/len(essays))*100)

0.02311604253351826


In [9]:
## 0.02 percent, so it will be safe to drop na
print("length before ",len(essays))
essays.dropna(inplace=True)
print("length after ",len(essays))

length before  12978
length after  12977


In [10]:
### there was just a single row
##Data lost in the process 
print((1/len(essays))*100)

0.00770594128072744


In [11]:
### Duplicates
print("length before ",len(essays))
essays.drop_duplicates(inplace=True)
print("length after ",len(essays))


length before  12977
length after  12976


there was just one duplicate value

### Now we try to clean the essay column(The meat of our dataset)

Instead of making changes in the original df, make a copy of it and clean it.

Deep copy keeps no reference to the original one, meaning, changes in the copy are not refelcted in the original one.

In [12]:
essays2 = deepcopy(essays)

In [13]:
essays2.head()

Unnamed: 0,essay_set,essay,rater1_domain1,rater2_domain1,domain1_score
0,1,"Dear local newspaper, I think effects computer...",4.0,4.0,8.0
1,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5.0,4.0,9.0
2,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.0,3.0,7.0
3,1,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,5.0,10.0
4,1,"Dear @LOCATION1, I know having computers has a...",4.0,4.0,8.0


1. grammatical errors

2. word count

3. no. of sentences

4. no. of paragraphs

6. length of sentences

7. length of paragraphs

8. special symbols

9. spelling mistakes

10. no. of nouns

11. adjectives

12. quotations


**Removing special symbols.**

Tried using regex, wasn't working :(

In [19]:
spec_char = ['@', '!', '#', '$', '%', '&', '*', '^', '?', '~', ',', '.' ]
# add more symbols to the list if you'd like to
for symbol in spec_char:
    essays2['essay'] = essays2['essay'].str.replace(symbol, '')
    
essays2['essay'] = essays2['essay'].str.split('  ').str.join(' ')

#removing special symbols -> not working
#essays2['essay'] = essays2['essay'].apply(lambda x:re.sub('@!#$%&^?,.+', '',x))
#removing digits
essays2['essay'] = essays2['essay'].apply(lambda x:re.sub('[\d]', '', x))

In [20]:
essays2['essay'][0]

"dear local newspaper i think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people helps us learn about the globe(astronomy) and keeps us out of troble thing about dont you think so how would you feel if your teenager is always on the phone with friends do you ever time to chat with your friends or buisness partner about things well now - there's a new way to chat the computer theirs plenty of sites on the internet to do so: organization organization caps facebook myspace ect just think now while your setting up meeting with your boss on the computer your teenager is having fun on the phone not rushing to get off cause you want to use it how did you learn about other countrys/states outside of yours well i have by computer/internet it's a new way to learn about what going on in our time you might think your child spends a lot of time on the computer but ask them so question about the economy sea floor spreading or 

**Converting to lowercase.**

In [21]:
essays2['essay'] = essays2['essay'].str.lower()

In [22]:
essays2['essay'][0]

"dear local newspaper i think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people helps us learn about the globe(astronomy) and keeps us out of troble thing about dont you think so how would you feel if your teenager is always on the phone with friends do you ever time to chat with your friends or buisness partner about things well now - there's a new way to chat the computer theirs plenty of sites on the internet to do so: organization organization caps facebook myspace ect just think now while your setting up meeting with your boss on the computer your teenager is having fun on the phone not rushing to get off cause you want to use it how did you learn about other countrys/states outside of yours well i have by computer/internet it's a new way to learn about what going on in our time you might think your child spends a lot of time on the computer but ask them so question about the economy sea floor spreading or 

In [23]:
essays2['essay'][16]

"dear local newspaper i belive that computers have a negative effect on peoples lives i belive this because who spend to much time on the computer don't get out as much as they should don't spend enough time with their family and the computer can't do everything my first reason is i belive that people need to get out more when they don't get out they don't exersise and that is very unhealthy instead of watching the games or the scores they should get out and play the game i also belive that they should enjoy nature because i feel like they are wasting the beauty of nature all around them we wouldn't want to waste our abilities and privalges would we another reason is that they do not spend enough time with family if you have family near you then you should take advantage of that and interact with one another you can have fun with your family by playing games you can also have fun by just hanging out which boost your social skills and the computer can't always do that which brings me to