In [2]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.corpus import stopwords
import pickle

from tqdm import tqdm
import os

# Reading Data

In [3]:
project_data = pd.read_csv('train_data.csv', nrows=5000)
resource_data = pd.read_csv('resources.csv')

In [4]:
print("Number of data points in train data", project_data.shape)
print('-'*100)
print("The attributes of data :", project_data.columns.values)

Number of data points in train data (5000, 17)
----------------------------------------------------------------------------------------------------
The attributes of data : ['Unnamed: 0' 'id' 'teacher_id' 'teacher_prefix' 'school_state'
 'project_submitted_datetime' 'project_grade_category'
 'project_subject_categories' 'project_subject_subcategories'
 'project_title' 'project_essay_1' 'project_essay_2' 'project_essay_3'
 'project_essay_4' 'project_resource_summary'
 'teacher_number_of_previously_posted_projects' 'project_is_approved']


In [6]:
print("Number of data points in train data", resource_data.shape)
print('-'*100)
print(resource_data.columns.values)
resource_data.head(2)

Number of data points in train data (1541272, 4)
----------------------------------------------------------------------------------------------------
['id' 'description' 'quantity' 'price']


Unnamed: 0,id,description,quantity,price
0,p233245,LC652 - Lakeshore Double-Space Mobile Drying Rack,1,149.0
1,p069063,Bouncy Bands for Desks (Blue support pipes),3,14.95


## __Preprocessing Categorical Features: project_grade_category__

In [7]:
project_data['project_grade_category'].value_counts()

Grades PreK-2    2002
Grades 3-5       1729
Grades 6-8        785
Grades 9-12       484
Name: project_grade_category, dtype: int64

- we need to remove the spaces, replace the '-' with '_' and convert all the letters to small

In [8]:
# https://stackoverflow.com/questions/36383821/pandas-dataframe-apply-function-to-column-strings-based-on-other-column-value
project_data['project_grade_category'] = project_data['project_grade_category'].str.replace(' ','_')
project_data['project_grade_category'] = project_data['project_grade_category'].str.replace('-','_')
project_data['project_grade_category'] = project_data['project_grade_category'].str.lower()
project_data['project_grade_category'].value_counts()

grades_prek_2    2002
grades_3_5       1729
grades_6_8        785
grades_9_12       484
Name: project_grade_category, dtype: int64

## __Preprocessing Categorical Features: project_subject_categories__

In [9]:
project_data['project_subject_categories'].value_counts()

Literacy & Language                           1067
Math & Science                                 795
Literacy & Language, Math & Science            679
Health & Sports                                509
Music & The Arts                               233
Literacy & Language, Special Needs             207
Applied Learning                               164
Special Needs                                  162
Math & Science, Literacy & Language            101
Applied Learning, Literacy & Language           97
Math & Science, Special Needs                   80
Applied Learning, Special Needs                 80
Literacy & Language, Music & The Arts           79
Math & Science, Music & The Arts                76
History & Civics, Literacy & Language           65
History & Civics                                63
Health & Sports, Special Needs                  57
Warmth, Care & Hunger                           53
Math & Science, Applied Learning                52
Applied Learning, Math & Scienc

In [10]:
project_data['project_subject_categories'] = project_data['project_subject_categories'].str.replace(' The ','')
project_data['project_subject_categories'] = project_data['project_subject_categories'].str.replace(' ','')
project_data['project_subject_categories'] = project_data['project_subject_categories'].str.replace('&','_')
project_data['project_subject_categories'] = project_data['project_subject_categories'].str.replace(',','_')
project_data['project_subject_categories'] = project_data['project_subject_categories'].str.lower()
project_data['project_subject_categories'].value_counts()

literacy_language                       1067
math_science                             795
literacy_language_math_science           679
health_sports                            509
music_arts                               233
literacy_language_specialneeds           207
appliedlearning                          164
specialneeds                             162
math_science_literacy_language           101
appliedlearning_literacy_language         97
math_science_specialneeds                 80
appliedlearning_specialneeds              80
literacy_language_music_arts              79
math_science_music_arts                   76
history_civics_literacy_language          65
history_civics                            63
health_sports_specialneeds                57
warmth_care_hunger                        53
math_science_appliedlearning              52
appliedlearning_math_science              44
literacy_language_history_civics          40
health_sports_literacy_language           40
appliedlea

## __Preprocessing Categorical Features: teacher_prefix__

In [11]:
project_data['teacher_prefix'].value_counts()

Mrs.       2560
Ms.        1845
Mr.         495
Teacher     100
Name: teacher_prefix, dtype: int64

In [12]:
# check if we have any nan values are there
print(project_data['teacher_prefix'].isnull().values.any())
print("number of nan values",project_data['teacher_prefix'].isnull().values.sum())

False
number of nan values 0


In [13]:
project_data['teacher_prefix'] = project_data['teacher_prefix'].str.replace('.','')
project_data['teacher_prefix'] = project_data['teacher_prefix'].str.lower()
project_data['teacher_prefix'].value_counts()

mrs        2560
ms         1845
mr          495
teacher     100
Name: teacher_prefix, dtype: int64

## __Preprocessing Categorical Features: project_subject_subcategories__

In [14]:
project_data['project_subject_subcategories'].value_counts()

Literacy                                      449
Literacy, Mathematics                         368
Literature & Writing, Mathematics             293
Literacy, Literature & Writing                234
Mathematics                                   232
                                             ... 
Extracurricular, Literature & Writing           1
Community Service, Gym & Fitness                1
Community Service, Literature & Writing         1
Civics & Government, Extracurricular            1
Character Education, Warmth, Care & Hunger      1
Name: project_subject_subcategories, Length: 248, dtype: int64

In [15]:
project_data['project_subject_subcategories'] = project_data['project_subject_subcategories'].str.replace(' The ','')
project_data['project_subject_subcategories'] = project_data['project_subject_subcategories'].str.replace(' ','')
project_data['project_subject_subcategories'] = project_data['project_subject_subcategories'].str.replace('&','_')
project_data['project_subject_subcategories'] = project_data['project_subject_subcategories'].str.replace(',','_')
project_data['project_subject_subcategories'] = project_data['project_subject_subcategories'].str.lower()
project_data['project_subject_subcategories'].value_counts()

literacy                                 449
literacy_mathematics                     368
literature_writing_mathematics           293
literacy_literature_writing              234
mathematics                              232
                                        ... 
extracurricular_literature_writing         1
communityservice_gym_fitness               1
communityservice_literature_writing        1
civics_government_extracurricular          1
charactereducation_warmth_care_hunger      1
Name: project_subject_subcategories, Length: 248, dtype: int64

## __Preprocessing Categorical Features: school_state__

In [16]:
project_data['school_state'].value_counts()

CA    707
TX    352
NY    342
FL    261
NC    246
SC    191
IL    184
GA    164
PA    151
MI    151
OH    122
OK    120
MO    117
MA    115
LA    114
IN    113
NJ     99
AZ     99
WA     97
VA     90
TN     85
AL     84
CT     78
UT     76
WI     72
MD     69
OR     64
KY     59
CO     58
NV     53
AR     52
MN     48
MS     46
KS     30
ID     29
IA     29
HI     28
DC     23
NM     22
WV     21
ME     19
NE     19
AK     17
SD     16
RI     16
NH     13
ND     11
DE     11
WY      9
MT      6
VT      2
Name: school_state, dtype: int64

In [17]:
project_data['school_state'] = project_data['school_state'].str.lower()
project_data['school_state'].value_counts()

ca    707
tx    352
ny    342
fl    261
nc    246
sc    191
il    184
ga    164
pa    151
mi    151
oh    122
ok    120
mo    117
ma    115
la    114
in    113
nj     99
az     99
wa     97
va     90
tn     85
al     84
ct     78
ut     76
wi     72
md     69
or     64
ky     59
co     58
nv     53
ar     52
mn     48
ms     46
ks     30
id     29
ia     29
hi     28
dc     23
nm     22
wv     21
me     19
ne     19
ak     17
sd     16
ri     16
nh     13
nd     11
de     11
wy      9
mt      6
vt      2
Name: school_state, dtype: int64

## __Preprocessing Categorical Features: project_title__

In [18]:
# https://stackoverflow.com/a/47091490/4084039
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [19]:
# https://gist.github.com/sebleier/554280
# we are removing the words from the stop words list: 'no', 'nor', 'not'
stopwords= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]

In [20]:
# Combining all the above stundents 
from tqdm import tqdm
def preprocess_text(text_data):
    preprocessed_text = []
    # tqdm is for printing the status bar
    for sentance in tqdm(text_data):
        sent = decontracted(sentance)
        sent = sent.replace('\\r', ' ')
        sent = sent.replace('\\n', ' ')
        sent = sent.replace('\\"', ' ')
        sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
        # https://gist.github.com/sebleier/554280
        sent = ' '.join(e for e in sent.split() if e.lower() not in stopwords)
        preprocessed_text.append(sent.lower().strip())
    return preprocessed_text

In [21]:
preprocessed_titles = preprocess_text(project_data['project_title'].values)

100%|██████████| 5000/5000 [00:00<00:00, 42038.72it/s]


In [22]:
print("printing some random reviews")
print(9, preprocessed_titles[9])
print(34, preprocessed_titles[34])
print(147, preprocessed_titles[147])

printing some random reviews
9 love reading pure pleasure
34 ball
147 needs chromebook


## __Preprocessing Categorical Features: essay__

In [23]:
# merge two column text dataframe: 
project_data["essay"] = project_data["project_essay_1"].map(str) +\
                        project_data["project_essay_2"].map(str) + \
                        project_data["project_essay_3"].map(str) + \
                        project_data["project_essay_4"].map(str)

In [24]:
print("printing some random essay")
print(9, project_data['essay'].values[9])
print('-'*50)
print(34, project_data['essay'].values[34])
print('-'*50)
print(147, project_data['essay'].values[147])

printing some random essay
9 Over 95% of my students are on free or reduced lunch.  I have a few who are homeless, but despite that, they come to school with an eagerness to learn.  My students are inquisitive eager learners who  embrace the challenge of not having great books and other resources  every day.  Many of them are not afforded the opportunity to engage with these big colorful pages of a book on a regular basis at home and they don't travel to the public library.  \r\nIt is my duty as a teacher to do all I can to provide each student an opportunity to succeed in every aspect of life. \r\nReading is Fundamental! My students will read these books over and over again while boosting their comprehension skills. These books will be used for read alouds, partner reading and for Independent reading. \r\nThey will engage in reading to build their \"Love for Reading\" by reading for pure enjoyment. They will be introduced to some new authors as well as some old favorites. I want my st

In [25]:
preprocessed_essays = preprocess_text(project_data['essay'].values)

100%|██████████| 5000/5000 [00:02<00:00, 1995.76it/s]


In [26]:
print("printing some random essay")
print(9, preprocessed_essays[9])
print('-'*50)
print(34, preprocessed_essays[34])
print('-'*50)
print(147, preprocessed_essays[147])

printing some random essay
9 95 students free reduced lunch homeless despite come school eagerness learn students inquisitive eager learners embrace challenge not great books resources every day many not afforded opportunity engage big colorful pages book regular basis home not travel public library duty teacher provide student opportunity succeed every aspect life reading fundamental students read books boosting comprehension skills books used read alouds partner reading independent reading engage reading build love reading reading pure enjoyment introduced new authors well old favorites want students ready 21st century know pleasure holding good hard back book hand nothing like good book read students soar reading consideration generous funding contribution help build stamina prepare 3rd grade thank much reading proposal nannan
--------------------------------------------------
34 students mainly come extremely low income families majority come homes parents work full time students s

## __Preprocessing Numerical Values: price__

In [27]:
# https://stackoverflow.com/questions/22407798/how-to-reset-a-dataframes-indexes-for-all-groups-in-one-step
price_data = resource_data.groupby('id').agg({'price':'sum', 'quantity':'sum'}).reset_index()
price_data.head(2)

Unnamed: 0,id,price,quantity
0,p000001,459.56,7
1,p000002,515.89,21


In [28]:
# join two dataframes in python: 
project_data = pd.merge(project_data, price_data, on='id', how='left')

## Applying StandardScaler

In [29]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(project_data['price'].values.reshape(-1, 1))
project_data['std_price']=scaler.transform(project_data['price'].values.reshape(-1, 1) )

## Applying MinMaxScaler

In [30]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(project_data['price'].values.reshape(-1, 1))
project_data['nrm_price']=scaler.transform(project_data['price'].values.reshape(-1, 1))