In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm_notebook
from sklearn.preprocessing import Normalizer
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
import pickle
from scipy import sparse
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from prettytable import PrettyTable

In [2]:
# sampling 100000 data points
project_data = pd.read_csv('train_data.csv').sample(n=25000)
resource_data = pd.read_csv('resources.csv')

price_data = resource_data.groupby('id').agg({'price':'sum', 'quantity':'sum'}).reset_index()
project_data = pd.merge(project_data, price_data, on='id', how='left')

In [3]:
y = project_data['project_is_approved'].values
X = project_data.drop(['project_is_approved'], axis=1)

In [4]:
# train test cv split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(20000, 18) (20000,)
(5000, 18) (5000,)


In [5]:
def decontracted(phrase):
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [6]:
stopwords= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]

In [7]:
def preprocess_text(text):
    preprocessed_text = []
    for sent in text:
        sent = decontracted(sent)
        sent = sent.replace('\\r', ' ')
        sent = sent.replace('\\n', ' ')
        sent = sent.replace('\\"', ' ')
        sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
        sent = ' '.join(e for e in sent.split() if e.lower() not in stopwords)
        preprocessed_text.append(sent.lower().strip())
    return preprocessed_text

In [8]:
feature_list_bow = []
feature_list_tfidf = []

In [9]:
def get_response_table(X_train, y_train, X_test, col):
    lst = []
    categories = X_train[col].unique()
    for i in categories:
        tot = X_train[(X_train[col]==i)][col].count()
        cat_0 = (X_train[(X_train[col]==i) & (y_train==0)][col].count())/tot
        cat_1 = (X_train[(X_train[col]==i) & (y_train==1)][col].count())/tot
        lst.append([i, cat_0, cat_1])
    response_table = pd.DataFrame(lst, columns = ['categories', 'cat_0', 'cat_1'])
    response_table.set_index('categories', inplace=True)
    
    df_train = pd.DataFrame()
    df_train['cat_0'] = list(map(lambda cat: response_table.loc[cat]['cat_0'], X_train[col].values))
    df_train['cat_1'] = list(map(lambda cat: response_table.loc[cat]['cat_1'], X_train[col].values))
    
    df_test = pd.DataFrame()
    df_test['cat_0'] = list(map(lambda cat: response_table.loc[cat]['cat_0'] if(cat in categories) else 0.5 , X_test[col].values))
    df_test['cat_1'] = list(map(lambda cat: response_table.loc[cat]['cat_1'] if(cat in categories) else 0.5 , X_test[col].values))
    
    return response_table, df_train, df_test

# 1. teacher_prefix

In [10]:
def preprocess_teacher_prefix(df):
    df['teacher_prefix'] = df['teacher_prefix'].fillna('Mrs.')
    df['teacher_prefix'] = df['teacher_prefix'].str.replace('.', '')
    df['teacher_prefix'] = df['teacher_prefix'].str.lower()

preprocess_teacher_prefix(X_train)
preprocess_teacher_prefix(X_test)

vectorizer = CountVectorizer()
vectorizer.fit(X_train['teacher_prefix'].values)

X_train_teacher_ohe = vectorizer.transform(X_train['teacher_prefix'].values)
X_test_teacher_ohe = vectorizer.transform(X_test['teacher_prefix'].values)

feature_list_bow.extend(vectorizer.get_feature_names())
feature_list_tfidf.extend(vectorizer.get_feature_names())

print(X_train_teacher_ohe.shape, y_train.shape)
print(X_test_teacher_ohe.shape, y_test.shape)

(20000, 5) (20000,)
(5000, 5) (5000,)


In [11]:
teacher_prefix_rt, teacher_prefix_train_df, teacher_prefix_test_df = get_response_table(X_train, y_train, X_test, 'teacher_prefix')

# 2. school_state

In [12]:
def preprocess_school_state(df):
    df['school_state'] = df['school_state'].str.lower()

preprocess_school_state(X_train)
preprocess_school_state(X_test)

vectorizer = CountVectorizer(binary=True)
vectorizer.fit(X_train['school_state'].values)

X_train_state_ohe = vectorizer.transform(X_train['school_state'].values)
X_test_state_ohe = vectorizer.transform(X_test['school_state'].values)

feature_list_bow.extend(vectorizer.get_feature_names())
feature_list_tfidf.extend(vectorizer.get_feature_names())

print(X_train_state_ohe.shape, y_train.shape)
print(X_test_state_ohe.shape, y_test.shape)

(20000, 51) (20000,)
(5000, 51) (5000,)


In [13]:
school_state_rt, school_state_train_df, school_state_test_df = get_response_table(X_train, y_train, X_test,'school_state')

# 3. project_grade_category

In [14]:
def preprocess_project_grade_category(df):
    df['project_grade_category'] = df['project_grade_category'].str.replace(' ', '_')
    df['project_grade_category'] = df['project_grade_category'].str.replace('-', '_')
    df['project_grade_category'] = df['project_grade_category'].str.lower()

preprocess_project_grade_category(X_train)
preprocess_project_grade_category(X_test)

vectorizer = CountVectorizer()
vectorizer.fit(X_train['project_grade_category'].values)

X_train_grade_ohe = vectorizer.transform(X_train['project_grade_category'].values)
X_test_grade_ohe = vectorizer.transform(X_test['project_grade_category'].values)

feature_list_bow.extend(vectorizer.get_feature_names())
feature_list_tfidf.extend(vectorizer.get_feature_names())

print(X_train_grade_ohe.shape, y_train.shape)
print(X_test_grade_ohe.shape, y_test.shape)

(20000, 4) (20000,)
(5000, 4) (5000,)


In [15]:
project_grade_category_rt, project_grade_category_train_df, project_grade_category_test_df = get_response_table(X_train, y_train, X_test,'project_grade_category')

# 4. project_subject_categories

In [16]:
def preprocess_project_subject_categories(df):
    df['project_subject_categories'] = project_data['project_subject_categories'].str.replace(' The', '')
    df['project_subject_categories'] = project_data['project_subject_categories'].str.replace(' ', '')
    df['project_subject_categories'] = project_data['project_subject_categories'].str.replace('&', '_')
    df['project_subject_categories'] = project_data['project_subject_categories'].str.replace(',', '_')
    df['project_subject_categories'] = project_data['project_subject_categories'].str.lower()

preprocess_project_subject_categories(X_train)
preprocess_project_subject_categories(X_test)

vectorizer = CountVectorizer()
vectorizer.fit(X_train['project_subject_categories'].values)

X_train_subject_ohe = vectorizer.transform(X_train['project_subject_categories'].values)
X_test_subject_ohe = vectorizer.transform(X_test['project_subject_categories'].values)

feature_list_bow.extend(vectorizer.get_feature_names())
feature_list_tfidf.extend(vectorizer.get_feature_names())

print(X_train_subject_ohe.shape, y_train.shape)
print(X_test_subject_ohe.shape, y_test.shape)

(20000, 18) (20000,)
(5000, 18) (5000,)


In [17]:
project_subject_categories_rt, project_subject_categories_train_df, project_subject_categories_test_df = get_response_table(X_train, y_train, X_test,'project_subject_categories')

# 5. project_subject_subcategories

In [18]:
def preprocess_project_subject_subcategories(df):
    df['project_subject_subcategories'] = df['project_subject_subcategories'].str.replace(' The', '')
    df['project_subject_subcategories'] = df['project_subject_subcategories'].str.replace(' ','')
    df['project_subject_subcategories'] = df['project_subject_subcategories'].str.replace('&', '_')
    df['project_subject_subcategories'] = df['project_subject_subcategories'].str.replace(',', '_')
    df['project_subject_subcategories'] = df['project_subject_subcategories'].str.lower()

preprocess_project_subject_subcategories(X_train)
preprocess_project_subject_subcategories(X_test)

vectorizer = CountVectorizer()
vectorizer.fit(X_train['project_subject_subcategories'].values)

X_train_subject_subcategories_ohe = vectorizer.transform(X_train['project_subject_subcategories'].values)
X_test_subject_subcategories_ohe = vectorizer.transform(X_test['project_subject_subcategories'].values)

feature_list_bow.extend(vectorizer.get_feature_names())
feature_list_tfidf.extend(vectorizer.get_feature_names())

print(X_train_subject_subcategories_ohe.shape, y_train.shape)
print(X_train_subject_subcategories_ohe.shape, y_test.shape)

(20000, 328) (20000,)
(20000, 328) (5000,)


In [19]:
project_subject_subcategories_rt, project_subject_subcategories_train_df, project_subject_subcategories_test_df = get_response_table(X_train, y_train, X_test,'project_subject_subcategories')

# 6. essay

In [20]:
def preprocess_essays(df):
    df['essay'] = df['project_essay_1'].map(str) +\
                  df['project_essay_2'].map(str) +\
                  df['project_essay_3'].map(str) +\
                  df['project_essay_4'].map(str)

    df['essay'] = preprocess_text(df['essay'].values)
    
preprocess_essays(X_train)
preprocess_essays(X_test)

bow_model = CountVectorizer(min_df=10, max_features=10000)
bow_model.fit(X_train['essay'].values)

X_train_essay_bow = bow_model.transform(X_train['essay'].values)
X_test_essay_bow = bow_model.transform(X_test['essay'].values)

feature_list_bow.extend(bow_model.get_feature_names())

print(X_train_essay_bow.shape, y_train.shape)
print(X_test_essay_bow.shape, y_test.shape)

(20000, 8369) (20000,)
(5000, 8369) (5000,)


In [21]:
tfidf_model = TfidfVectorizer(min_df=10, max_features=5000)
tfidf_model.fit(X_train['essay'].values)

X_train_essay_tfidf = tfidf_model.transform(X_train['essay'].values)
X_test_essay_tfidf = tfidf_model.transform(X_test['essay'].values)

print(X_train_essay_tfidf.shape, y_train.shape)
print(X_test_essay_tfidf.shape, y_test.shape)

(20000, 5000) (20000,)
(5000, 5000) (5000,)


In [22]:
with open('glove_vectors', 'rb') as f:
    model = pickle.load(f)
    glove_words = set(model.keys())

tfidf_model = TfidfVectorizer()
tfidf_model.fit(X_train['essay'])
dictionary = dict(zip(tfidf_model.get_feature_names(), list(tfidf_model.idf_)))
tfidf_words = set(tfidf_model.get_feature_names())

In [23]:
tfidf_w2v_vectors_train = []
for sentence in tqdm_notebook(X_train['essay'].values):
    vector = np.zeros(300)
    tf_idf_weight = 0
    for word in sentence.split():
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word]
            tf_idf = dictionary[word]*(sentence.count(word)/len(sentence.split()))
            vector += (vec * tf_idf)
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    tfidf_w2v_vectors_train.append(vector)

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




In [24]:
tfidf_w2v_vectors_test = []
for sentence in tqdm_notebook(X_test['essay'].values):
    vector = np.zeros(300)
    tf_idf_weight = 0
    for word in sentence.split():
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word]
            tf_idf = dictionary[word]*(sentence.count(word)/len(sentence.split()))
            vector += (vec * tf_idf)
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    tfidf_w2v_vectors_test.append(vector)

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




In [25]:
X_train_tfidf_w2v = pd.DataFrame(tfidf_w2v_vectors_train)
X_test_tfidf_w2v = pd.DataFrame(tfidf_w2v_vectors_test)

print(X_train_tfidf_w2v.shape)
print(X_test_tfidf_w2v.shape)

(20000, 300)
(5000, 300)


# 7. price

In [26]:
normalizer = Normalizer()
normalizer.fit(X_train['price'].values.reshape(1,-1))

X_train_price_temp = normalizer.transform(X_train['price'].values.reshape(1,-1))
X_test_price_temp = normalizer.transform(X_test['price'].values.reshape(1,-1))

X_train_price_normalized = X_train_price_temp.reshape(-1,1)
X_test_price_normalized = X_test_price_temp.reshape(-1,1)

feature_list_bow.append('Price')
feature_list_tfidf.append('Price')

print(X_train_price_normalized.shape, y_train.shape)
print(X_test_price_normalized.shape, y_train.shape)

(20000, 1) (20000,)
(5000, 1) (20000,)


# 8. teacher_number_of_previously_posted_projects

In [27]:
normalizer = Normalizer()
normalizer.fit(X_train['teacher_number_of_previously_posted_projects'].values.reshape(1,-1))

X_train_price_temp = normalizer.transform(X_train['teacher_number_of_previously_posted_projects'].values.reshape(1,-1))
X_test_price_temp = normalizer.transform(X_test['teacher_number_of_previously_posted_projects'].values.reshape(1,-1))

X_train_prev_proj_norm = X_train_price_temp.reshape(-1,1)
X_test_prev_proj_norm = X_test_price_temp.reshape(-1,1)

feature_list_bow.append('teacher_number_of_previously_posted_projects')
feature_list_tfidf.append('teacher_number_of_previously_posted_projects')

print(X_train_prev_proj_norm.shape, y_train.shape)
print(X_test_prev_proj_norm.shape, y_train.shape)

(20000, 1) (20000,)
(5000, 1) (20000,)


# 9. sentiment_score

In [28]:
def sentiment_analyzer(col):
    analyzer = SentimentIntensityAnalyzer()
    sentiment = analyzer.polarity_scores(col)
    return list(sentiment.values())

X_train_sentiment_temp = list(X_train['essay'].apply(sentiment_analyzer))
X_test_sentiment_temp = list(X_test['essay'].apply(sentiment_analyzer))

X_train_sentiment = pd.DataFrame(X_train_sentiment_temp, columns=['neg', 'neu', 'pos', 'compound'])
X_test_sentiment = pd.DataFrame(X_test_sentiment_temp, columns=['neg', 'neu', 'pos', 'compound'])

print(X_train_sentiment.shape)
print(X_test_sentiment.shape)

(20000, 4)
(5000, 4)


In [29]:
# BOW without sentiment score
X_train_bow = hstack((X_train_teacher_ohe, \
                      X_train_state_ohe, \
                      X_train_grade_ohe, \
                      X_train_subject_ohe, \
                      X_train_subject_subcategories_ohe, \
                      X_train_essay_bow, \
                      X_train_price_normalized, \
                      X_train_prev_proj_norm)).tocsr()

X_test_bow = hstack((X_test_teacher_ohe, \
                     X_test_state_ohe, \
                     X_test_grade_ohe, \
                     X_test_subject_ohe, \
                     X_test_subject_subcategories_ohe, \
                     X_test_essay_bow, \
                     X_test_price_normalized, \
                     X_test_prev_proj_norm)).tocsr()

print(X_train_bow.shape, y_train.shape)
print(X_test_bow.shape, y_test.shape)

(20000, 8777) (20000,)
(5000, 8777) (5000,)


In [30]:
sparse.save_npz('X_train_bow', X_train_bow)
sparse.save_npz('X_test_bow', X_test_bow)

In [31]:
# TFIDF without sentiment score
X_train_tfidf = hstack((X_train_teacher_ohe, \
                        X_train_state_ohe, \
                        X_train_grade_ohe, \
                        X_train_subject_ohe, \
                        X_train_subject_subcategories_ohe, \
                        X_train_essay_tfidf, \
                        X_train_price_normalized, \
                        X_train_prev_proj_norm)).tocsr()

X_test_tfidf = hstack((X_test_teacher_ohe, \
                        X_test_state_ohe, \
                        X_test_grade_ohe, \
                        X_test_subject_ohe, \
                        X_test_subject_subcategories_ohe, \
                        X_test_essay_tfidf, \
                        X_test_price_normalized, \
                        X_test_prev_proj_norm)).tocsr()

print(X_train_tfidf.shape, y_train.shape)
print(X_test_tfidf.shape, y_test.shape)

(20000, 5408) (20000,)
(5000, 5408) (5000,)


In [32]:
sparse.save_npz('X_train_tfidf', X_train_tfidf)
sparse.save_npz('X_test_tfidf', X_test_tfidf)

In [33]:
# TFIDF with sentiment score
X_train_tfidf_sc = hstack((X_train_teacher_ohe, \
                        X_train_state_ohe, \
                        X_train_grade_ohe, \
                        X_train_subject_ohe, \
                        X_train_subject_subcategories_ohe, \
                        X_train_essay_tfidf, \
                        X_train_price_normalized, \
                        X_train_prev_proj_norm, \
                        X_train_sentiment)).tocsr()

X_test_tfidf_sc = hstack((X_test_teacher_ohe, \
                        X_test_state_ohe, \
                        X_test_grade_ohe, \
                        X_test_subject_ohe, \
                        X_test_subject_subcategories_ohe, \
                        X_test_essay_tfidf, \
                        X_test_price_normalized, \
                        X_test_prev_proj_norm, \
                        X_test_sentiment)).tocsr()

print(X_train_tfidf_sc.shape, y_train.shape)
print(X_test_tfidf_sc.shape, y_test.shape)

(20000, 5412) (20000,)
(5000, 5412) (5000,)


In [34]:
sparse.save_npz('X_train_tfidf_sc', X_train_tfidf_sc)
sparse.save_npz('X_test_tfidf_sc', X_test_tfidf_sc)

In [35]:
# TFIDF W2V with sentiment score
X_train_tfidf_w2v_sc = hstack((X_train_teacher_ohe, \
                        X_train_state_ohe, \
                        X_train_grade_ohe, \
                        X_train_subject_ohe, \
                        X_train_subject_subcategories_ohe, \
                        X_train_tfidf_w2v, \
                        X_train_price_normalized, \
                        X_train_prev_proj_norm, \
                        X_train_sentiment)).tocsr()

X_test_tfidf_w2v_sc = hstack((X_test_teacher_ohe, \
                        X_test_state_ohe, \
                        X_test_grade_ohe, \
                        X_test_subject_ohe, \
                        X_test_subject_subcategories_ohe, \
                        X_test_tfidf_w2v, \
                        X_test_price_normalized, \
                        X_test_prev_proj_norm, \
                        X_test_sentiment)).tocsr()

print(X_train_tfidf_w2v_sc.shape, y_train.shape)
print(X_test_tfidf_w2v_sc.shape, y_test.shape)

(20000, 712) (20000,)
(5000, 712) (5000,)


In [36]:
sparse.save_npz('X_train_tfidf_w2v_sc', X_train_tfidf_sc)
sparse.save_npz('X_test_tfidf_w2v_sc', X_test_tfidf_sc)

In [37]:
# TFIDF Response coded
X_train_tfidf_rc = hstack((teacher_prefix_train_df, \
                        school_state_train_df, \
                        project_grade_category_train_df, \
                        project_subject_categories_train_df, \
                        project_subject_subcategories_train_df, \
                        X_train_essay_tfidf, \
                        X_train_price_normalized, \
                        X_train_prev_proj_norm, \
                        X_train_sentiment)).tocsr()

X_test_tfidf_rc = hstack((teacher_prefix_test_df, \
                        school_state_test_df, \
                        project_grade_category_test_df, \
                        project_subject_categories_test_df, \
                        project_subject_subcategories_test_df, \
                        X_test_essay_tfidf, \
                        X_test_price_normalized, \
                        X_test_prev_proj_norm, \
                        X_test_sentiment)).tocsr()

print(X_train_tfidf_rc.shape, y_train.shape)
print(X_test_tfidf_rc.shape, y_test.shape)

(20000, 5016) (20000,)
(5000, 5016) (5000,)


In [38]:
sparse.save_npz('X_train_tfidf_rc', X_train_tfidf_rc)
sparse.save_npz('X_test_tfidf_rc', X_test_tfidf_rc)

In [39]:
# TFIDF W2V Response coded
X_train_tfidf_w2v_rc = hstack((teacher_prefix_train_df, \
                            school_state_train_df, \
                            project_grade_category_train_df, \
                            project_subject_categories_train_df, \
                            project_subject_subcategories_train_df, \
                            X_train_tfidf_w2v, \
                            X_train_price_normalized, \
                            X_train_prev_proj_norm, \
                            X_train_sentiment)).tocsr()

X_test_tfidf_w2v_rc = hstack((teacher_prefix_test_df, \
                            school_state_test_df, \
                            project_grade_category_test_df, \
                            project_subject_categories_test_df, \
                            project_subject_subcategories_test_df, \
                            X_test_tfidf_w2v, \
                            X_test_price_normalized, \
                            X_test_prev_proj_norm, \
                            X_test_sentiment)).tocsr()

print(X_train_tfidf_w2v_rc.shape, y_train.shape)
print(X_test_tfidf_w2v_rc.shape, y_test.shape)

(20000, 316) (20000,)
(5000, 316) (5000,)


In [40]:
sparse.save_npz('X_train_tfidf_w2v_rc', X_train_tfidf_w2v_rc)
sparse.save_npz('X_test_tfidf_w2v_rc', X_test_tfidf_w2v_rc)

In [41]:
np.save('y_train', y_train)
np.save('y_test', y_test)