In [7]:
import pandas as pd
import numpy as np
import zipfile
from tqdm import tqdm
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.svm import SVC 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import ShuffleSplit
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.stats import entropy
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import ward, dendrogram
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import fcluster
from scipy.sparse import csr_matrix

RANDOM_SEED = 42

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
!pip install -q kaggle
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"chrisdallago","key":"47505734214414419614bf9292bbbfbf"}'}

In [3]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [4]:
!kaggle competitions download -c umich-siads-695-predicting-text-difficulty

Downloading additional_resource_file_readme.txt to /content
  0% 0.00/2.83k [00:00<?, ?B/s]
100% 2.83k/2.83k [00:00<00:00, 2.42MB/s]
Downloading WikiLarge_Test.csv.zip to /content
  0% 0.00/4.73M [00:00<?, ?B/s]
100% 4.73M/4.73M [00:00<00:00, 43.5MB/s]
Downloading dale_chall.txt to /content
  0% 0.00/18.3k [00:00<?, ?B/s]
100% 18.3k/18.3k [00:00<00:00, 19.0MB/s]
Downloading Concreteness_ratings_Brysbaert_et_al_BRM.txt.zip to /content
  0% 0.00/410k [00:00<?, ?B/s]
100% 410k/410k [00:00<00:00, 128MB/s]
Downloading AoA_51715_words.csv.zip to /content
  0% 0.00/834k [00:00<?, ?B/s]
100% 834k/834k [00:00<00:00, 118MB/s]
Downloading WikiLarge_Train.csv.zip to /content
 84% 17.0M/20.3M [00:00<00:00, 61.6MB/s]
100% 20.3M/20.3M [00:00<00:00, 80.6MB/s]
Downloading sampleSubmission.csv.zip to /content
  0% 0.00/278k [00:00<?, ?B/s]
100% 278k/278k [00:00<00:00, 81.4MB/s]


In [5]:
!ls

additional_resource_file_readme.txt		  sample_data
AoA_51715_words.csv.zip				  sampleSubmission.csv.zip
Concreteness_ratings_Brysbaert_et_al_BRM.txt.zip  WikiLarge_Test.csv.zip
dale_chall.txt					  WikiLarge_Train.csv.zip
kaggle.json


In [8]:
zf = zipfile.ZipFile('/content/WikiLarge_Train.csv.zip') 
train_df = pd.read_csv(zf.open('WikiLarge_Train.csv'))

In [9]:
zf = zipfile.ZipFile('/content/WikiLarge_Test.csv.zip') 
test_df = pd.read_csv(zf.open('WikiLarge_Test.csv'))

In [11]:
tenK_train = train_df.sample(10000)
onehK_train = train_df.sample(100000)
twohK_train = train_df.sample(200000)

Vectorizer

In [12]:
# Vectorize the text data
# Defaults tested with min_df=100, later reduced to 25, and ngrams stayed at (1,2)
vectorizer = TfidfVectorizer(min_df = 100, stop_words ='english', ngram_range = (1, 2)) 
X_train    = vectorizer.fit_transform(tenK_train['original_text'])
vec_test   = vectorizer.transform(test_df['original_text']) 
# Get shuffle split params set up
shuff_spt = ShuffleSplit(test_size=0.5, train_size=0.5, n_splits=10)

Dummy CLF

In [13]:
# Train the random, uniform dummy clasifier
random_clf = DummyClassifier(strategy="uniform", random_state=RANDOM_SEED)
scores = cross_val_score(random_clf, train_df.original_text, train_df.label, cv=5)
scores.mean()

0.5006958307739798

In [14]:
#comparison with stratified, shuffle and loo cv
loo = LeaveOneOut()
shuff_spt = ShuffleSplit(test_size=0.5, train_size=0.5, n_splits=10)
scores = cross_val_score(random_clf, train_df.original_text, train_df.label, cv=shuff_spt)
scores

array([0.50065264, 0.49934256, 0.49934736, 0.50110373, 0.5004031 ,
       0.50109893, 0.49927538, 0.49910742, 0.50037431, 0.49978405])

In [8]:
random_clf.fit(train_df.original_text, train_df.label)
randclf_preds = random_clf.predict(test_df)

In [9]:
# Add id's to preds, write to CSV
randclf_pred_df = pd.DataFrame(data=randclf_preds).reset_index()
randclf_pred_df.columns = ['id', 'label']
randclf_pred_df.to_csv('randclf_preds.csv', index=False)

Random Forest CLF

In [19]:
# Train Random Forest Classifier with default parameters, with hyperparameter tuning, n_estimators=50 and max_depth=15 slightly increasing the accuracy on the 10000K sample
rf_clf = RandomForestClassifier(random_state=RANDOM_SEED) 

In [20]:
scores = cross_val_score(rf_clf, X_train, tenK_train.label, cv=shuff_spt) 
scores.mean() 

0.56834

In [12]:
# Add id's to preds, write to CSV
rf_clf.fit(X_train, tenK_train.label)
rf_preds = rf_clf.predict(vec_test)
rf_clf_pred_df = pd.DataFrame(data = rf_preds).reset_index()
rf_clf_pred_df.columns = ['id', 'label']
rf_clf_pred_df.to_csv('rf_preds.csv', index=False)

SVM CLF

In [47]:
# Train svm SVC classifier with default parameters, eventually set kernel='linear'
# Experimented with changing the C value to (0,0.01,0.1,0.5,1,2,5,10,50,100,500,1000) with the highest accuracy achieved at default C=1.0
svm_clf = svm.SVC(random_state=RANDOM_SEED) 

In [48]:
scores = cross_val_score(svm_clf, X_train, tenK_train.label, cv=shuff_spt) 
scores.mean() 

0.5656800000000001

In [15]:
# Add id's to preds, write to CSV
svm_clf.fit(X_train, tenK_train.label)
svm_preds = svm_clf.predict(vec_test)
svm_clf_pred_df = pd.DataFrame(data = svm_preds).reset_index()
svm_clf_pred_df.columns = ['id', 'label']
svm_clf_pred_df.to_csv('svm_preds.csv', index=False)

CPU times: user 43.7 s, sys: 157 ms, total: 43.9 s
Wall time: 43.7 s


Age Of Acquistion

In [49]:
# Separate dataset based on difficulty label
train_simple_df = tenK_train[tenK_train['label'] == 0].reset_index(drop=True)
train_difficult_df = tenK_train[tenK_train['label'] == 1].reset_index(drop=True)

In [None]:
zf = zipfile.ZipFile('/content/WikiLarge_Test.csv.zip') 
test_df = pd.read_csv(zf.open('WikiLarge_Test.csv'))

In [60]:
# Begin experimenting with AoA csv
zf = zipfile.ZipFile('/content/AoA_51715_words.csv.zip') 
aoa_list = pd.read_csv(zf.open('AoA_51715_words.csv'))#, encoding='unicode_escape')
aoa_update = aoa_list[['Word', 'Lemma_highest_PoS', 'AoA_Kup_lem']]

In [61]:
# Compare averages of statistical metrics  
def get_avg_stats(df):
    aoa_mean = []
    aoa_median = []
    aoa_min = []
    aoa_max = []
    
    for sent in df.original_text[:10000]:
        sent = sent.lower().split(' ')
        sent = list(set(sent))
        l = []
        nan = []
        for i in sent:
            i = lemma.lemmatize(i)
            if i in list(aoa_update.Lemma_highest_PoS):
                l.append(float(aoa_update[aoa_update.Word == i]['AoA_Kup_lem']))
            else:
                nan.append(np.nan)

        if len(l) >= 1:
            aoa_mean.append(np.mean(l))
            aoa_median.append(np.median(l))
            aoa_min.append(np.min(l))
            aoa_max.append(np.max(l))
            
    return (np.mean(aoa_mean), np.mean(aoa_median), np.mean(aoa_min), np.mean(aoa_max))

In [62]:
print('Simple: ', get_avg_stats(train_simple_df))
print('Difficult: ', get_avg_stats(train_difficult_df))

KeyboardInterrupt: ignored

In [63]:
# Create a new Pandas DataFrame to experiment with AoA stats
def get_stats_df(X, y, sample_size):
    
    split_df = pd.DataFrame(X)
    split_df['label'] = y
    sample_df = split_df[:sample_size].copy()

    aoa_mean = []
    aoa_median = []
    aoa_min = []
    aoa_max = []

    for sent in tqdm(sample_df.original_text):
        sent = sent.lower().split(' ')
        sent = list(set(sent))
        l = []
        nan = []
        for i in sent:
            i = lemma.lemmatize(i)
            if i in list(aoa_update.Lemma_highest_PoS):
                l.append(float(aoa_update[aoa_update.Word == i]['AoA_Kup_lem']))
            else:
                nan.append(np.nan)

        if len(l) >= 1:
            aoa_mean.append(np.mean(l))
            aoa_median.append(np.median(l))
            aoa_min.append(np.min(l))
            aoa_max.append(np.max(l))
            
        else:
            aoa_mean.append(np.nan)
            aoa_median.append(np.nan)
            aoa_min.append(np.nan)
            aoa_max.append(np.nan)
            
    sample_df['mean_aoa'] = aoa_mean
    sample_df['median_aoa'] = aoa_median
    sample_df['min_aoa'] = aoa_min
    sample_df['max_aoa'] = aoa_max
    
    return sample_df

In [64]:
# Need to drop NaN values for this portion or they with cause the classifier training to result in error
train_stats_df = get_stats_df(tenK_train.original_text, tenK_train.label, 10000)
train_stats_df = train_stats_df.dropna()

 27%|██▋       | 2660/10000 [06:05<16:47,  7.28it/s]


KeyboardInterrupt: ignored

In [None]:
# Compare the accuracy of using only the AoA stats without vectorized text data
train_stats_X = train_stats_df[['mean_aoa', 'median_aoa', 'min_aoa', 'max_aoa']]
train_stats_y = np.array(train_stats_df['label'])

In [None]:
# Cross val score actually higher when using AoA stats than with the vectorized text on 10000K sample
lg_stats_clf = LogisticRegression(random_state=RANDOM_SEED)
scores = cross_val_score(lg_stats_clf, train_stats_X, train_stats_y, cv=shuff_spt)
scores.mean()

Add in Vector Data


In [192]:
# Experiment with combining vectorized text data with AoA stats to further improve accuracy
train_vecs = []

for i in tqdm(range(len(train_stats_df))):
    train_vecs.append(np.array(pd.DataFrame.sparse.from_spmatrix(X_train).iloc[i].unique()))
train_stats_df['vectorized_text'] = train_vecs

train_vec_mean_pure = []
train_vec_mean_slice = []

for i in train_stats_df['vectorized_text']:
    train_vec_mean_pure.append(np.mean(i))
    if len(i) > 1:
        train_vec_mean_slice.append(np.mean(i[1:]))
    else:
        train_vec_mean_slice.append(np.mean(i))

# Create two separate vector average metrics, one including 0.0 one slicing it out
train_stats_df['vec_mean_pure'] = train_vec_mean_pure
train_stats_df['vec_mean_slice'] = train_vec_mean_slice

100%|██████████| 9779/9779 [00:30<00:00, 321.95it/s]


In [240]:
# Combining both mean vector calculation slightly increases the accuracy of the model
train_stats_X = train_stats_df[['mean_aoa', 'median_aoa', 'min_aoa', 'max_aoa','vec_mean_pure', 'vec_mean_slice']]
train_stats_y = np.array(train_stats_df['label'])

In [241]:
lg_stats_clf = LogisticRegression(random_state=RANDOM_SEED)
scores = cross_val_score(lg_stats_clf, train_stats_X, train_stats_y, cv=shuff_spt)
scores.mean()

0.6056646216768916

Unsupervised Learning

In [195]:
# Initiate functions for unsupervised clustering
def get_original_text_vectorized(df, top_n = -1, ngram_range = (1,2), max_features = 1000):
    df = df.dropna()   
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=max_features,
                                 min_df=2, stop_words='english',
                                 ngram_range = ngram_range,
                                 use_idf=True)
    if (top_n >= 0):
        text_instances = df.values[0:top_n]
    else:
        text_instances = df.values
    
    X = vectorizer.fit_transform(text_instances) 
    
    return (X, vectorizer, text_instances)

def print_cluster_features(vectorizer, centroids, n_clusters, top_n_features):
    terms = vectorizer.get_feature_names()
    for i in range(n_clusters):
        print("Cluster %d:" % i, end='')
        for ind in centroids[i, :top_n_features]:
            print(' [%s]' % terms[ind], end='')
        print()
        
def kmeans_review(n_clusters, centroids, terms):
    result = None
    result = []
    
    for i in range(n_clusters):
        term_list = list()
        for ind in centroids[i, :10]:
            term_list.append(terms[ind])
        result.append(term_list)
    
    return result

def compute_distinctive_term_score(T, T_a):   
    IG = entropy(T) - entropy(T_a) 
    if (T_a[0] < T_a[1]):
        score = -IG  
    else:
        score = IG
    return score

def one_vs_all_count_matrix(m, index):
    row0 = m[index, :]
    row1 = np.vstack((m[0:index, :], m[index+1:, :])).sum(axis=0)
              
    result = np.vstack((row0, row1))
    return result

In [196]:
# Compare scoring methods for ideal number of K clusters
def get_cluster_number(df):
    
    (X, vectorizer, text_instances) = get_original_text_vectorized(df)

    db_list = []
    ch_list = []

    for k in range(2,9):
        kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=RANDOM_SEED).fit(X)
        labels = kmeans.labels_
        db_list.append(metrics.davies_bouldin_score(X.toarray(), labels))
        ch_list.append(metrics.calinski_harabasz_score(X.toarray(), labels))

    return (('Davies Bouldin:', db_list.index(min(db_list))+2), ('Calinski Harabasz:', ch_list.index(max(ch_list))+2))

In [197]:
print('Simple:', get_cluster_number(train_simple_df['original_text']))
print('Difficult:', get_cluster_number(train_difficult_df['original_text']))

Simple: (('Davies Bouldin:', 8), ('Calinski Harabasz:', 2))
Difficult: (('Davies Bouldin:', 2), ('Calinski Harabasz:', 4))


Kmeans Clustering

In [198]:
# Initiate functions to get simple and difficult clusters
def get_simple_clusters():
    (X, vectorizer, text_instances) = get_original_text_vectorized(train_simple_df['original_text'])
    
    n_clusters = get_cluster_number(train_simple_df['original_text'])[1][1]
    top_n_features=10
    kmeans = KMeans(n_clusters, init='k-means++', max_iter=100, n_init=1, random_state=RANDOM_SEED).fit(X)
    labels = kmeans.labels_
    centroids = np.argsort(-kmeans.cluster_centers_)
    terms = vectorizer.get_feature_names()

    #print_cluster_features(vectorizer, centroids, n_clusters=8, top_n_features=10)
    #kmeans_review()
    cluster_df_simple = pd.DataFrame(data=kmeans_review(n_clusters, centroids, terms)).T
    cluster_df_simple.columns = ['Simple Cluster %d' % i for i in range(n_clusters)]
    return cluster_df_simple


def get_difficult_clusters():
    (X, vectorizer, text_instances) = get_original_text_vectorized(train_difficult_df['original_text'])
    
    n_clusters = get_cluster_number(train_difficult_df['original_text'])[1][1]
    top_n_features=10
    kmeans = KMeans(n_clusters, init='k-means++', max_iter=100, n_init=1, random_state=RANDOM_SEED).fit(X)
    labels = kmeans.labels_
    centroids = np.argsort(-kmeans.cluster_centers_)
    terms = vectorizer.get_feature_names()

    #print_cluster_features(vectorizer, centroids, n_clusters=8, top_n_features=10)
    #kmeans_review()
    cluster_df_difficult = pd.DataFrame(data=kmeans_review(n_clusters, centroids, terms)).T
    cluster_df_difficult.columns = ['Difficult Cluster %d' % i for i in range(n_clusters)]
    return cluster_df_difficult


In [199]:
cluster_df = get_simple_clusters().join(get_difficult_clusters())
cluster_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Simple Cluster 0,rrb,lrb,born,lrb born,player,football player,football,lrb rrb,american,december
Simple Cluster 1,city,used,commune,called,united,people,states,france,region,united states
Difficult Cluster 0,commune,france,department,calais,pas calais,pas,region,northern france,northern,aisne
Difficult Cluster 1,water,rrb,lrb,used,small,surface,inside,moon,power,simply
Difficult Cluster 2,rrb,lrb,born,lrb born,lrb rrb,known,american,football,german,ndash
Difficult Cluster 3,city,county,united,states,world,used,known,united states,new,time


Cluster Labeling

In [200]:
# Initiate function to get cluster labels using information gain as a means of dimentionality reduction as feature selection criteria to find terms that are distinct to that cluster
def cluster_labeling(n_clusters, L, terms):
    result = None
    
    scores_list = []

    for row in range(len(L)):
        c = []
        T_c = one_vs_all_count_matrix(L, row).sum(axis=1)
        T_c = (T_c / sum(T_c))
        for i in range(len(L[row])):
            T_w = one_vs_all_count_matrix(L, row).T
            T_w = (T_w[i] / sum(T_w[i]))
            c.append(compute_distinctive_term_score(T_c, T_w))
        scores_list.append(c)
    
    ind_list = list(np.argsort(-np.array(scores_list)))
    
    top_5 = []
    for i in ind_list:
        top_5.append(list(i[:5]))
        
    result = []
    for i in range(len(top_5)):
        term_list = list()
        for ind in top_5[i]:
            term_list.append(terms[ind])
        result.append(term_list)
        
    return result


def get_simple_cluster_labels():
    (X, vectorizer, text_instances) = get_original_text_vectorized(train_simple_df['original_text'])
    
    n_clusters = get_cluster_number(train_simple_df['original_text'])[1][1]
    kmeans = KMeans(n_clusters, init='k-means++', max_iter=100, n_init=1, random_state=RANDOM_SEED).fit(X)
    labels = kmeans.labels_
    L = kmeans.cluster_centers_
    terms = vectorizer.get_feature_names()
    
    cluster_labeling_simple_df = pd.DataFrame(data=cluster_labeling(n_clusters, L, terms)).T
    cluster_labeling_simple_df.columns = ['Simple Cluster %d' % i for i in range(n_clusters)]
    
    return cluster_labeling_simple_df

def get_difficult_cluster_labels():
    (X, vectorizer, text_instances) = get_original_text_vectorized(train_difficult_df['original_text'])
    
    n_clusters = get_cluster_number(train_difficult_df['original_text'])[1][1]
    kmeans = KMeans(n_clusters, init='k-means++', max_iter=100, n_init=1, random_state=RANDOM_SEED).fit(X)
    labels = kmeans.labels_
    L = kmeans.cluster_centers_
    terms = vectorizer.get_feature_names()
    
    cluster_labeling_difficult_df = pd.DataFrame(data=cluster_labeling(n_clusters, L, terms)).T
    cluster_labeling_difficult_df.columns = ['Difficult Cluster %d' % i for i in range(n_clusters)]
    
    return cluster_labeling_difficult_df

In [201]:
cluster_labeling_df = get_simple_cluster_labels().join(get_difficult_cluster_labels())
cluster_labeling_df

Unnamed: 0,Simple Cluster 0,Simple Cluster 1,Difficult Cluster 0,Difficult Cluster 1,Difficult Cluster 2,Difficult Cluster 3
0,rrb brazilian,performance,calais,water,rrb german,zone
1,rrb spanish,calais pas,pas,operated,rrb lrb,outside
2,lrb born,la loire,northern france,energy,rrb known,nintendo
3,lrb died,calvados department,calais region,does,rrb italian,iowa united
4,player plays,cancer,commune pas,inside,rrb english,iowa


In [202]:
# Attempt to reintroduce cluster label vectors into the supervised learning prediction task
train_cluster1 = []
train_cluster2 = []

vec_cluster1 = []
vec_cluster2 = []

for i in train_stats_df.label:
    if i == 0:
        train_cluster1.append(list(cluster_labeling_df['Simple Cluster 0']))
        vec_cluster1.append(np.mean(list(pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(cluster_labeling_df['Simple Cluster 0'])).iloc[i].unique())))
        train_cluster2.append(list(cluster_labeling_df['Simple Cluster 1']))
        vec_cluster2.append(np.mean(list(pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(cluster_labeling_df['Simple Cluster 1'])).iloc[i].unique())))

    else:
        train_cluster1.append(list(cluster_labeling_df['Difficult Cluster 0']))
        vec_cluster1.append(np.mean(list(pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(cluster_labeling_df['Difficult Cluster 0'])).iloc[i].unique())))
        train_cluster2.append(list(cluster_labeling_df['Difficult Cluster 1']))
        vec_cluster2.append(np.mean(list(pd.DataFrame.sparse.from_spmatrix(vectorizer.fit_transform(cluster_labeling_df['Difficult Cluster 1'])).iloc[i].unique())))


train_stats_df['cluster_1_label'] = train_cluster1
train_stats_df['cluster_2_label'] = train_cluster2

train_stats_df['cluster_label_vec_1'] = vec_cluster1
train_stats_df['cluster_label_vec_2'] = vec_cluster2


In [225]:
# Add mean cluster label vector into train stats for the classifier
train_stats_X = train_stats_df[['mean_aoa', 'median_aoa', 'min_aoa', 'max_aoa',
                                'vec_mean_pure', 'vec_mean_slice', 
                                'cluster_label_vec_1']]
train_stats_y = np.array(train_stats_df['label'])

In [227]:
# Results in overinflated accuracy score
lg_stats_clf = LogisticRegression(random_state=RANDOM_SEED)
scores = cross_val_score(lg_stats_clf, train_stats_X, train_stats_y, cv=shuff_spt)
scores.mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9036809815950921