In [6]:
import multiprocessing as mp
import time

import pandas as pd
import os
import re
from datetime import datetime

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction import text
import string

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import multiprocessing as mp
import time

from concurrent import futures

worker_X = None
worker_Y = None
worker_min_df = None
worker_max_df = None
worker_stop_words = None

def worker_init(X,Y,min_df,max_df,stop_words):
    global worker_X
    global worker_Y 
    global worker_min_df
    global worker_max_df
    global worker_stop_words
    
    worker_X = X
    worker_Y = Y
    worker_min_df = min_df
    worker_max_df = max_df
    worker_stop_words = stop_words

def worker(max_features):
    global worker_X
    global worker_Y 
    global worker_min_df
    global worker_max_df
    global worker_stop_words
    
    start_time = time.time()
    
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features, stop_words=stop_words, min_df=worker_min_df, max_df=worker_max_df, ngram_range=(1,3))

    feature_vector = tfidf_vectorizer.fit_transform(worker_X)
    fill_ratio = feature_vector.nnz/(feature_vector.shape[0]*feature_vector.shape[1])
    X_dense = feature_vector.todense()
       

    x_train, x_test, y_train, y_test = train_test_split(X_dense, worker_Y, test_size = 0.2)
    clf = GaussianNB().fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    acc, num_acc, prec, recall = summarize_classification(y_test, y_pred)
    
    elapsed_time = time.time()-start_time
    
    stats_row = {}
    stats_row['min_df'] = worker_min_df
    stats_row['max_df'] = worker_max_df
    stats_row['max_features'] = max_features
    stats_row['fill_ratio'] = fill_ratio
    stats_row['acc'] = acc
    stats_row['acc_count'] = num_acc
    stats_row['prec'] = prec
    stats_row['recall'] = recall
    stats_row['elapsed_time'] = elapsed_time
    
    return stats_row

def summarize_classification(y_test, y_pred):

    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)
    prec = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    
    return acc, num_acc, prec, recall

def main(df):
    df = df[['category', 'text_scraped', 'text_scraped_words_count', 'text_lemmatized']]
    
    df_filtered = pd.DataFrame(columns=['text_scraped', 'category', 'text_scraped_words_count'])
    
    for item in df.category.value_counts().items():
        if(item[1] > 1000):
            df_filtered = df_filtered.append(df[df.category.str.contains(item[0])][0:1000][['text_lemmatized', 'category']])
    df = df_filtered
    
    wnl = WordNetLemmatizer()
    X = [" ".join([wnl.lemmatize(token) for token in text]) for text in df.text_lemmatized]
    Y = df.category 
    
    stop_words = text.ENGLISH_STOP_WORDS.union(string.punctuation)
    
    min_df_steps = range(1,17,1)
    max_df_steps = range(1000, 4200, 200)
    max_features_steps = range(100, 1060, 60)

    test_stats = pd.DataFrame(columns=['min_df', 'max_df', 'max_features', 'fill_ratio', 'acc', 'acc_count', 'prec', 'recall', 'elapsed_time'])

    num_cores = 2
    for min_df in min_df_steps:
        for max_df in max_df_steps:        
            inputs = [x for x in max_features_steps]
            stats = []
            with futures.ProcessPoolExecutor(max_workers=num_cores, initializer=worker_init, initargs=[X,Y,min_df,max_df,stop_words]) as pool:
                rows = [row for row in pool.map(worker, inputs)]
                stats.extend(rows)

    test_stats = test_stats.append(stats)

def get_newest_file(fn_list):
    dates = []
    for f in fn_list:
        match = re.search("([0-9]{2}-[0-9]{2}-[0-9]{4}-[0-9]{2}-[0-9]{2}-[0-9]{2})", f)
        if(match is not None):
            dates.append({'fileName': f, 'date': match.group()})
    
    dates = list(map(lambda x: {'fileName': x['fileName'], 'date': datetime.strptime(x['date'], "%m-%d-%Y-%H-%M-%S")}, dates))
    dates.sort(key=lambda x: x['date'], reverse=True)
    return dates[0]['fileName']

if __name__ == '__main__':
    fn_list = os.listdir("data/after-nlp-pipeline")
    fn_newest = get_newest_file(fn_list)

    f = open("data/after-nlp-pipeline/{}".format(fn_newest), 'r', encoding="utf-8")
    df = pd.read_csv(f, index_col=0, converters={'text_scraped_words': lambda x: x[1:-1].replace("'", "").split(', '),'text_lemmatized': lambda x: x[1:-1].replace("'", "").split(', ') })
    f.close()
    
    print("Successfully imported file: {}".format(fn_newest))
    
    main(df)

Successfully imported file: after-nlp-pipeline-09-16-2019-20-08-41.csv


TypeError: __init__() got an unexpected keyword argument 'initializer'

In [27]:
f = open("data/10-04-2019-16-12-49-stats.csv", 'r', encoding="utf-8")
df = pd.read_csv(f, index_col=0)

In [31]:
max_fill_ratio = df.fill_ratio.max()
max_recall = df.recall.max()
max_prec = df.prec.max()
max_acc = df.acc.max()

In [32]:
rows = df[df.fill_ratio==max_fill_ratio]
print("Parameters for max fill_ratio")
rows

Parameters for max fill_ratio


Unnamed: 0,min_df,max_df,max_features,fill_ratio,acc,acc_count,prec,recall,elapsed_time
240,1,4000,100,0.228021,0.3775,1057,0.365868,0.3775,21.80687
496,2,4000,100,0.228021,0.386786,1083,0.375574,0.386786,21.563884
752,3,4000,100,0.228021,0.379286,1062,0.369003,0.379286,21.68153
1008,4,4000,100,0.228021,0.388929,1089,0.374859,0.388929,21.788196
1264,5,4000,100,0.228021,0.369286,1034,0.357891,0.369286,21.875518
1520,6,4000,100,0.228021,0.384643,1077,0.3759,0.384643,21.973632
1776,7,4000,100,0.228021,0.385357,1079,0.370529,0.385357,21.73412
2032,8,4000,100,0.228021,0.382857,1072,0.372834,0.382857,22.657448
2288,9,4000,100,0.228021,0.383214,1073,0.370644,0.383214,22.215764
2544,10,4000,100,0.228021,0.389643,1091,0.380847,0.389643,21.679076


In [33]:
rows = df[df.recall==max_recall]
print("Parameters for max recall")
rows

Parameters for max recall


Unnamed: 0,min_df,max_df,max_features,fill_ratio,acc,acc_count,prec,recall,elapsed_time
253,1,4000,880,0.098756,0.488214,1367,0.481306,0.488214,22.343998
767,3,4000,1000,0.091371,0.488214,1367,0.483705,0.488214,22.502659


In [34]:
rows = df[df.prec==max_prec]
print("Parameters for max prec")
rows

Parameters for max prec


Unnamed: 0,min_df,max_df,max_features,fill_ratio,acc,acc_count,prec,recall,elapsed_time
974,4,3400,940,0.087392,0.482857,1352,0.484313,0.482857,22.579051


In [37]:
rows = df[df.acc==max_acc]
print("Parameters for max acc")
rows

Parameters for max acc


Unnamed: 0,min_df,max_df,max_features,fill_ratio,acc,acc_count,prec,recall,elapsed_time
253,1,4000,880,0.098756,0.488214,1367,0.481306,0.488214,22.343998
767,3,4000,1000,0.091371,0.488214,1367,0.483705,0.488214,22.502659


In [38]:
import multiprocessing as mp
import time

import pandas as pd
import os
import re
from datetime import datetime

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction import text
import string

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import multiprocessing as mp
import time

from concurrent import futures

In [39]:
wnl = WordNetLemmatizer()
X = [" ".join([wnl.lemmatize(token) for token in text]) for text in df.text_lemmatized]
Y = df.category 

stop_words = text.ENGLISH_STOP_WORDS.union(string.punctuation)

tfidf_vectorizer = TfidfVectorizer(max_features=max_features, stop_words=stop_words, min_df=min_df, max_df=max_df, ngram_range=(1,3))

feature_vector = tfidf_vectorizer.fit_transform(X)
fill_ratio = feature_vector.nnz/(feature_vector.shape[0]*feature_vector.shape[1])
X_dense = feature_vector.todense()


x_train, x_test, y_train, y_test = train_test_split(X_dense, Y, test_size = 0.2)
clf = GaussianNB().fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc, num_acc, prec, recall = summarize_classification(y_test, y_pred)
acc, num_acc, prec, recall

AttributeError: 'DataFrame' object has no attribute 'text_lemmatized'