In [1]:
import pandas as pd
import numpy as np
import pickle
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import stop_words

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw item descriptions and their labels.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# 'words_labels.pkl' if it comes from a trusted source.
with open('words_labels.pkl', 'rb') as fh:  # context manager closes the handle
    words, labels = pickle.load(fh)

In [3]:
def get_lemma(word,lemmatizer=WordNetLemmatizer()):
    """Lower-case and lemmatize every space-separated token of ``word``.

    Parameters
    ----------
    word : str
        Phrase to normalize; it is split on single space characters.
    lemmatizer : object with a ``lemmatize(str)`` method
        Defaults to a ``WordNetLemmatizer`` that is created once, when the
        function is defined, and reused on every call.

    Returns
    -------
    str
        The lemmatized tokens joined back together with single spaces.
    """
    return ' '.join(
        lemmatizer.lemmatize(token.lower()) for token in word.split(' ')
    )

# Normalize the whole corpus in one pass; `words` is rebound to the
# lemmatized array that all later cells work on.
words = np.array(list(map(get_lemma, words)))

In [4]:
# Hold-out evaluation: fit the vectorizer and a forest on 90% of the data and
# score them on the remaining 10%.
#
# stop_words='english' selects scikit-learn's built-in English stop-word list
# (the same set as ENGLISH_STOP_WORDS). Passing the string avoids depending on
# the deprecated sklearn.feature_extraction.stop_words module, and newer
# scikit-learn releases validate this parameter and accept only 'english',
# a list, or None.
vectorizer = CountVectorizer(stop_words='english', max_df=0.2)

# random_state makes the split repeatable across runs; stratify keeps the
# class proportions equal in the train and test partitions, which matters for
# a test set this small.
features_train, features_test, labels_train, labels_test = train_test_split(
    words, labels, test_size=0.1, random_state=42, stratify=labels)

# The vocabulary is learned from the training split only, so the test split
# stays unseen until scoring.
features_train = vectorizer.fit_transform(features_train).toarray()
features_test = vectorizer.transform(features_test).toarray()

# class_weight weights class 1 five times as heavily as class 0.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1,
                             class_weight={0: 1, 1: 5}, random_state=42)
clf.fit(features_train, labels_train)

# Accuracy on the full test split, then on class 0 only, then on class 1 only.
# The boolean masks assume `labels` is a NumPy array.
print(clf.score(features_test, labels_test))
print(clf.score(features_test[labels_test == 0], labels_test[labels_test == 0]))
print(clf.score(features_test[labels_test == 1], labels_test[labels_test == 1]))

# Remove the evaluation-only objects so later cells cannot use them by accident.
del clf, vectorizer

0.7894736842105263
0.8
0.7222222222222222


In [5]:
# Vectorizer for the final model, fitted on the complete corpus.
# stop_words='english' selects scikit-learn's built-in English stop-word list
# (the same set as ENGLISH_STOP_WORDS) without relying on the deprecated
# sklearn.feature_extraction.stop_words module; newer scikit-learn releases
# accept only 'english', a list, or None for this parameter.
vectorizer = CountVectorizer(stop_words='english', max_df=0.2)
vectorizer.fit(words)

def get_vect(x,vectorizer=vectorizer):
    """Turn an iterable of strings into a dense count matrix.

    Parameters
    ----------
    x : iterable of str
        Documents to vectorize.
    vectorizer : fitted vectorizer with a ``transform`` method
        Defaults to the module-level ``vectorizer`` as it exists when this
        function is defined (the default is bound at definition time).

    Returns
    -------
    The result of ``vectorizer.transform(x)`` converted with ``.toarray()``.
    """
    sparse_counts = vectorizer.transform(x)
    return sparse_counts.toarray()

In [6]:
# Final model: trained on every available sample.
_words = get_vect(words)
# random_state makes the fitted forest (and the pickle saved from it)
# reproducible across runs; class_weight weights class 1 five times as
# heavily as class 0.
clf = RandomForestClassifier(n_estimators=500, n_jobs=-1,
                             class_weight={0: 1, 1: 5}, random_state=42)
clf.fit(_words, labels)

# NOTE: these scores are computed on the same data the model was fitted on,
# so they measure fit to the training set, not generalization.
# Order: all samples, class 0 only, class 1 only.
print(clf.score(_words, labels))
print(clf.score(_words[labels == 0], labels[labels == 0]))
print(clf.score(_words[labels == 1], labels[labels == 1]))

0.9969788519637462
0.9966187658495351
1.0


In [7]:
def is_compost(x):
    """Classify a single item description with the trained model.

    Parameters
    ----------
    x : str
        Item description, e.g. ``'Food Scrap'``.

    Returns
    -------
    bool
        ``True`` when the classifier predicts a non-zero label for ``x``,
        ``False`` when it predicts 0.
    """
    # Apply the same preprocessing as the training data: lemmatize, then vectorize.
    features = get_vect(np.array([get_lemma(x)]))
    # predict() returns one label per input row. Index the single result
    # explicitly: calling int() directly on a 1-element array is deprecated
    # since NumPy 1.25.
    prediction = clf.predict(features)[0]
    return bool(int(prediction))

In [8]:
# Spot-check the classifier on a two-word phrase.
is_compost('Food Scrap')

True

In [9]:
# Spot-check: shares the word 'Scrap' with the previous phrase but a different first word.
is_compost('Metal Scrap')

False

In [10]:
# Spot-check on a single word.
is_compost('Food')

True

In [11]:
# Spot-check on a single, ambiguous word; worth eyeballing whether the prediction is sensible.
is_compost('Green')

True

In [12]:
# Persist the fitted classifier together with the vectorizer it was trained
# with, as a (clf, vectorizer) tuple. The context manager guarantees the file
# is flushed and closed once the dump completes.
with open('CompostClassifier.pkl', 'wb') as fh:
    pickle.dump((clf, vectorizer), fh)