In [55]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize
import nltk
from string import punctuation
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

In [5]:
stop_words = nltk.corpus.stopwords.words('english')
token_space = tokenize.WhitespaceTokenizer()
token_punct = tokenize.WordPunctTokenizer()

def transformSentence(list_of_sentences):
    
    sentences_after_stopwords = list()
    
    for sentence in list_of_sentences:
        new_sentence = list()
        words_from_sentence = token_space.tokenize(sentence)
        for word in words_from_sentence:
            if word not in stop_words:
                new_sentence.append(word)
        sentences_after_stopwords.append(" ".join(new_sentence))

    sentences_after_stopwords_puncts = list()

    for sentence in sentences_after_stopwords:
        for punct_to_change in punctuation:
            sentence = sentence.replace(punct_to_change,"")
        sentences_after_stopwords_puncts.append(sentence)

    sentences_after_stopwords_puncts_lower = list()

    for sentence in sentences_after_stopwords_puncts:
        sentence = sentence.lower()
        sentences_after_stopwords_puncts_lower.append(sentence)

    stop_words_no_accent = list()

    for word in stop_words:
        for punct_to_change in punctuation:
            word = word.replace(punct_to_change,"")
        stop_words_no_accent.append(word)
    
    sentences_after_stopwords_puncts_lower_stopwords = list()

    for sentence in sentences_after_stopwords_puncts_lower:
        new_sentence = list()
        words_from_sentence = token_space.tokenize(sentence)
        for word in words_from_sentence:
            if word not in stop_words_no_accent:
                new_sentence.append(word)
        sentences_after_stopwords_puncts_lower_stopwords.append(" ".join(new_sentence))

    sentences_after_stopwords_puncts_lower_stopwords_number = list()

    for sentence in sentences_after_stopwords_puncts_lower_stopwords:
        new_sentence = list()
        words_from_sentence = token_space.tokenize(sentence)
        for word in words_from_sentence:
            if not word.isnumeric():
                new_sentence.append(word)
            else:
                new_sentence.append("0")
        sentences_after_stopwords_puncts_lower_stopwords_number.append(" ".join(new_sentence))

    return sentences_after_stopwords_puncts_lower_stopwords_number

In [6]:
colnames=['output','text']
df = pd.read_csv('all-data.csv', names=colnames, header=None)
all_sentences = [text for text in df.text]
words = ' '.join(all_sentences)

treated_sentences = transformSentence(list(df.text))

tfidf = TfidfVectorizer(lowercase=False,max_features=600)
vector_tfidf = tfidf.fit_transform(treated_sentences)

smote = SMOTE(random_state=100)
X_resampled, Y_resampled = smote.fit_resample(vector_tfidf,df.output)

test_size=0.1
X_train,X_test,Y_train,Y_test = train_test_split(X_resampled,Y_resampled,random_state = 100,test_size=test_size)

In [7]:
randomForest = RandomForestClassifier(n_estimators = 300,max_depth=None, min_samples_leaf = 1, max_features = 'log2', criterion = 'entropy', random_state = 100)
randomForest.fit(X_train,Y_train)
randomForest.score(X_test,Y_test)

0.875

In [57]:
proba_results = pd.DataFrame(randomForest.predict_proba(X_test),columns=randomForest.classes_)
proba_results['highest'] = proba_results[["negative", "neutral", "positive"]].max(axis=1)
proba_results['prediction'] = randomForest.predict(X_test)
proba_results['answer'] = list(Y_test)


In [58]:
proba_results['accuracy'] = np.where(proba_results['prediction']==proba_results['answer'], 1, 0)

In [76]:
baseline_highest = 0.90



query = ('highest >= {}').format(baseline_highest)
proba_results_filtered = proba_results.query(query)
print("classifing",len(proba_results_filtered.accuracy),"from",len(proba_results.accuracy))
print("precision:",proba_results_filtered.accuracy.sum()/len(proba_results_filtered.accuracy))

classifing 187 from 864
precision: 0.9893048128342246
