In [8]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk import tokenize
import nltk
from gensim.models import KeyedVectors
from string import punctuation
import unidecode
from sklearn.model_selection import train_test_split
from sklearn import metrics
from imblearn.over_sampling import SMOTE
import numpy as np

# Uploading Dataset

In [2]:
colnames=['output','text']
df = pd.read_csv('all-data.csv', names=colnames, header=None)
all_sentences = [text for text in df.text]
words = ' '.join(all_sentences)

In [3]:
stop_words = nltk.corpus.stopwords.words('english')
token_space = tokenize.WhitespaceTokenizer()
token_punct = tokenize.WordPunctTokenizer()

# Function to transform and clean list of texts

In [5]:
def transformSentence(list_of_sentences):
    
    sentences_after_stopwords = list()
    
    for sentence in list_of_sentences:
        new_sentence = list()
        words_from_sentence = token_space.tokenize(sentence)
        for word in words_from_sentence:
            if word not in stop_words:
                new_sentence.append(word)
        sentences_after_stopwords.append(" ".join(new_sentence))

    sentences_after_stopwords_puncts = list()

    for sentence in sentences_after_stopwords:
        for punct_to_change in punctuation:
            sentence = sentence.replace(punct_to_change,"")
        sentences_after_stopwords_puncts.append(sentence)

    sentences_after_stopwords_puncts_lower = list()

    for sentence in sentences_after_stopwords_puncts:
        sentence = sentence.lower()
        sentences_after_stopwords_puncts_lower.append(sentence)

    stop_words_no_accent = list()

    for word in stop_words:
        for punct_to_change in punctuation:
            word = word.replace(punct_to_change,"")
        stop_words_no_accent.append(word)
    
    sentences_after_stopwords_puncts_lower_stopwords = list()

    for sentence in sentences_after_stopwords_puncts_lower:
        new_sentence = list()
        words_from_sentence = token_space.tokenize(sentence)
        for word in words_from_sentence:
            if word not in stop_words_no_accent:
                new_sentence.append(word)
        sentences_after_stopwords_puncts_lower_stopwords.append(" ".join(new_sentence))

    sentences_after_stopwords_puncts_lower_stopwords_number = list()

    for sentence in sentences_after_stopwords_puncts_lower_stopwords:
        new_sentence = list()
        words_from_sentence = token_space.tokenize(sentence)
        for word in words_from_sentence:
            if not word.isnumeric():
                new_sentence.append(word)
            else:
                new_sentence.append("0")
        sentences_after_stopwords_puncts_lower_stopwords_number.append(" ".join(new_sentence))

    return sentences_after_stopwords_puncts_lower_stopwords_number

In [14]:
treated_sentences = transformSentence(list(df.text))

Checking the imbalance between classes

In [26]:
tfidf = TfidfVectorizer(lowercase=False,max_features=600)
vector_tfidf = tfidf.fit_transform(treated_sentences)
print(vector_tfidf.shape)
print(df.output.shape)
print(df.output.value_counts())

(4846, 600)
(4846,)
neutral     2879
positive    1363
negative     604
Name: output, dtype: int64


# Balancing the classes

In [33]:
smote = SMOTE(random_state=100)
X_resampled, Y_resampled = smote.fit_resample(vector_tfidf,df.output)
print(X_resampled.shape)
print(Y_resampled.shape)
print(Y_resampled.value_counts())

(8637, 600)
(8637,)
neutral     2879
positive    2879
negative    2879
Name: output, dtype: int64


# Spliting in Train and Test

In [47]:
test_size=0.1
X_train,X_test,Y_train,Y_test = train_test_split(X_resampled,Y_resampled,random_state = 100,test_size=test_size)

In [48]:
print("train:",X_train.shape)
print("test:",X_test.shape)

train: (7773, 600)
test: (864, 600)


# Models Testing

Logistic Regression

In [67]:
from sklearn.linear_model import LogisticRegression

In [59]:
logistic_regression = LogisticRegression(solver = 'lbfgs',penalty = 'l2',max_iter = 5000)
logistic_regression.fit(X_train,Y_train)
logistic_regression.score(X_test,Y_test)

0.7557870370370371

In [57]:
logistic_regression = LogisticRegression(solver = 'lbfgs',penalty = 'none',max_iter = 5000)
logistic_regression.fit(X_train,Y_train)
logistic_regression.score(X_test,Y_test)

0.7800925925925926

In [61]:
logistic_regression = LogisticRegression(solver = 'sag',penalty = 'l2',max_iter = 5000)
logistic_regression.fit(X_train,Y_train)
logistic_regression.score(X_test,Y_test)

0.7557870370370371

In [60]:
logistic_regression = LogisticRegression(solver = 'sag',penalty = 'none',max_iter = 5000)
logistic_regression.fit(X_train,Y_train)
logistic_regression.score(X_test,Y_test)

0.7777777777777778

Multinomial Naive Bayes

In [66]:
from sklearn.naive_bayes import MultinomialNB

In [73]:
naive_bayes = MultinomialNB(alpha=1)
naive_bayes.fit(X_train,Y_train)
naive_bayes.score(X_test,Y_test)

0.6759259259259259

Random Forest Classifier

In [74]:
from sklearn.ensemble import RandomForestClassifier

In [94]:
randomForest = RandomForestClassifier(random_state= 100)
randomForest.fit(X_train,Y_train)
randomForest.score(X_test,Y_test)

0.8587962962962963

In [81]:
from sklearn.model_selection import RandomizedSearchCV

In [100]:
param_grid = {
    "n_estimators": np.arange(10, 500, step=10),
    "criterion": ['gini', 'entropy'],
    "max_features": ["auto", "sqrt", "log2"],
    "max_depth": np.arange(2, 10, step=1),
    "min_samples_split": np.arange(2, 10, step=2),
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True,False],
}
randomForest = RandomForestClassifier(random_state=100)
random_cv = RandomizedSearchCV(randomForest, param_grid, n_iter=80, cv=5, n_jobs=-1, random_state = 100)
rcv = random_cv.fit(X_train, Y_train)

In [101]:
rcv.best_params_

{'n_estimators': 260,
 'min_samples_split': 4,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 9,
 'criterion': 'gini',
 'bootstrap': False}

In [102]:
randomForest = RandomForestClassifier(**rcv.best_params_, random_state = 100)
randomForest.fit(X_train,Y_train)
randomForest.score(X_test,Y_test)

0.65625