In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
#from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk import tokenize
import nltk
from gensim.models import KeyedVectors
from string import punctuation
import unidecode
from sklearn.model_selection import train_test_split
from sklearn import metrics
from imblearn.over_sampling import SMOTE
import numpy as np

# Uploading Dataset

In [3]:
colnames=['output','text']
df = pd.read_csv('all-data.csv', names=colnames, encoding = "ISO-8859-1", header=None)
all_sentences = [text for text in df.text]
words = ' '.join(all_sentences)

In [4]:
stop_words = nltk.corpus.stopwords.words('english')
token_space = tokenize.WhitespaceTokenizer()
token_punct = tokenize.WordPunctTokenizer()

# Function to transform and clean list of texts

In [6]:
def transformSentence(list_of_sentences):
    
    sentences_after_stopwords = list()
    
    for sentence in list_of_sentences:
        new_sentence = list()
        words_from_sentence = token_space.tokenize(sentence)
        for word in words_from_sentence:
            if word not in stop_words:
                new_sentence.append(word)
        sentences_after_stopwords.append(" ".join(new_sentence))

    sentences_after_stopwords_puncts = list()

    for sentence in sentences_after_stopwords:
        for punct_to_change in punctuation:
            sentence = sentence.replace(punct_to_change,"")
        sentences_after_stopwords_puncts.append(sentence)

    sentences_after_stopwords_puncts_lower = list()

    for sentence in sentences_after_stopwords_puncts:
        sentence = sentence.lower()
        sentences_after_stopwords_puncts_lower.append(sentence)

    stop_words_no_accent = list()

    for word in stop_words:
        for punct_to_change in punctuation:
            word = word.replace(punct_to_change,"")
        stop_words_no_accent.append(word)
    
    sentences_after_stopwords_puncts_lower_stopwords = list()

    for sentence in sentences_after_stopwords_puncts_lower:
        new_sentence = list()
        words_from_sentence = token_space.tokenize(sentence)
        for word in words_from_sentence:
            if word not in stop_words_no_accent:
                new_sentence.append(word)
        sentences_after_stopwords_puncts_lower_stopwords.append(" ".join(new_sentence))

    sentences_after_stopwords_puncts_lower_stopwords_number = list()

    for sentence in sentences_after_stopwords_puncts_lower_stopwords:
        new_sentence = list()
        words_from_sentence = token_space.tokenize(sentence)
        for word in words_from_sentence:
            if not word.isnumeric():
                new_sentence.append(word)
            else:
                new_sentence.append("0")
        sentences_after_stopwords_puncts_lower_stopwords_number.append(" ".join(new_sentence))

    return sentences_after_stopwords_puncts_lower_stopwords_number

In [7]:
treated_sentences = transformSentence(list(df.text))

Checking the imbalance between classes

In [8]:
tfidf = TfidfVectorizer(lowercase=False,max_features=600)
vector_tfidf = tfidf.fit_transform(treated_sentences)
print(vector_tfidf.shape)
print(df.output.shape)
print(df.output.value_counts())

(4846, 600)
(4846,)
neutral     2879
positive    1363
negative     604
Name: output, dtype: int64


# Balancing the classes

In [9]:
smote = SMOTE(random_state=100)
X_resampled, Y_resampled = smote.fit_resample(vector_tfidf,df.output)
print(X_resampled.shape)
print(Y_resampled.shape)
print(Y_resampled.value_counts())

(8637, 600)
(8637,)
neutral     2879
negative    2879
positive    2879
Name: output, dtype: int64


# Spliting in Train and Test

In [10]:
test_size=0.1
X_train,X_test,Y_train,Y_test = train_test_split(X_resampled,Y_resampled,random_state = 100,test_size=test_size)

In [11]:
print("train:",X_train.shape)
print("test:",X_test.shape)

train: (7773, 600)
test: (864, 600)


# Models Testing

Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
logistic_regression = LogisticRegression(solver = 'lbfgs',penalty = 'l2',max_iter = 5000)
logistic_regression.fit(X_train,Y_train)
logistic_regression.score(X_test,Y_test)

0.7615740740740741

In [14]:
logistic_regression = LogisticRegression(solver = 'lbfgs',penalty = 'none',max_iter = 5000)
logistic_regression.fit(X_train,Y_train)
logistic_regression.score(X_test,Y_test)

0.7800925925925926

In [15]:
logistic_regression = LogisticRegression(solver = 'sag',penalty = 'l2',max_iter = 5000)
logistic_regression.fit(X_train,Y_train)
logistic_regression.score(X_test,Y_test)

0.7615740740740741

In [16]:
logistic_regression = LogisticRegression(solver = 'sag',penalty = 'none',max_iter = 5000)
logistic_regression.fit(X_train,Y_train)
logistic_regression.score(X_test,Y_test)

0.7766203703703703

Multinomial Naive Bayes

In [17]:
from sklearn.naive_bayes import MultinomialNB

In [18]:
naive_bayes = MultinomialNB(alpha=1)
naive_bayes.fit(X_train,Y_train)
naive_bayes.score(X_test,Y_test)

0.6712962962962963

Random Forest Classifier

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
randomForest = RandomForestClassifier(random_state= 100)
randomForest.fit(X_train,Y_train)
randomForest.score(X_test,Y_test)

0.8576388888888888

In [21]:
from sklearn.model_selection import RandomizedSearchCV

In [22]:
# param_grid = {
#     "criterion": ['gini','entropy'],
#     "min_samples_leaf": [1, 2, 4, 8],
#     "max_features": ["auto", "sqrt", "log2"]
# }

# randomForest = RandomForestClassifier(n_estimators = 300,max_depth=None, random_state=100)
# random_cv = RandomizedSearchCV(randomForest, param_grid, n_iter=20, cv=5, n_jobs=-1, random_state = 100)
# rcv = random_cv.fit(X_train, Y_train)
# rcv.best_params_

# randomForest = RandomForestClassifier(**rcv.best_params_, random_state = 100)
# randomForest.fit(X_train,Y_train)
# randomForest.score(X_test,Y_test)

In [23]:
randomForest = RandomForestClassifier(n_estimators = 300,max_depth=None, min_samples_leaf = 1, max_features = 'log2', criterion = 'entropy', random_state = 100)
randomForest.fit(X_train,Y_train)
randomForest.score(X_test,Y_test)

0.8761574074074074

Decision Tree Classifier

In [24]:
from sklearn.tree import DecisionTreeClassifier

In [25]:
decisionTree = DecisionTreeClassifier(random_state = 100)
decisionTree.fit(X_train,Y_train)
decisionTree.score(X_test,Y_test)

0.7731481481481481

In [26]:
param_grid = {
    "criterion": ['gini','entropy'],
    "splitter": ["best","random"],
    "min_samples_leaf": [1, 2, 4, 8],
    "max_features": ["auto", "sqrt", "log2"]
}

decisionTree = DecisionTreeClassifier(random_state = 100)
random_cv = RandomizedSearchCV(decisionTree, param_grid, n_iter=20, cv=5, n_jobs=-1, random_state = 100)
rcv = random_cv.fit(X_train, Y_train)
rcv.best_params_

{'splitter': 'random',
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'criterion': 'entropy'}

In [27]:
decisionTree = DecisionTreeClassifier(**rcv.best_params_,random_state = 100)
decisionTree.fit(X_train,Y_train)
decisionTree.score(X_test,Y_test)

0.7662037037037037

Support Vector Machine

In [28]:
from sklearn import svm

In [29]:
support_vector = svm.SVC(random_state = 100)
support_vector.fit(X_train,Y_train)
support_vector.score(X_test,Y_test)

0.8657407407407407

In [30]:
param_grid = {
    "kernel": ['linear','poly','rbf','sigmoid'],
    "gamma": ['scale','auto'],
    "degree": [3,5,7,9]
}

support_vector = svm.SVC(random_state = 100)
random_cv = RandomizedSearchCV(support_vector, param_grid, n_iter=20, cv=5, n_jobs=-1, random_state = 100)
rcv = random_cv.fit(X_train, Y_train)
rcv.best_params_

{'kernel': 'rbf', 'gamma': 'scale', 'degree': 9}

In [31]:

support_vector = svm.SVC(**rcv.best_params_,random_state = 100)
support_vector.fit(X_train,Y_train)
support_vector.score(X_test,Y_test)

0.8657407407407407