In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk import tokenize
import nltk
from gensim.models import KeyedVectors
from string import punctuation
import unidecode
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

Preparing Data

In [4]:
colnames=['output','text']
df = pd.read_csv('all-data.csv', names=colnames, header=None)

In [5]:
all_sentences = [text for text in df.text]
words = ' '.join(all_sentences)

In [6]:
stop_words = nltk.corpus.stopwords.words('english')
token_space = tokenize.WhitespaceTokenizer()
token_punct = tokenize.WordPunctTokenizer()

In [7]:
puncts = list()
for punct in punctuation:
    puncts.append(punct)
punct_and_stopwords = puncts + stop_words

In [8]:
sentences_after_stopwords = list()

for sentence in df.text:
    new_sentence = list()
    words_from_sentence = token_space.tokenize(sentence)
    for word in words_from_sentence:
        if word not in stop_words:
            new_sentence.append(word)
    sentences_after_stopwords.append(" ".join(new_sentence))

In [9]:
sentences_after_stopwords_puncts = list()

for sentence in sentences_after_stopwords:
    for punct_to_change in punctuation:
        sentence = sentence.replace(punct_to_change,"")
    sentences_after_stopwords_puncts.append(sentence)

In [10]:
sentences_after_stopwords_puncts_lower = list()

for sentence in sentences_after_stopwords_puncts:
    sentence = sentence.lower()
    sentences_after_stopwords_puncts_lower.append(sentence)

In [11]:
# sentences_after_stopwords_puncts = list()

# for sentence in sentences_after_stopwords:
#     new_sentence = list()
#     words_from_sentence = token_punct.tokenize(sentence)
#     # print(words_from_sentence)
#     for word in words_from_sentence:
#         if word not in puncts:
#             new_sentence.append(word)
#     # print(new_sentence)
#     sentences_after_stopwords_puncts.append(" ".join(new_sentence))

In [12]:
stop_words_no_accent = list()
for word in stop_words:
    for punct_to_change in punctuation:
        word = word.replace(punct_to_change,"")
    stop_words_no_accent.append(word)

In [13]:
sentences_after_stopwords_puncts_lower_stopwords = list()

for sentence in sentences_after_stopwords_puncts_lower:
    new_sentence = list()
    words_from_sentence = token_space.tokenize(sentence)
    for word in words_from_sentence:
        if word not in stop_words_no_accent:
            new_sentence.append(word)
    sentences_after_stopwords_puncts_lower_stopwords.append(" ".join(new_sentence))

In [14]:
sentences_after_stopwords_puncts_lower_stopwords_number = list()

for sentence in sentences_after_stopwords_puncts_lower_stopwords:
    new_sentence = list()
    words_from_sentence = token_space.tokenize(sentence)
    for word in words_from_sentence:
        if not word.isnumeric():
            new_sentence.append(word)
        else:
            new_sentence.append("0")
    sentences_after_stopwords_puncts_lower_stopwords_number.append(" ".join(new_sentence))

Resampling

In [15]:
clean_df = pd.DataFrame({'sentences':sentences_after_stopwords_puncts_lower_stopwords_number,'output':df.output})

In [16]:
from imblearn.over_sampling import SMOTE
import numpy as np
smote = SMOTE(random_state=100)

In [17]:
tfidf = TfidfVectorizer(lowercase=False,max_features=600)
vector_tfidf = tfidf.fit_transform(sentences_after_stopwords_puncts_lower_stopwords_number)

In [18]:
X_to_be_transformed = vector_tfidf
Y_to_be_transformed = clean_df.output
X_resampled, Y_resampled = smote.fit_resample(X_to_be_transformed,Y_to_be_transformed)

Y_resampled.value_counts()

negative    2879
positive    2879
neutral     2879
Name: output, dtype: int64

In [19]:
X_train,X_test,Y_train,Y_test = train_test_split(X_resampled,Y_resampled,random_state = 100,test_size=0.1)

In [45]:
logistic_regression = LogisticRegression(solver = 'lbfgs')

In [46]:
logistic_regression.fit(X_train,Y_train)
logistic_regression.score(X_test,Y_test)

0.7360824742268042

LogisticRegression Model

In [21]:
logistic_regression = LogisticRegression(solver = 'lbfgs')

In [22]:
X_train,X_test,Y_train,Y_test = train_test_split(vector_tfidf,df.output,random_state = 100,test_size=0.1)

In [23]:
# Score using all data without any transformation
logistic_regression.fit(X_train,Y_train)
logistic_regression.score(X_test,Y_test)

0.7360824742268042

In [24]:
Y_predicted = list(logistic_regression.predict(X_test))
Y_test_list = list(Y_test)

cont = 0
for i in range(len(Y_test)):
    if Y_test_list[i] == Y_predicted[i]:
        cont += 1
    
print("Score:",cont/len(Y_test))

Score: 0.7360824742268042


In [25]:
# all_words = ' '.join(sentences_after_stopwords_puncts_lower_stopwords_number)
# cloud_of_words = WordCloud().generate(all_words)

In [26]:
# plt.figure(figsize=(10,8))
# plt.imshow(cloud_of_words, interpolation='bilinear')
# plt.show()

In [27]:
print(logistic_regression.predict(X_test[0]))
print(logistic_regression.predict_proba(X_test[0]))
print(Y_test.iloc[0])

['neutral']
[[0.08702322 0.61726828 0.2957085 ]]
neutral


In [28]:
logistic_regression.predict_proba(X_test)

array([[0.08702322, 0.61726828, 0.2957085 ],
       [0.17374482, 0.50210493, 0.32415025],
       [0.03438222, 0.85375034, 0.11186744],
       ...,
       [0.05891035, 0.48268009, 0.45840955],
       [0.39217478, 0.30133751, 0.30648771],
       [0.28863132, 0.3628201 , 0.34854858]])

In [29]:
logistic_regression.classes_

array(['negative', 'neutral', 'positive'], dtype=object)

LogisticRegression Testing

In [30]:
prob_class = list(logistic_regression.predict_proba(X_test))

cont_80 = 0
cont = 0
new_column = []

for line in prob_class:
    if line.max() >= 0.800:
        position = list(line).index(line.max())
        classification = logistic_regression.classes_[position]
        cont_80 += 1
        new_column.append(classification)
    else:
        new_column.append('Under 0.8')
        

In [31]:
Y_predicted = list(logistic_regression.predict(X_test))
Y_test_list = list(Y_test)

cont = 0
for i in range(len(Y_test)):
    if Y_test_list[i] == new_column[i]:
        cont += 1
    
print("Qnt:",cont_80)
print("Score:",cont/cont_80)

Qnt: 161
Score: 0.9006211180124224


LogisticRegression Analysing External Phrases

In [32]:
sentence_to_transform = ["The market is bad"]


In [33]:
sentences_after_stopwords = list()

for sentence in sentence_to_transform:
    new_sentence = list()
    words_from_sentence = token_space.tokenize(sentence)
    for word in words_from_sentence:
        if word not in stop_words:
            new_sentence.append(word)
    sentences_after_stopwords.append(" ".join(new_sentence))

In [34]:
sentences_after_stopwords_puncts = list()

for sentence in sentences_after_stopwords:
    for punct_to_change in punctuation:
        sentence = sentence.replace(punct_to_change,"")
    sentences_after_stopwords_puncts.append(sentence)

In [35]:
sentences_after_stopwords_puncts_lower = list()

for sentence in sentences_after_stopwords_puncts:
    sentence = sentence.lower()
    sentences_after_stopwords_puncts_lower.append(sentence)

In [36]:
sentences_after_stopwords_puncts_lower_stopwords = list()

for sentence in sentences_after_stopwords_puncts_lower:
    new_sentence = list()
    words_from_sentence = token_space.tokenize(sentence)
    for word in words_from_sentence:
        if word not in stop_words_no_accent:
            new_sentence.append(word)
    sentences_after_stopwords_puncts_lower_stopwords.append(" ".join(new_sentence))

In [37]:
sentences_after_stopwords_puncts_lower_stopwords_number = list()

for sentence in sentences_after_stopwords_puncts_lower_stopwords:
    new_sentence = list()
    words_from_sentence = token_space.tokenize(sentence)
    for word in words_from_sentence:
        if not word.isnumeric():
            new_sentence.append(word)
        else:
            new_sentence.append("0")
    sentences_after_stopwords_puncts_lower_stopwords_number.append(" ".join(new_sentence))

In [38]:
print(sentences_after_stopwords_puncts_lower_stopwords_number)
vector_tfidf_sentence = tfidf.transform(sentences_after_stopwords_puncts_lower_stopwords_number)

['market bad']


In [39]:
print(logistic_regression.predict(vector_tfidf_sentence))
print(logistic_regression.predict_proba(vector_tfidf_sentence))

['neutral']
[[0.09063359 0.72584311 0.1835233 ]]


Creating a Function to transform the sentence

In [40]:
def transformSentence(list_of_sentences):
    
    sentences_after_stopwords = list()
    
    for sentence in list_of_sentences:
        new_sentence = list()
        words_from_sentence = token_space.tokenize(sentence)
        for word in words_from_sentence:
            if word not in stop_words:
                new_sentence.append(word)
        sentences_after_stopwords.append(" ".join(new_sentence))

    sentences_after_stopwords_puncts = list()

    for sentence in sentences_after_stopwords:
        for punct_to_change in punctuation:
            sentence = sentence.replace(punct_to_change,"")
        sentences_after_stopwords_puncts.append(sentence)

    sentences_after_stopwords_puncts_lower = list()

    for sentence in sentences_after_stopwords_puncts:
        sentence = sentence.lower()
        sentences_after_stopwords_puncts_lower.append(sentence)

    stop_words_no_accent = list()

    for word in stop_words:
        for punct_to_change in punctuation:
            word = word.replace(punct_to_change,"")
        stop_words_no_accent.append(word)
    
    sentences_after_stopwords_puncts_lower_stopwords = list()

    for sentence in sentences_after_stopwords_puncts_lower:
        new_sentence = list()
        words_from_sentence = token_space.tokenize(sentence)
        for word in words_from_sentence:
            if word not in stop_words_no_accent:
                new_sentence.append(word)
        sentences_after_stopwords_puncts_lower_stopwords.append(" ".join(new_sentence))

    sentences_after_stopwords_puncts_lower_stopwords_number = list()

    for sentence in sentences_after_stopwords_puncts_lower_stopwords:
        new_sentence = list()
        words_from_sentence = token_space.tokenize(sentence)
        for word in words_from_sentence:
            if not word.isnumeric():
                new_sentence.append(word)
            else:
                new_sentence.append("0")
        sentences_after_stopwords_puncts_lower_stopwords_number.append(" ".join(new_sentence))

    return sentences_after_stopwords_puncts_lower_stopwords_number

In [44]:
transformSentence(["i love this movie!!"])

['love movie']