In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import preprocessor as p
from gensim.parsing.preprocessing import remove_stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import string
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re

%matplotlib inline

[nltk_data] Downloading package punkt to /Users/shehzad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/shehzad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
with open('data/train.csv') as f:
    full = pd.read_csv(f)

with open('data/test.csv') as f:
    test = pd.read_csv(f)

full = full.take(np.random.permutation(len(full)))

SCALE_DATASET_SIZE = .7
TRAIN_SUBSET = 1.0

train_amount = int(len(full) * TRAIN_SUBSET * SCALE_DATASET_SIZE)

train = full.take(range(0, train_amount))
test = test

print(len(full), len(train), len(test))
train.head()

1048575 734002 359


Unnamed: 0,Index,Sentiment,Text
85735,85735,0,@Tiffanta I already did that this morning Mc...
754136,754136,0,@TravelTweetie I hope you don't mind me asking...
977134,977134,1,@skovi nice. will let you about our next spir...
310783,310783,0,@graceobrien aww. if it makes you feel any be...
291170,291170,0,@stephlove23 crazzy girl I wish I had that app...


In [16]:
def clean_data(df):
    def self_clean(text):
        text = text.strip().lower()
        text = ' '.join([w for w in text.split() if len(w) > 1 and not w.startswith('http') and not w.startswith('@')])
        
        text = text.translate(str.maketrans('', '', string.punctuation))

        text = remove_stopwords(text)

        return text


    df['clean_text'] = df['Text'].apply(p.clean).apply(self_clean)

    # Tokenize, then lemmatize, then untokenize?
    df['clean_text'] = df['clean_text'].apply(word_tokenize).apply(lambda x: [WordNetLemmatizer().lemmatize(y) for y in x]).apply(lambda x: ' '.join(x))

    
    df.head(5)
    return df

train = clean_data(train)
test = clean_data(test)
train.head(5)


Unnamed: 0,Index,Sentiment,Text,clean_text
85735,85735,0,@Tiffanta I already did that this morning Mc...,morning mcfly chance win
754136,754136,0,@TravelTweetie I hope you don't mind me asking...,hope dont mind asking bring quotwould beenquot...
977134,977134,1,@skovi nice. will let you about our next spir...,nice let spiritualwarriorscom screening
310783,310783,0,@graceobrien aww. if it makes you feel any be...,aww make feel better movie legally yay
291170,291170,0,@stephlove23 crazzy girl I wish I had that app...,crazzy girl wish app


In [22]:
bow_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()

# fit the vectorizer to the training data and transform the text data into each representation
X_train_bow = bow_vectorizer.fit_transform(train['clean_text'])
X_train_tfidf = tfidf_vectorizer.fit_transform(train['clean_text'])

# transform the testing data into a bag-of-words representation using the same vectorizer
X_test_bow = bow_vectorizer.transform(test['clean_text'])
X_test_tfidf = tfidf_vectorizer.transform(test['clean_text'])

# print the vocabulary (i.e., the set of unique words) in the BoW representation
print("Vocabulary size:", len(bow_vectorizer.vocabulary_))
print("Vocabulary:", list(bow_vectorizer.vocabulary_.keys())[:10])

# print the BoW representation of the first training example
print("BoW representation of the first training example:")
print(X_train_bow[0])


Vocabulary size: 230914
Vocabulary: ['morning', 'mcfly', 'chance', 'win', 'hope', 'dont', 'mind', 'asking', 'bring', 'quotwould']
BoW representation of the first training example:
  (0, 125780)	1
  (0, 118976)	1
  (0, 34305)	1
  (0, 221517)	1


In [24]:
# create a logistic regression object
lr = LogisticRegression()

# fit the model to the training data
lr.fit(X_train_bow, train['Sentiment'])

# predict the sentiment labels for the testing data
y_pred = lr.predict(X_test_bow)

# compute the accuracy of the predictions
accuracy = accuracy_score(test['Sentiment'], y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 76.60%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
from sklearn.linear_model import LogisticRegression

# create a logistic regression object
lr = LogisticRegression()

# fit the model to the training data
lr.fit(X_train_tfidf, train['Sentiment'])

# predict the sentiment labels for the testing data
y_pred = lr.predict(X_test_tfidf)

# compute the accuracy of the predictions
accuracy = accuracy_score(test['Sentiment'], y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 74.65%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
from sklearn.naive_bayes import MultinomialNB

# create a naive bayes object
nb = MultinomialNB()

# fit the model to the training data
nb.fit(X_train_bow, train['Sentiment'])

# predict the sentiment labels for the testing data
y_pred = nb.predict(X_test_bow)

# compute the accuracy of the predictions
accuracy = accuracy_score(test['Sentiment'], y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))


Accuracy: 71.87%


In [27]:
from sklearn.naive_bayes import MultinomialNB

# create a naive bayes object
nb = MultinomialNB()

# fit the model to the training data
nb.fit(X_train_tfidf, train['Sentiment'])

# predict the sentiment labels for the testing data
y_pred = nb.predict(X_test_tfidf)

# compute the accuracy of the predictions
accuracy = accuracy_score(test['Sentiment'], y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))


Accuracy: 51.81%
