In [None]:
# Mount google drive. Make sure you have datasets at root directory in gdrive
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import numpy as np
import json
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
import nltk
from nltk.corpus import stopwords
import string
from tqdm import tqdm
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ryanhubley/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def preprocess_func(sentences):
  """ 
  args: sentences => List of sentences that need to be preprocessed
  returns: cleaned_sentences => List of cleaned sentences that are lower-case, punctuation removed, and stopwords removed"""
  meaninglessWords=list(stopwords.words('english'))
  cleaned_sentences = []
  sents = sentences
  for i in range(len(sents)):
    sent = sents[i]
    sent = sent.lower() #convert to lowercase
    for char in sent:
      if char in string.punctuation:
        sent=sent.replace(char,"") #remove punctutation
    words = sent.split()
    sent = []
    for word in words:
      if word not in meaninglessWords:
        sent.append(word)
    sent = " ".join(sent) #remove stopwords
    cleaned_sentences.append(sent)
  return cleaned_sentences

In [4]:
# Loading in review datasets. The txt files hav the sentence and the label, separated by a tab (\t)

# For google colab
#amazon_df = pd.read_csv('/content/gdrive/My Drive/amazon_cells_labelled.txt', sep='\t', header=None)
#yelp_df = pd.read_csv('/content/gdrive/My Drive/yelp_labelled.txt', sep='\t', header=None)
#imdb_df = pd.read_csv('/content/gdrive/My Drive/imdb_labelled.txt', sep='\t', header=None)

# For local
amazon_df = pd.read_csv('./amazon_cells_labelled.txt', sep='\t', header=None)
yelp_df = pd.read_csv('./yelp_labelled.txt', sep='\t', header=None)
imdb_df = pd.read_csv('./imdb_labelled.txt', sep='\t', header=None)


# Cleaning and concatenating reviews
cleaned_sentencesAmazon = preprocess_func(list(amazon_df[0]))
cleaned_sentencesYelp = preprocess_func(list(yelp_df[0]))
cleaned_sentencesIMDB = preprocess_func(list(imdb_df[0]))
amazon_df['Cleaned Sentences'] = cleaned_sentencesAmazon
yelp_df['Cleaned Sentences'] = cleaned_sentencesYelp
imdb_df['Cleaned Sentences'] = cleaned_sentencesIMDB
amazon_df.columns = ['Sentence', 'Label', 'Cleaned Sentences']
amazon_df = amazon_df[['Sentence', 'Cleaned Sentences', 'Label']]
yelp_df.columns = ['Sentence', 'Label', 'Cleaned Sentences']
yelp_df = yelp_df[['Sentence', 'Cleaned Sentences', 'Label']]
imdb_df.columns = ['Sentence', 'Label', 'Cleaned Sentences']
imdb_df = imdb_df[['Sentence', 'Cleaned Sentences', 'Label']]
amazon_df = amazon_df.drop(labels='Sentence', axis=1)
amazon_df.columns=['Sentence', 'Label']
yelp_df = yelp_df.drop(labels='Sentence', axis=1)
yelp_df.columns=['Sentence', 'Label']
imdb_df = imdb_df.drop(labels='Sentence', axis=1)
imdb_df.columns = ['Sentence', 'Label']

df_reviews = pd.concat([amazon_df, yelp_df, imdb_df])
df_reviews.columns = ['Sentence', 'Label']

# Loading in sentiment140 dataset. In this dataset there are 3 labels (0 = negative, 2 = neutral, 4 = positive)
#df_tweets = pd.read_csv('/content/gdrive/My Drive/sentiment140.csv', encoding='latin-1', header=None)
df_tweets = pd.read_csv('./sentiment140.csv', encoding='latin-1', header=None)
df_tweets = df_tweets.drop(columns=[1,2,3,4])
df_tweets.columns = ['Label', 'Sentence']
df_tweets.drop(df_tweets[df_tweets['Label']==2].index, inplace=True)
df_tweets['Label'] = df_tweets['Label'].replace(4,1)

# Tweet cleaning
meaninglessWords=list(stopwords.words('english'))

# Remove @'s
df_tweets["Sentence"] = df_tweets["Sentence"].str.replace("(\@\w+.*?)","")
# Remove links
df_tweets["Sentence"] = df_tweets["Sentence"].str.replace(r"http\S+","")
# To lowercase
df_tweets["Sentence"] = df_tweets["Sentence"].str.lower()
# Remove stopwords
df_tweets["Sentence"] = df_tweets["Sentence"].apply(lambda x: " ".join([item for item in x.split() if item not in meaninglessWords]))
# Remove punctuation
df_tweets["Sentence"] = df_tweets["Sentence"].apply(lambda x: "".join([item for item in x if item not in string.punctuation]))

In [5]:
# Data examples in reviews dataset
df_reviews.head(3)

Unnamed: 0,Sentence,Label
0,way plug us unless go converter,0
1,good case excellent value,1
2,great jawbone,1


In [6]:
# Data examples in tweet dataset
df_tweets.head(3)

Unnamed: 0,Label,Sentence
0,0,awww thats bummer shoulda got david carr thir...
1,0,upset cant update facebook texting it might cr...
2,0,dived many times ball managed save 50 rest go ...


In [7]:
# Unigram NB trained with reviews tested on tweets
unigram_NB = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB()),
 ])

unigram_NB.fit(df_reviews['Sentence'], df_reviews['Label'])
predicted = unigram_NB.predict(df_tweets['Sentence'])
np.mean(predicted == df_tweets['Label'])

0.60825625

In [8]:
# Unigram NB trained with tweets tested on reviews
unigram_NB = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB()),
 ])

# Reduced train size to comparable amount of reviews that we have
x_train, x_test, y_train, y_test = train_test_split(df_tweets['Sentence'], df_tweets['Label'], test_size=0.9981)

unigram_NB.fit(x_train, y_train)
predicted = unigram_NB.predict(df_reviews['Sentence'])
np.mean(predicted == df_reviews['Label'])

0.651018922852984

In [9]:
# Unigram and Bigram NB trained with reviews tested on tweets
uniAndBigram_NB = Pipeline([
     ('vect', CountVectorizer(ngram_range=(1,2))),
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB()),
 ])

uniAndBigram_NB.fit(df_reviews['Sentence'], df_reviews['Label'])
predicted = uniAndBigram_NB.predict(df_tweets['Sentence'])
np.mean(predicted == df_tweets['Label'])

0.610893125

In [11]:
# Unigram and Bigram NB trained with tweets and tested on reviews
uniAndBigram_NB = Pipeline([
     ('vect', CountVectorizer(ngram_range=(1,2))),
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB()),
 ])

x_train, x_test, y_train, y_test = train_test_split(df_tweets['Sentence'], df_tweets['Label'], test_size=0.9981)

uniAndBigram_NB.fit(x_train, y_train)
predicted = uniAndBigram_NB.predict(df_reviews['Sentence'])
np.mean(predicted == df_reviews['Label'])

0.6786754002911208

In [12]:
# Unigram, Bigram, and Trigram NB trained with reviews tested on tweets
ubtGram_NB = Pipeline([
     ('vect', CountVectorizer(ngram_range=(1,3))),
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB()),
 ])

ubtGram_NB.fit(df_reviews['Sentence'], df_reviews['Label'])
predicted = ubtGram_NB.predict(df_tweets['Sentence'])
np.mean(predicted == df_tweets['Label'])

0.610946875

In [13]:
# Unigram, Bigram, and Trigram NB trained with tweets and tested on reviews
ubtGram_NB = Pipeline([
     ('vect', CountVectorizer(ngram_range=(1,3))),
     ('tfidf', TfidfTransformer()),
     ('clf', MultinomialNB()),
 ])

x_train, x_test, y_train, y_test = train_test_split(df_tweets['Sentence'], df_tweets['Label'], test_size=0.9981)

ubtGram_NB.fit(x_train, y_train)
predicted = ubtGram_NB.predict(df_reviews['Sentence'])
np.mean(predicted == df_reviews['Label'])

0.6750363901018923

In [14]:
# Unigram SVM trained with reviews tested on tweets
unigram_SVM = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier()),
 ])

unigram_SVM.fit(df_reviews['Sentence'], df_reviews['Label'])
predicted = unigram_SVM.predict(df_tweets['Sentence'])
np.mean(predicted == df_tweets['Label'])

0.586419375

In [15]:
# Unigram SVM trained with tweets tested on reviews
unigram_SVM = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier()),
 ])

# Reduced train size to comparable amount of reviews that we have
x_train, x_test, y_train, y_test = train_test_split(df_tweets['Sentence'], df_tweets['Label'], test_size=0.9981)

unigram_SVM.fit(x_train, y_train)
predicted = unigram_SVM.predict(df_reviews['Sentence'])
np.mean(predicted == df_reviews['Label'])

0.6095342066957787

In [16]:
# Unigram and Bigram SVM trained with reviews tested on tweets
uniAndBigram_SVM = Pipeline([
     ('vect', CountVectorizer(ngram_range=(1,2))),
     ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier()),
 ])

uniAndBigram_SVM.fit(df_reviews['Sentence'], df_reviews['Label'])
predicted = uniAndBigram_SVM.predict(df_tweets['Sentence'])
np.mean(predicted == df_tweets['Label'])

0.59957

In [17]:
# Unigram and Bigram NB trained with tweets and tested on reviews
uniAndBigram_SVM = Pipeline([
     ('vect', CountVectorizer(ngram_range=(1,2))),
     ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier()),
 ])

x_train, x_test, y_train, y_test = train_test_split(df_tweets['Sentence'], df_tweets['Label'], test_size=0.9981)

uniAndBigram_SVM.fit(x_train, y_train)
predicted = uniAndBigram_SVM.predict(df_reviews['Sentence'])
np.mean(predicted == df_reviews['Label'])

0.6706695778748181

In [None]:
# Unigram, Bigram, and Trigram NB trained with reviews tested on tweets
ubtGram_SVM = Pipeline([
     ('vect', CountVectorizer(ngram_range=(1,3))),
     ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier()),
 ])

ubtGram_SVM.fit(df_reviews['Sentence'], df_reviews['Label'])
predicted = ubtGram_SVM.predict(df_tweets['Sentence'])
np.mean(predicted == df_tweets['Label'])

In [None]:
# Unigram, Bigram, and Trigram NB trained with tweets and tested on reviews
ubtGram_SVM = Pipeline([
     ('vect', CountVectorizer(ngram_range=(1,3))),
     ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier()),
 ])

x_train, x_test, y_train, y_test = train_test_split(df_tweets['Sentence'], df_tweets['Label'], test_size=0.9981)

ubtGram_SVM.fit(x_train, y_train)
predicted = ubtGram_SVM.predict(df_reviews['Sentence'])
np.mean(predicted == df_reviews['Label'])